From a3645ed94d9f587e22a24f13fccd26be9d35d8aa Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Sat, 6 Sep 2025 13:27:15 -0700 Subject: [PATCH 001/584] [Frontend][Responses API] Support reporting tool output tokens and fix reasoning token count (#24285) Signed-off-by: Ye (Charlotte) Qi --- tests/entrypoints/test_context.py | 425 +++++++++++++++++++ vllm/entrypoints/context.py | 158 +++++-- vllm/entrypoints/openai/protocol.py | 3 +- vllm/entrypoints/openai/serving_responses.py | 7 +- 4 files changed, 557 insertions(+), 36 deletions(-) create mode 100644 tests/entrypoints/test_context.py diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py new file mode 100644 index 0000000000000..5e6a4c85ff790 --- /dev/null +++ b/tests/entrypoints/test_context.py @@ -0,0 +1,425 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock, patch + +import pytest +from openai_harmony import StreamState + +from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext +from vllm.outputs import CompletionOutput, RequestOutput + + +# Helper function for Python < 3.10 compatibility +async def async_next(async_iterator): + """Compatibility function equivalent to Python 3.10's anext().""" + return await async_iterator.__anext__() + + +def create_mock_request_output( + prompt_token_ids=None, + output_token_ids=None, + num_cached_tokens=0, + finished=True, +): + """Helper function to create a mock RequestOutput object for testing.""" + outputs = [] + token_ids = output_token_ids if output_token_ids is not None else [] + outputs = [ + CompletionOutput( + index=0, + text="Test output", + token_ids=token_ids, + cumulative_logprob=0.0, + logprobs=None, + finish_reason=None, + stop_reason=None, + ) + ] + + return RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + outputs=outputs, + finished=finished, + num_cached_tokens=num_cached_tokens, + ) + + +async def generate_mock_outputs(num_turns, + prompt_token_counts, + output_token_counts, + cached_token_counts=None): + """Generate a sequence of mock RequestOutput objects to simulate multiple + turns.""" + if cached_token_counts is None: + cached_token_counts = [0] * num_turns + + for i in range(num_turns): + # Create mock prompt token IDs and output token IDs + prompt_token_ids = list(range(1, prompt_token_counts[i] + 1)) + output_token_ids = list(range(1, output_token_counts[i] + 1)) + + # Create and yield the RequestOutput + yield create_mock_request_output( + prompt_token_ids=prompt_token_ids, + output_token_ids=output_token_ids, + num_cached_tokens=cached_token_counts[i], + ) + + +@pytest.fixture +def mock_parser(): + """Set up a mock parser for tests.""" + with patch("vllm.entrypoints.context.get_streamable_parser_for_assistant" + ) as mock_parser_factory: + # Create a mock parser object + parser = MagicMock() + parser.messages = [] + parser.current_channel = None + parser.state = StreamState.EXPECT_START + mock_parser_factory.return_value = parser + yield parser + + +def test_single_turn_token_counting(): + """Test token counting behavior for a single turn.""" + # Create a context + context = HarmonyContext(messages=[], available_tools=[]) + + # Create a mock RequestOutput with specific token counts + mock_output = create_mock_request_output( + prompt_token_ids=[1, 2, 3, 4, 5], # 5 prompt tokens + output_token_ids=[6, 7, 8], # 3 output tokens + 
num_cached_tokens=2, # 2 cached tokens + ) + + # Append the output to the context + context.append_output(mock_output) + + # Verify the token counts + assert context.num_prompt_tokens == 5 + assert context.num_output_tokens == 3 + assert context.num_cached_tokens == 2 + assert context.num_tool_output_tokens == 0 # No tool tokens in first turn + + # Verify internal state tracking + assert not context.is_first_turn + assert context.previous_turn.input_tokens == 5 + assert context.previous_turn.output_tokens == 3 + + +@pytest.mark.asyncio +async def test_multi_turn_token_counting(): + """Test token counting behavior across multiple turns with tool output.""" + # Create a context + context = HarmonyContext(messages=[], available_tools=["browser"]) + + # Simulate a conversation with 3 turns + # Turn 1: prefill 5, decode 3, tool 7 + # Turn 2: prefill 15, cached 5, decode 4, tool 1 + # Turn 3: prefill 20, cached 15, decode 5 + prompt_token_counts = [5, 15, 20] + output_token_counts = [3, 4, 5] + cached_token_counts = [0, 5, 15] + mock_generator = generate_mock_outputs(3, prompt_token_counts, + output_token_counts, + cached_token_counts) + + # First turn - initial prompt and response + mock_output1 = await async_next(mock_generator) + context.append_output(mock_output1) + + # At this point, we should have 5 prompt tokens and 3 output tokens + assert context.num_prompt_tokens == 5 + assert context.num_output_tokens == 3 + assert context.num_tool_output_tokens == 0 + + # Second turn - after tool output + mock_output2 = await async_next(mock_generator) + context.append_output(mock_output2) + # Current prompt tokens (15) - last_turn_input_tokens (5) - + # last_turn_output_tokens (3) = 7 + expected_tool_output = 7 + + assert context.num_prompt_tokens == 5 + 15 + assert context.num_output_tokens == 3 + 4 + assert context.num_tool_output_tokens == expected_tool_output + assert context.num_cached_tokens == 5 + + # Third turn - final response + mock_output3 = await async_next(mock_generator) + context.append_output(mock_output3) + # Additional tool output tokens from third turn: + # Current prompt (20) - last_turn_input_tokens (15) - + # last_turn_output_tokens (4) = 1 + expected_tool_output = 7 + 1 + + assert context.num_prompt_tokens == 5 + 15 + 20 + assert context.num_output_tokens == 3 + 4 + 5 + assert context.num_tool_output_tokens == expected_tool_output + assert context.num_cached_tokens == 5 + 15 + + +def test_empty_output_tokens(): + """Test behavior when RequestOutput has empty output tokens.""" + context = HarmonyContext(messages=[], available_tools=[]) + + # Create a RequestOutput with empty output tokens + mock_output = create_mock_request_output( + prompt_token_ids=[1, 2, 3], # 3 prompt tokens + output_token_ids=[], # Empty output tokens list + num_cached_tokens=1, + ) + + context.append_output(mock_output) + + # Should handle empty outputs gracefully + assert context.num_prompt_tokens == 3 + assert context.num_output_tokens == 0 # No output tokens + assert context.num_cached_tokens == 1 + assert context.num_tool_output_tokens == 0 + + +def test_missing_prompt_token_ids(): + """Test behavior when RequestOutput has None prompt_token_ids.""" + context = HarmonyContext(messages=[], available_tools=[]) + + mock_output = create_mock_request_output( + prompt_token_ids=None, # No prompt token IDs + output_token_ids=[1, 2], # 2 output tokens + num_cached_tokens=0, + ) + + # Logger.error will be called, but we don't need to check for warnings + # here Just ensure it doesn't raise an exception + 
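+    # (In _update_prefill_token_usage, a missing prompt_token_ids is logged
+    # via logger.error and the prompt length is treated as 0, so the call
+    # below must not raise.)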
context.append_output(mock_output) + + # Should handle missing prompt tokens gracefully + assert context.num_prompt_tokens == 0 + assert context.num_output_tokens == 2 + assert context.num_cached_tokens == 0 + assert context.num_tool_output_tokens == 0 + + +def test_reasoning_tokens_counting(mock_parser): + """Test that reasoning tokens are counted correctly.""" + context = HarmonyContext(messages=[], available_tools=[]) + + # Mock parser to simulate reasoning channel + mock_parser.current_channel = "analysis" # Reasoning channel + + mock_output = create_mock_request_output( + prompt_token_ids=[1, 2, 3], + output_token_ids=[4, 5, 6, 7], # 4 tokens, all in reasoning + num_cached_tokens=0, + ) + + context.append_output(mock_output) + + # All output tokens should be counted as reasoning + assert context.num_reasoning_tokens == 4 + assert context.num_output_tokens == 4 + + +def test_zero_tokens_edge_case(): + """Test behavior with all zero token counts.""" + context = HarmonyContext(messages=[], available_tools=[]) + + # Create a request with empty lists (not None) for both prompt and + # output tokens + mock_output = create_mock_request_output( + prompt_token_ids=[], # Empty prompt tokens + output_token_ids=[], # Empty output tokens + num_cached_tokens=0, + ) + + context.append_output(mock_output) + + # All counts should be zero + assert context.num_prompt_tokens == 0 + assert context.num_output_tokens == 0 + assert context.num_cached_tokens == 0 + assert context.num_tool_output_tokens == 0 + assert context.num_reasoning_tokens == 0 + + +@pytest.mark.asyncio +async def test_single_turn_no_tool_output(): + """Test that first turn never generates tool output tokens.""" + context = HarmonyContext( + messages=[], + available_tools=["browser"] # Tools available + ) + + # Even with large prompt in first turn, no tool tokens should be counted + mock_output = create_mock_request_output( + prompt_token_ids=list(range(100)), # 100 tokens + output_token_ids=[1, 2, 3], + num_cached_tokens=0, + ) + + context.append_output(mock_output) + + # First turn should never have tool output tokens + assert context.num_tool_output_tokens == 0 + assert context.is_first_turn is False # Should be updated after first turn + + +@pytest.mark.asyncio +async def test_negative_tool_tokens_edge_case(): + """Test edge case where calculation could result in negative tool + tokens. 
We should log an error and clamp the value to 0.""" + # Use patch to check if logger.error was called + with patch("vllm.entrypoints.context.logger.error") as mock_log: + context = HarmonyContext(messages=[], available_tools=["browser"]) + + # First turn + mock_output1 = create_mock_request_output( + prompt_token_ids=list(range(10)), # 10 tokens + output_token_ids=[1, 2, 3, 4, 5], # 5 tokens + ) + context.append_output(mock_output1) + + # Second turn with fewer new tokens than previous output + # This could happen in edge cases with aggressive caching + mock_output2 = create_mock_request_output( + prompt_token_ids=list(range(12)), # 12 tokens (only 2 new) + output_token_ids=[6, 7], # 2 tokens + ) + context.append_output(mock_output2) + + # Calculated negative tool tokens (12 - 10 - 5 = -3) should be clamped + # to 0 and an error should be logged + assert context.num_tool_output_tokens == 0 + assert context.num_prompt_tokens == 10 + 12 + assert context.num_output_tokens == 5 + 2 + + # Verify the error was logged properly + mock_log.assert_called_once() + + # Extract the actual log message and arguments from the call + args, _ = mock_log.call_args + log_message = args[0] + + # Check for key parts of the message + assert "Negative tool output tokens calculated" in log_message + assert "-3" in str(args) # Check that -3 is in the arguments + + +@pytest.mark.asyncio +async def test_streaming_multi_turn_token_counting(mock_parser): + """Test token counting for streaming multi-turn conversations. + + This test focuses on how StreamingHarmonyContext counts tokens in a + multi-turn conversation with streaming (token-by-token) outputs and + message boundaries. + """ + # Create a streaming context + context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) + + # Simulate three turns of conversation: + # Turn 1: stream tokens one by one, then finish the message + # Turn 2: new prompt, stream more tokens with a reasoning segment + # Turn 3: new prompt with tool output and cached tokens + + # First turn: 3 tokens streamed one by one + # First token of first turn + context.append_output( + create_mock_request_output( + prompt_token_ids=[1, 2, 3], # 3 prompt tokens + output_token_ids=[101], # Single token + num_cached_tokens=0, + finished=False, # Not end of message yet + )) + + # Second token of first turn + context.append_output( + create_mock_request_output( + output_token_ids=[102], + finished=False, + )) + + # Last token of first turn (finished=True signals end of message) + context.append_output( + create_mock_request_output( + output_token_ids=[103], + finished=True, # End of message + )) + + # Check token counts after first turn + assert context.num_prompt_tokens == 3 # Initial prompt tokens + assert context.num_output_tokens == 3 # Three output tokens + assert context.num_cached_tokens == 0 + assert context.num_tool_output_tokens == 0 # No tool output in first turn + assert context.first_tok_of_message is True # Ready for next message + + # Second turn: reasoning tokens in analysis channel + mock_parser.current_channel = "analysis" # Set to reasoning channel + + # First token of second turn + context.append_output( + create_mock_request_output( + prompt_token_ids=[1, 2, 3, 101, 102, 103, 4, + 5], # 8 tokens (includes previous) + output_token_ids=[201], + num_cached_tokens=3, # Some tokens cached + finished=False, + )) + + # More tokens in reasoning channel + context.append_output( + create_mock_request_output( + output_token_ids=[202], + finished=False, + )) + + 
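+    # The closing token below arrives with finished=True: this ends the
+    # reasoning message, flips first_tok_of_message back to True, and
+    # snapshots current_turn into previous_turn for the next turn's math.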
context.append_output( + create_mock_request_output( + output_token_ids=[203], + finished=True, # End of reasoning message + )) + + # Check counts after second turn (reasoning message) + assert context.num_prompt_tokens == 3 + 8 # Initial + second prompt + assert context.num_output_tokens == 3 + 3 # First turn + second turn + assert context.num_reasoning_tokens == 3 # All tokens in analysis channel + assert context.num_cached_tokens == 3 # Cached tokens from second turn + + # Formula: this turn prompt tokens - last turn prompt - last turn output + expected_tool_tokens = 8 - 3 - 3 # = 2 + assert context.num_tool_output_tokens == expected_tool_tokens + + # Third turn: regular output channel + mock_parser.current_channel = "final" # Switch back to regular channel + + # Third turn (with more cached tokens) + context.append_output( + create_mock_request_output( + prompt_token_ids=[ + 1, 2, 3, 101, 102, 103, 4, 5, 201, 202, 203, 6, 7 + ], # 13 tokens + output_token_ids=[301], + num_cached_tokens=8, # More cached tokens + finished=False, + )) + + context.append_output( + create_mock_request_output( + output_token_ids=[302], + finished=True, + )) + + # Final token counts check + assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts + assert context.num_output_tokens == 3 + 3 + 2 # All outputs + assert context.num_reasoning_tokens == 3 # Unchanged from second turn + assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens + + # Additional tool tokens from third turn + # Formula: this turn prompt - last turn prompt - last turn output + additional_tool_tokens = 13 - 8 - 3 # = 2 + assert context.num_tool_output_tokens == expected_tool_tokens \ + + additional_tool_tokens diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index e4f2e800f94a6..7723c5d5cbcfc 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -3,7 +3,6 @@ import json import logging from abc import ABC, abstractmethod -from collections.abc import Sequence from contextlib import AsyncExitStack from typing import TYPE_CHECKING, Optional, Union @@ -21,6 +20,23 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +class TurnTokens: + """Tracks token counts for a single conversation turn.""" + + def __init__(self, input_tokens=0, output_tokens=0): + self.input_tokens = input_tokens + self.output_tokens = output_tokens + + def reset(self): + """Reset counters for a new turn.""" + self.input_tokens = 0 + self.output_tokens = 0 + + def copy(self): + """Create a copy of this turn's token counts.""" + return TurnTokens(self.input_tokens, self.output_tokens) + + class ConversationContext(ABC): @abstractmethod @@ -92,52 +108,124 @@ class HarmonyContext(ConversationContext): self.num_init_messages = len(messages) self.num_prompt_tokens = 0 self.num_output_tokens = 0 - # TODO(woosuk): Implement the following fields. self.num_cached_tokens = 0 self.num_reasoning_tokens = 0 + self.num_tool_output_tokens = 0 - def _update_num_prompt_tokens(self, output: RequestOutput): - if output.prompt_token_ids and len(output.prompt_token_ids) > 0: - # NOTE: with built-in tools, there might be multiple rounds in - # the conversation, with the full conversation being resent - # as new prompt each time. Hence the sum. 
- self.num_prompt_tokens += len(output.prompt_token_ids) + # Turn tracking - replaces multiple individual tracking variables + self.current_turn = TurnTokens() + self.previous_turn = TurnTokens() + self.is_first_turn = True + self.first_tok_of_message = True # For streaming support - def _update_num_cached_tokens(self, output: RequestOutput): - if output.num_cached_tokens is not None: - #Similar to num_prompt_tokens - self.num_cached_tokens += output.num_cached_tokens - - def _update_num_output_tokens(self, token_ids: Sequence[int]): - self.num_output_tokens += len(token_ids) - - def _update_num_reasoning_tokens(self, token_ids: Sequence[int]): - # Count tokens that are part of reasoning content (analysis channel - # or tool-directed messages like python/browser calls) - is_analysis = self.parser.current_channel == "analysis" - is_tool_call = (self.parser.current_recipient is not None and - (self.parser.current_recipient.startswith("python") or - self.parser.current_recipient.startswith("browser."))) - if is_analysis or is_tool_call: - self.num_reasoning_tokens += len(token_ids) + def _update_num_reasoning_tokens(self): + # Count all analysis and commentary channels as reasoning tokens + if self.parser.current_channel in {"analysis", "commentary"}: + self.num_reasoning_tokens += 1 def append_output(self, output) -> None: if isinstance(output, RequestOutput): - self._update_num_prompt_tokens(output) - self._update_num_cached_tokens(output) output_token_ids = output.outputs[0].token_ids - self._update_num_output_tokens(output_token_ids) self.parser = get_streamable_parser_for_assistant() for token_id in output_token_ids: self.parser.process(token_id) # Check if the current token is part of reasoning content - self._update_num_reasoning_tokens([token_id]) + self._update_num_reasoning_tokens() + self._update_prefill_token_usage(output) + # Reset current turn output tokens for this turn + self.current_turn.output_tokens = 0 + self._update_decode_token_usage(output) + # Move current turn to previous turn for next turn's calculations + self.previous_turn = self.current_turn.copy() output_msgs = self.parser.messages else: # Tool output. output_msgs = output self._messages.extend(output_msgs) + def _update_prefill_token_usage(self, output: RequestOutput) -> None: + """Update token usage statistics for the prefill phase of generation. + + The prefill phase processes the input prompt tokens. This method: + 1. Counts the prompt tokens for this turn + 2. Calculates tool output tokens for multi-turn conversations + 3. Updates cached token counts + 4. Tracks state for next turn calculations + + Tool output tokens are calculated as: + current_prompt_tokens - last_turn_prompt_tokens - + last_turn_output_tokens + This represents tokens added between turns (typically tool responses). 
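+
+        For example, with the counts used in the multi-turn test above:
+        turn 1 has a 5-token prompt and 3 output tokens, and turn 2 arrives
+        with a 15-token prompt, so 15 - 5 - 3 = 7 tokens are attributed to
+        tool output.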
+ + Args: + output: The RequestOutput containing prompt token information + """ + if output.prompt_token_ids is not None: + this_turn_input_tokens = len(output.prompt_token_ids) + else: + this_turn_input_tokens = 0 + logger.error( + "RequestOutput appended contains no prompt_token_ids.") + + # Update current turn input tokens + self.current_turn.input_tokens = this_turn_input_tokens + self.num_prompt_tokens += this_turn_input_tokens + + # Calculate tool tokens (except on first turn) + if self.is_first_turn: + self.is_first_turn = False + else: + # start counting tool after first turn + # tool tokens = this turn prefill - last turn prefill - + # last turn decode + this_turn_tool_tokens = (self.current_turn.input_tokens - + self.previous_turn.input_tokens - + self.previous_turn.output_tokens) + + # Handle negative tool token counts (shouldn't happen in normal + # cases) + if this_turn_tool_tokens < 0: + logger.error( + "Negative tool output tokens calculated: %d " + "(current_input=%d, previous_input=%d, " + "previous_output=%d). Setting to 0.", + this_turn_tool_tokens, self.current_turn.input_tokens, + self.previous_turn.input_tokens, + self.previous_turn.output_tokens) + this_turn_tool_tokens = 0 + + self.num_tool_output_tokens += this_turn_tool_tokens + + # Update cached tokens + if output.num_cached_tokens is not None: + self.num_cached_tokens += output.num_cached_tokens + + def _update_decode_token_usage(self, output: RequestOutput) -> int: + """Update token usage statistics for the decode phase of generation. + + The decode phase processes the generated output tokens. This method: + 1. Counts output tokens from all completion outputs + 2. Updates the total output token count + 3. Tracks tokens generated in the current turn + + In streaming mode, this is called for each token generated. + In non-streaming mode, this is called once with all output tokens. + + Args: + output: The RequestOutput containing generated token information + + Returns: + int: Number of output tokens processed in this call + """ + updated_output_token_count = 0 + if output.outputs: + for completion_output in output.outputs: + # only keep last round + updated_output_token_count += len(completion_output.token_ids) + self.num_output_tokens += updated_output_token_count + self.current_turn.output_tokens += updated_output_token_count + return updated_output_token_count + @property def messages(self) -> list: return self._messages @@ -231,8 +319,8 @@ class StreamingHarmonyContext(HarmonyContext): # append_output is called for each output token in streaming case, # so we only want to add the prompt tokens once for each message. 
if self.first_tok_of_message: - self._update_num_prompt_tokens(output) - self._update_num_cached_tokens(output) + self._update_prefill_token_usage(output) + self.current_turn.output_tokens = 0 # Reset self.first_tok_of_message if needed: # if the current token is the last one of the current message # (finished=True), then the next token processed will mark the @@ -240,9 +328,13 @@ class StreamingHarmonyContext(HarmonyContext): self.first_tok_of_message = output.finished for tok in output.outputs[0].token_ids: self.parser.process(tok) - self._update_num_output_tokens(output.outputs[0].token_ids) + self._update_decode_token_usage(output) + + # For streaming, update previous turn when message is complete + if output.finished: + self.previous_turn = self.current_turn.copy() # Check if the current token is part of reasoning content - self._update_num_reasoning_tokens(output.outputs[0].token_ids) + self._update_num_reasoning_tokens() self.last_tok = tok else: # Handle the case of tool output in direct message format diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 413e1dd8d6337..c56c68cf76442 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1841,7 +1841,8 @@ class InputTokensDetails(OpenAIBaseModel): class OutputTokensDetails(OpenAIBaseModel): - reasoning_tokens: int + reasoning_tokens: int = 0 + tool_output_tokens: int = 0 class ResponseUsage(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index d49724b0439cf..a102d4a4a5e68 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -460,7 +460,7 @@ class OpenAIServingResponses(OpenAIServing): if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) - # TODO: these are all 0 for now! + num_tool_output_tokens = context.num_tool_output_tokens else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -473,6 +473,8 @@ class OpenAIServingResponses(OpenAIServing): # Calculate usage. 
assert final_res.prompt_token_ids is not None + num_tool_output_tokens = 0 + assert isinstance(context, (SimpleContext, HarmonyContext)) num_prompt_tokens = context.num_prompt_tokens num_generated_tokens = context.num_output_tokens @@ -486,7 +488,8 @@ class OpenAIServingResponses(OpenAIServing): input_tokens_details=InputTokensDetails( cached_tokens=num_cached_tokens), output_tokens_details=OutputTokensDetails( - reasoning_tokens=num_reasoning_tokens), + reasoning_tokens=num_reasoning_tokens, + tool_output_tokens=num_tool_output_tokens), ) response = ResponsesResponse.from_request( request, From e68dc2f014369771de4892a008d994c33317834f Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Sun, 7 Sep 2025 04:39:34 +0800 Subject: [PATCH 002/584] [Bugfix] Fix unstable silu_mul+nvfp4 quant fusion test (#24370) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- tests/compile/test_silu_mul_quant_fusion.py | 46 ++++++++++++++------- tests/kernels/quantization/nvfp4_utils.py | 8 ++++ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 731ceeb905f64..736db80a2f379 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import cast + import pytest import torch import vllm.envs as envs +from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant # yapf conflicts with isort for this block # yapf: disable @@ -64,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module): class TestSiluMulNvfp4QuantModel(torch.nn.Module): - def __init__(self, hidden_size: int, **kwargs): + def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs): super().__init__() self.silu_and_mul = SiluAndMul() - self.w = torch.randint(256, (hidden_size, hidden_size // 2), - dtype=FP4_DTYPE) - self.wscale = torch.randn(hidden_size, - hidden_size // 16).to(dtype=FP8_DTYPE) - self.wscale2 = torch.rand(1, dtype=torch.float32) - self.scale = torch.rand(1, dtype=torch.float32) + + # create nvfp4 weight + w = torch.rand((hidden_size, hidden_size)) + self.w, self.w_block_scale, self.w_global_scale = quant_nvfp4_tensor(w) + + # get global scale offline + _, _, self.y_global_scale = quant_nvfp4_tensor(self.silu_and_mul(x)) + + self.alpha = 1.0 / (self.w_global_scale * self.y_global_scale) def forward(self, x): y = self.silu_and_mul(x) - y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale) + y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale) out = cutlass_scaled_fp4_mm(a=y_quant, b=self.w, block_scale_a=y_block_scale, - block_scale_b=self.wscale, - alpha=self.scale * self.wscale2, + block_scale_b=self.w_block_scale, + alpha=self.alpha, out_dtype=y.dtype) return out @@ -95,8 +101,9 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [64]) @pytest.mark.parametrize("hidden_size", [128]) @pytest.mark.parametrize( - "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] - if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]) + "model_class", + cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] + if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])) # cuda_force_torch used to test torch code path on 
platforms that # cutlass_fp8_supported() == True. @pytest.mark.parametrize("cuda_force_torch", @@ -111,6 +118,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) + x = torch.rand(num_tokens, hidden_size * 2) + # Reshape pass is needed for the fusion pass to work config = VllmConfig() config.compilation_config = CompilationConfig( @@ -119,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, backend = TestBackend(NoOpEliminationPass(config), fusion_pass) model = model_class(hidden_size=hidden_size, - cuda_force_torch=cuda_force_torch) + cuda_force_torch=cuda_force_torch, + x=x) # First dimension dynamic - x = torch.rand(num_tokens, hidden_size * 2) torch._dynamo.mark_dynamic(x, 0) result = model(x) @@ -131,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, result2 = model2(x) # Check that it gives the same answer + if model_class == TestSiluMulFp8QuantModel: + atol, rtol = 1e-3, 1e-3 + elif model_class == TestSiluMulNvfp4QuantModel: + atol, rtol = 1e-1, 1e-1 + torch.testing.assert_close(result[0].to(dtype=torch.float16), result2[0].to(dtype=torch.float16), - atol=1e-3, - rtol=1e-3) + atol=atol, + rtol=rtol) # In pre-nodes, quant op should be present and fused kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py index 1095975ab2b41..fc4e125550180 100644 --- a/tests/kernels/quantization/nvfp4_utils.py +++ b/tests/kernels/quantization/nvfp4_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch +from vllm._custom_ops import scaled_fp4_quant from vllm.scalar_type import scalar_types FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() @@ -65,3 +66,10 @@ def break_fp4_bytes(a, dtype): # Reshape to final form return values.reshape(m, n * 2).to(dtype=dtype) + + +def quant_nvfp4_tensor(a: torch.Tensor): + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.abs(a).max().to(torch.float32)) + a_quant, a_block_scale = scaled_fp4_quant(a, a_global_scale) + return a_quant, a_block_scale, a_global_scale From 848562bd49a91d21f2dc76912d5fcb25ae80c8a8 Mon Sep 17 00:00:00 2001 From: Bangsheng Tang <5318912+bangshengtang@users.noreply.github.com> Date: Sat, 6 Sep 2025 14:02:47 -0700 Subject: [PATCH 003/584] break execute_model in gpu_model_runner into sub-functions for custom scopes (#24265) Co-authored-by: Bangsheng Tang --- vllm/envs.py | 4 + vllm/v1/utils.py | 19 +- vllm/v1/worker/gpu_model_runner.py | 294 ++++++++++++++++++----------- 3 files changed, 208 insertions(+), 109 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index e5b44893297e2..5e1585f3c3f06 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -168,6 +168,7 @@ if TYPE_CHECKING: VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False + VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False def get_default_cache_root(): @@ -1200,6 +1201,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + # Add optional custom scopes for profiling, disable to avoid overheads + "VLLM_CUSTOM_SCOPES_FOR_PROFILING": + lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))), } # --8<-- [end:env-vars-definition] diff --git a/vllm/v1/utils.py 
b/vllm/v1/utils.py index ab9bee3e4544d..e0c7d9094aa6d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,17 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import contextlib import multiprocessing import time import weakref from collections.abc import Sequence +from contextlib import AbstractContextManager from multiprocessing import connection from multiprocessing.process import BaseProcess from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, Union, overload) import torch +from torch.autograd.profiler import record_function +import vllm.envs as envs from vllm.logger import init_logger from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) @@ -155,7 +159,7 @@ def get_engine_client_zmq_addr(local_only: bool, class APIServerProcessManager: """Manages a group of API server processes. - + Handles creation, monitoring, and termination of API server worker processes. Also monitors extra processes to check if they are healthy. """ @@ -172,7 +176,7 @@ class APIServerProcessManager: stats_update_address: Optional[str] = None, ): """Initialize and start API server worker processes. - + Args: target_server_fn: Function to call for each API server process listen_address: Address to listen for client connections @@ -181,7 +185,7 @@ class APIServerProcessManager: num_servers: Number of API server processes to start input_addresses: Input addresses for each API server output_addresses: Output addresses for each API server - stats_update_address: Optional stats update address + stats_update_address: Optional stats update address """ self.listen_address = listen_address self.sock = sock @@ -225,7 +229,7 @@ def wait_for_completion_or_failure( "CoreEngineActorManager"]] = None, coordinator: Optional["DPCoordinator"] = None) -> None: """Wait for all processes to complete or detect if any fail. - + Raises an exception if any process exits with a non-zero status. 
Args: @@ -368,3 +372,10 @@ def report_usage_stats( "disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce, }) + + +def record_function_or_nullcontext(name: str) -> AbstractContextManager: + if envs.VLLM_CUSTOM_SCOPES_FOR_PROFILING: + return record_function(name) + else: + return contextlib.nullcontext() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 76ed5c5a6051f..563872f8d68f3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -69,7 +69,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheGroupSpec, KVCacheSpec, MambaSpec, SlidingWindowSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, - DraftTokenIds, LogprobsTensors, ModelRunnerOutput) + DraftTokenIds, LogprobsLists, LogprobsTensors, + ModelRunnerOutput, SamplerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata @@ -79,7 +80,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer -from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) @@ -1587,31 +1588,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_connector_output=kv_connector_output, ) - @torch.inference_mode() - def execute_model( + def _preprocess( self, scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - - return self.kv_connector_no_forward(scheduler_output, - self.vllm_config) - - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect logprobs for " - "prompt tokens, tokens, please disable it when the requests " - "need prompt logprobs") - - # Prepare the decoder inputs. 
- (attn_metadata, logits_indices, spec_decode_metadata, - num_scheduled_tokens_np, spec_decode_common_attn_metadata, - max_query_len) = self._prepare_inputs(scheduler_output) + ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, + Optional[IntermediateTensors], dict[str, Any]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE @@ -1683,75 +1666,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True) - uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( - num_scheduled_tokens == self.input_batch.num_reqs * max_query_len) - batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, - uniform_decode=uniform_decode) - cudagraph_runtime_mode, batch_descriptor = \ - self.cudagraph_dispatcher.dispatch(batch_descriptor) - - # Run the model. - # Use persistent buffers for CUDA graphs. - with set_forward_context( - attn_metadata, - self.vllm_config, - num_tokens=num_input_tokens, - num_tokens_across_dp=num_tokens_across_dp, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, - ), self.maybe_get_kv_connector_output( - scheduler_output) as kv_connector_output: - - model_output = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - **model_kwargs, - ) - - if self.use_aux_hidden_state_outputs: - hidden_states, aux_hidden_states = model_output - else: - hidden_states = model_output - aux_hidden_states = None - - # Broadcast PP output for external_launcher (torchrun) - # to make sure we are synced across pp ranks - # TODO: Support overlapping mirco-batches - # https://github.com/vllm-project/vllm/issues/18019 - broadcast_pp_output = \ - self.parallel_config.distributed_executor_backend \ - == "external_launcher" and len(get_pp_group().ranks) > 0 - if not get_pp_group().is_last_rank: - # For mid-pipeline stages, return the hidden states. 
- assert isinstance(hidden_states, IntermediateTensors) - if not broadcast_pp_output: - hidden_states.kv_connector_output = kv_connector_output - return hidden_states - get_pp_group().send_tensor_dict(hidden_states.tensors, - all_gather_group=get_tp_group()) - logits = None - else: - if self.is_pooling_model: - return self._pool(hidden_states, num_scheduled_tokens, - num_scheduled_tokens_np, kv_connector_output) - - sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, None) - if broadcast_pp_output: - model_output_broadcast_data = { - "logits": logits.contiguous(), - } if logits is not None else {} - model_output_broadcast_data = get_pp_group().broadcast_tensor_dict( - model_output_broadcast_data, src=len(get_pp_group().ranks) - 1) - assert model_output_broadcast_data is not None - logits = model_output_broadcast_data["logits"] - - # Apply structured output bitmasks if present - if scheduler_output.grammar_bitmask is not None: - self.apply_grammar_bitmask(scheduler_output, logits) + return ( + num_scheduled_tokens, + num_input_tokens, + num_tokens_across_dp, + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ) + def _sample( + self, logits: Optional[torch.Tensor], + spec_decode_metadata: Optional[SpecDecodeMetadata] + ) -> SamplerOutput: # Sample the next token and get logprobs if needed. sampling_metadata = self.input_batch.sampling_metadata if spec_decode_metadata is None: @@ -1785,6 +1714,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) sampler_output.sampled_token_ids = output_token_ids + return sampler_output + + def _bookkeeping_sync( + self, scheduler_output: "SchedulerOutput", + sampler_output: SamplerOutput, logits: Optional[torch.Tensor], + hidden_states: torch.Tensor, num_scheduled_tokens: int + ) -> tuple[ + dict[str, int], + Optional[LogprobsLists], + list[list[int]], + dict[str, Optional[LogprobsTensors]], + list[str], + dict[str, int], + list[int], + ]: num_nans_in_logits = {} if envs.VLLM_COMPUTE_NANS_IN_LOGITS: num_nans_in_logits = self._get_nans_in_logits(logits) @@ -1827,6 +1771,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids + invalid_req_indices = [] if not self.use_async_scheduling: # Get the valid generated tokens. 
max_gen_len = sampled_token_ids.shape[-1] @@ -1892,20 +1837,159 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] req_state.output_token_ids.extend(sampled_ids) - if self.speculative_config: - assert spec_decode_common_attn_metadata is not None - self._draft_token_ids = self.propose_draft_token_ids( - scheduler_output, - valid_sampled_token_ids, - sampling_metadata, - hidden_states, - sample_hidden_states, - aux_hidden_states, - spec_decode_metadata, - spec_decode_common_attn_metadata, + return ( + num_nans_in_logits, + logprobs_lists, + valid_sampled_token_ids, + prompt_logprobs_dict, + req_ids_output_copy, + req_id_to_index_output_copy, + invalid_req_indices, + ) + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: + with record_function_or_nullcontext("Preprocess"): + self._update_states(scheduler_output) + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if there's no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward(scheduler_output, + self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect logprobs for " + "prompt tokens, tokens, please disable it when the requests" + " need prompt logprobs") + + # Prepare the decoder inputs. + (attn_metadata, logits_indices, spec_decode_metadata, + num_scheduled_tokens_np, spec_decode_common_attn_metadata, + max_query_len) = self._prepare_inputs(scheduler_output) + + ( + num_scheduled_tokens, + num_input_tokens, + num_tokens_across_dp, + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ) = self._preprocess(scheduler_output, intermediate_tensors) + + uniform_decode = (max_query_len + == self.uniform_decode_query_len) and ( + num_scheduled_tokens + == self.input_batch.num_reqs * max_query_len) + batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, + uniform_decode=uniform_decode) + cudagraph_runtime_mode, batch_descriptor = \ + self.cudagraph_dispatcher.dispatch(batch_descriptor) + + # Run the model. + # Use persistent buffers for CUDA graphs. 
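+        # set_forward_context, the optional "Forward" profiling scope, and
+        # the KV-connector output capture are entered together, so the whole
+        # model forward runs inside all three contexts.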
+ with (set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ), record_function_or_nullcontext("Forward"), + self.maybe_get_kv_connector_output(scheduler_output) as + kv_connector_output): + model_output = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, ) - self.eplb_step() + with record_function_or_nullcontext("Postprocess"): + if self.use_aux_hidden_state_outputs: + hidden_states, aux_hidden_states = model_output + else: + hidden_states = model_output + aux_hidden_states = None + + # Broadcast PP output for external_launcher (torchrun) + # to make sure we are synced across pp ranks + # TODO: Support overlapping mirco-batches + # https://github.com/vllm-project/vllm/issues/18019 + broadcast_pp_output = \ + self.parallel_config.distributed_executor_backend \ + == "external_launcher" and len(get_pp_group().ranks) > 0 + if not get_pp_group().is_last_rank: + # For mid-pipeline stages, return the hidden states. + assert isinstance(hidden_states, IntermediateTensors) + if not broadcast_pp_output: + hidden_states.kv_connector_output = kv_connector_output + return hidden_states + get_pp_group().send_tensor_dict( + hidden_states.tensors, all_gather_group=get_tp_group()) + logits = None + else: + if self.is_pooling_model: + return self._pool(hidden_states, num_scheduled_tokens, + num_scheduled_tokens_np, + kv_connector_output) + + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + if broadcast_pp_output: + model_output_broadcast_data = { + "logits": logits.contiguous(), + } if logits is not None else {} + model_output_broadcast_data = get_pp_group( + ).broadcast_tensor_dict(model_output_broadcast_data, + src=len(get_pp_group().ranks) - 1) + assert model_output_broadcast_data is not None + logits = model_output_broadcast_data["logits"] + + # Apply structured output bitmasks if present + if scheduler_output.grammar_bitmask is not None: + self.apply_grammar_bitmask(scheduler_output, logits) + + with record_function_or_nullcontext("Sample"): + sampler_output = self._sample(logits, spec_decode_metadata) + + with record_function_or_nullcontext("Bookkeep"): + assert isinstance(hidden_states, torch.Tensor) + ( + num_nans_in_logits, + logprobs_lists, + valid_sampled_token_ids, + prompt_logprobs_dict, + req_ids_output_copy, + req_id_to_index_output_copy, + invalid_req_indices, + ) = self._bookkeeping_sync(scheduler_output, sampler_output, + logits, hidden_states, + num_scheduled_tokens) + + if self.speculative_config: + assert spec_decode_common_attn_metadata is not None + with record_function_or_nullcontext("Draft"): + self._draft_token_ids = self.propose_draft_token_ids( + scheduler_output, + valid_sampled_token_ids, + self.input_batch.sampling_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + spec_decode_metadata, + spec_decode_common_attn_metadata, + ) + + with record_function_or_nullcontext("EPLB"): + self.eplb_step() output = ModelRunnerOutput( req_ids=req_ids_output_copy, @@ -1923,7 +2007,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return AsyncGPUModelRunnerOutput( model_runner_output=output, - sampled_token_ids=sampled_token_ids, + sampled_token_ids=sampler_output.sampled_token_ids, 
invalid_req_indices=invalid_req_indices, async_output_copy_stream=self.async_output_copy_stream, ) From 4172235ab78b09989fb56edaf734dbee283dda3e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 6 Sep 2025 16:15:18 -0700 Subject: [PATCH 004/584] [V0 deprecation] Deprecate V0 Neuron backend (#21159) Signed-off-by: Woosuk Kwon --- .buildkite/release-pipeline.yaml | 16 - .../scripts/hardware_ci/run-neuron-test.sh | 64 -- MANIFEST.in | 1 - docker/Dockerfile.neuron | 56 -- examples/offline_inference/neuron.py | 49 - examples/offline_inference/neuron_eagle.py | 61 -- .../neuron_int8_quantization.py | 63 -- .../offline_inference/neuron_multimodal.py | 110 --- .../offline_inference/neuron_speculation.py | 64 -- requirements/neuron.txt | 9 - setup.py | 36 +- tests/engine/test_arg_utils.py | 9 - tests/neuron/1_core/test_activation.py | 43 - tests/neuron/1_core/test_block_table.py | 154 --- tests/neuron/1_core/test_cache.py | 86 -- tests/neuron/1_core/test_layernorm.py | 57 -- tests/neuron/1_core/test_logits_processor.py | 95 -- .../neuron/1_core/test_neuron_model_runner.py | 127 --- tests/neuron/1_core/test_neuron_quant.py | 12 - tests/neuron/1_core/test_prefix_prefill.py | 514 ---------- tests/neuron/1_core/test_rotary_embedding.py | 68 -- tests/neuron/2_core/test_comm_ops.py | 101 -- tests/neuron/2_core/test_eagle.py | 83 -- tests/neuron/2_core/test_mistral.py | 64 -- tests/neuron/2_core/test_multi_lora.py | 97 -- vllm/attention/ops/nki_flash_attn.py | 903 ------------------ vllm/collect_env.py | 16 +- vllm/config/__init__.py | 22 +- vllm/config/cache.py | 5 +- vllm/config/parallel.py | 5 +- .../neuron_communicator.py | 20 - vllm/engine/arg_utils.py | 5 - vllm/envs.py | 2 +- vllm/model_executor/custom_op.py | 7 - vllm/model_executor/layers/activation.py | 7 - .../layers/quantization/__init__.py | 3 - .../layers/quantization/neuron_quant.py | 76 -- .../layers/rotary_embedding/base.py | 83 +- vllm/model_executor/model_loader/neuron.py | 476 --------- .../model_loader/neuronx_distributed.py | 685 ------------- vllm/platforms/__init__.py | 25 - vllm/platforms/interface.py | 4 - vllm/platforms/neuron.py | 151 --- vllm/worker/neuron_model_runner.py | 455 --------- vllm/worker/neuron_worker.py | 189 ---- .../neuronx_distributed_model_runner.py | 294 ------ 46 files changed, 10 insertions(+), 5462 deletions(-) delete mode 100644 .buildkite/scripts/hardware_ci/run-neuron-test.sh delete mode 100644 docker/Dockerfile.neuron delete mode 100644 examples/offline_inference/neuron.py delete mode 100644 examples/offline_inference/neuron_eagle.py delete mode 100644 examples/offline_inference/neuron_int8_quantization.py delete mode 100644 examples/offline_inference/neuron_multimodal.py delete mode 100644 examples/offline_inference/neuron_speculation.py delete mode 100644 requirements/neuron.txt delete mode 100644 tests/neuron/1_core/test_activation.py delete mode 100644 tests/neuron/1_core/test_block_table.py delete mode 100644 tests/neuron/1_core/test_cache.py delete mode 100644 tests/neuron/1_core/test_layernorm.py delete mode 100644 tests/neuron/1_core/test_logits_processor.py delete mode 100644 tests/neuron/1_core/test_neuron_model_runner.py delete mode 100644 tests/neuron/1_core/test_neuron_quant.py delete mode 100644 tests/neuron/1_core/test_prefix_prefill.py delete mode 100644 tests/neuron/1_core/test_rotary_embedding.py delete mode 100644 tests/neuron/2_core/test_comm_ops.py delete mode 100644 tests/neuron/2_core/test_eagle.py delete mode 100644 tests/neuron/2_core/test_mistral.py delete mode 100644 
tests/neuron/2_core/test_multi_lora.py delete mode 100644 vllm/attention/ops/nki_flash_attn.py delete mode 100644 vllm/distributed/device_communicators/neuron_communicator.py delete mode 100644 vllm/model_executor/layers/quantization/neuron_quant.py delete mode 100644 vllm/model_executor/model_loader/neuron.py delete mode 100644 vllm/model_executor/model_loader/neuronx_distributed.py delete mode 100644 vllm/platforms/neuron.py delete mode 100644 vllm/worker/neuron_model_runner.py delete mode 100644 vllm/worker/neuron_worker.py delete mode 100644 vllm/worker/neuronx_distributed_model_runner.py diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 53b5b23db3c21..597dfbf99022d 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -149,19 +149,3 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" - - - block: "Build Neuron release image" - key: block-neuron-release-image-build - depends_on: ~ - - - label: "Build and publish Neuron release image" - depends_on: block-neuron-release-image-build - agents: - queue: neuron-postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" - env: - DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh deleted file mode 100644 index a397457c83261..0000000000000 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# This script build the Neuron docker image and run the API server inside the container. -# It serves a sanity check for compilation and basic model usage. -set -e -set -v - -image_name="neuron/vllm-ci" -container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" - -HF_CACHE="$(realpath ~)/huggingface" -mkdir -p "${HF_CACHE}" -HF_MOUNT="/root/.cache/huggingface" -HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) - -NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" -mkdir -p "${NEURON_COMPILE_CACHE_URL}" -NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" - -# Try building the docker image -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws - -# prune old image and containers to save disk space, and only once a day -# by using a timestamp file in tmp. -if [ -f /tmp/neuron-docker-build-timestamp ]; then - last_build=$(cat /tmp/neuron-docker-build-timestamp) - current_time=$(date +%s) - if [ $((current_time - last_build)) -gt 86400 ]; then - # Remove dangling images (those that are not tagged and not used by any container) - docker image prune -f - # Remove unused volumes / force the system prune for old images as well. 
- docker volume prune -f && docker system prune -f - echo "$current_time" > /tmp/neuron-docker-build-timestamp - fi -else - date "+%s" > /tmp/neuron-docker-build-timestamp -fi - -docker build -t "${image_name}" -f docker/Dockerfile.neuron . - -# Setup cleanup -remove_docker_container() { - docker image rm -f "${image_name}" || true; -} -trap remove_docker_container EXIT - -# Run the image -docker run --rm -it --device=/dev/neuron0 --network bridge \ - -v "${HF_CACHE}:${HF_MOUNT}" \ - -e "HF_HOME=${HF_MOUNT}" \ - -e "HF_TOKEN=${HF_TOKEN}" \ - -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ - -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ - --name "${container_name}" \ - ${image_name} \ - /bin/bash -c " - set -e; # Exit on first error - python3 /workspace/vllm/examples/offline_inference/neuron.py; - python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; - for f in /workspace/vllm/tests/neuron/2_core/*.py; do - echo \"Running test file: \$f\"; - python3 -m pytest \$f -v --capture=tee-sys; - done - " \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 82fd22b845f09..fb3cccbb4a9c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,6 @@ include LICENSE include requirements/common.txt include requirements/cuda.txt include requirements/rocm.txt -include requirements/neuron.txt include requirements/cpu.txt include CMakeLists.txt diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron deleted file mode 100644 index 8bc23554718dc..0000000000000 --- a/docker/Dockerfile.neuron +++ /dev/null @@ -1,56 +0,0 @@ -# default base image -# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" - -FROM $BASE_IMAGE - -RUN echo "Base image is $BASE_IMAGE" - -# Install some basic utilities -RUN apt-get update && \ - apt-get install -y \ - git \ - python3 \ - python3-pip \ - ffmpeg libsm6 libxext6 libgl1 - -### Mount Point ### -# When launching the container, mount the code directory to /workspace -ARG APP_MOUNT=/workspace -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT}/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity -RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -RUN python3 -m pip install pytest - -# uninstall transformers-neuronx package explicitly to avoid version conflict -RUN python3 -m pip uninstall -y transformers-neuronx - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U \ - 'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements/neuron.txt - -ENV VLLM_TARGET_DEVICE neuron -RUN --mount=type=bind,source=.git,target=.git \ - pip install --no-build-isolation -v -e . 
- -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -# install transformers-neuronx package as an optional dependencies (for V0) -# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict -RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps - -RUN python3 -m pip install sentencepiece transformers==4.48.0 -U - -# overwrite entrypoint to run bash script -RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py - -CMD ["/bin/bash"] diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py deleted file mode 100644 index 7826629a36d01..0000000000000 --- a/examples/offline_inference/neuron.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - -def main(): - # Create an LLM. - llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=1024, - block_size=1024, - # ruff: noqa: E501 - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=2, - ) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - print("-" * 50) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - print("-" * 50) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py deleted file mode 100644 index 8b1d235ff9742..0000000000000 --- a/examples/offline_inference/neuron_eagle.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This example shows how to run offline inference with an EAGLE speculative -decoding model on neuron. To use EAGLE speculative decoding, you must use -a draft model that is specifically fine-tuned for EAGLE speculation. -Additionally, to use EAGLE with NxD Inference, the draft model must include -the LM head weights from the target model. These weights are shared between -the draft and target model. -""" - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "What is annapurna labs?", -] - - -def main(): - # Create a sampling params object. - sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) - - # Create an LLM. 
- llm = LLM( - model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", - speculative_config={ - "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", - "num_speculative_tokens": 5, - "max_model_len": 2048, - }, - max_num_seqs=4, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in neuronx-distributed-inference. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True, - }, - ) - - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}") - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py deleted file mode 100644 index c0ecfac508996..0000000000000 --- a/examples/offline_inference/neuron_int8_quantization.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os - -from vllm import LLM, SamplingParams - -# creates XLA hlo graphs for all the context length buckets. -os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" -# creates XLA hlo graphs for all the token gen buckets. -os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -# Quantizes neuron model weight to int8 , -# The default config for quantization is int8 dtype. -os.environ["NEURON_QUANT_DTYPE"] = "s8" - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - -def main(): - # Create an LLM. - llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=2048, - block_size=2048, - # ruff: noqa: E501 - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - quantization="neuron_quant", - override_neuron_config={ - "cast_logits_dtype": "bfloat16", - }, - tensor_parallel_size=2, - ) - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- print("-" * 50) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - print("-" * 50) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py deleted file mode 100644 index 26f7505f2fa53..0000000000000 --- a/examples/offline_inference/neuron_multimodal.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import requests -import torch -from neuronx_distributed_inference.models.mllama.utils import add_instruct -from PIL import Image - -from vllm import LLM, SamplingParams, TextPrompt - - -def get_image(image_url): - image = Image.open(requests.get(image_url, stream=True).raw) - return image - - -# Model Inputs -PROMPTS = [ - "What is in this image? Tell me a story", - "What is the recipe of mayonnaise in two sentences?", - "Describe this image", - "What is the capital of Italy famous for?", -] -IMAGES = [ - get_image( - "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" - ), - None, - get_image( - "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" - ), - None, -] -SAMPLING_PARAMS = [ - dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16) - for _ in range(len(PROMPTS)) -] - - -def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params): - # Prepare all inputs for mllama generation, including: - # 1. put text prompt into instruct chat template - # 2. compose single text and single image prompt into Vllm's prompt class - # 3. prepare sampling parameters - input_image = single_image - has_image = torch.tensor([1]) - if isinstance(single_image, torch.Tensor) and single_image.numel() == 0: - has_image = torch.tensor([0]) - - instruct_prompt = add_instruct(prompt, has_image) - inputs = TextPrompt(prompt=instruct_prompt) - - if input_image is not None: - inputs["multi_modal_data"] = {"image": input_image} - - sampling_params = SamplingParams(**sampling_params) - return inputs, sampling_params - - -def print_outputs(outputs): - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -def main(): - assert ( - len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS) - ), f"""Text, image prompts and sampling parameters should have the - same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, - and {len(SAMPLING_PARAMS)}""" - - # Create an LLM. 
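    # The tensor_parallel_size=32 value below assumes a 32-NeuronCore
    # instance (for example a trn1.32xlarge-class machine); smaller Neuron
    # configurations would need a matching, smaller setting.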
- llm = LLM( - model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_num_seqs=1, - max_model_len=4096, - block_size=4096, - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True, - "save_sharded_checkpoint": True, - "on_device_sampling_config": { - "global_topk": 1, - "dynamic": False, - "deterministic": False, - }, - }, - ) - - batched_inputs = [] - batched_sample_params = [] - for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS): - inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params) - # test batch-size = 1 - outputs = llm.generate(inputs, sampling_params) - print_outputs(outputs) - batched_inputs.append(inputs) - batched_sample_params.append(sampling_params) - - # test batch-size = 4 - outputs = llm.generate(batched_inputs, batched_sample_params) - print_outputs(outputs) - - -if __name__ == "__main__": - main() diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py deleted file mode 100644 index 7fc22caee742d..0000000000000 --- a/examples/offline_inference/neuron_speculation.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This example shows how to run offline inference with a speculative -decoding model on neuron. -""" - -import os - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, I am a language model and I can help", - "The president of the United States is", - "The capital of France is", -] - - -def config_buckets(): - """Configure context length and token gen buckets.""" - # creates XLA hlo graphs for all the context length buckets. - os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" - # creates XLA hlo graphs for all the token gen buckets. - os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" - - -def initialize_llm(): - """Create an LLM with speculative decoding.""" - return LLM( - model="openlm-research/open_llama_7b", - speculative_config={ - "model": "openlm-research/open_llama_3b", - "num_speculative_tokens": 4, - "max_model_len": 2048, - }, - max_num_seqs=4, - max_model_len=2048, - block_size=2048, - device="neuron", - tensor_parallel_size=32, - ) - - -def process_requests(llm: LLM, sampling_params: SamplingParams): - """Generate texts from prompts and print them.""" - outputs = llm.generate(prompts, sampling_params) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -def main(): - """Main function that sets up the llm and processes prompts.""" - config_buckets() - llm = initialize_llm() - # Create a sampling params object. 
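    # Illustrative note on the generation that follows: each step the
    # open_llama_3b draft proposes num_speculative_tokens=4 tokens, the 7b
    # target scores them in one forward pass, and the longest accepted
    # prefix is kept, so decoding can advance several tokens per target step.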
- sampling_params = SamplingParams(max_tokens=100, top_k=1) - process_requests(llm, sampling_params) - - -if __name__ == "__main__": - main() diff --git a/requirements/neuron.txt b/requirements/neuron.txt deleted file mode 100644 index 7df478eddde3f..0000000000000 --- a/requirements/neuron.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Common dependencies --r common.txt - -# Dependencies for Neuron devices -packaging>=24.2 -setuptools>=77.0.3,<80.0.0 -torch-neuronx >= 2.5.0 -neuronx-cc>=2.0.0a0 -torchvision # Required for Llama3.2 multimodal image preprocessing diff --git a/setup.py b/setup.py index 872696b250849..4ea0baa0b2204 100644 --- a/setup.py +++ b/setup.py @@ -413,8 +413,7 @@ def _no_device() -> bool: def _is_cuda() -> bool: has_cuda = torch.version.cuda is not None - return (VLLM_TARGET_DEVICE == "cuda" and has_cuda - and not (_is_neuron() or _is_tpu())) + return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu()) def _is_hip() -> bool: @@ -422,10 +421,6 @@ def _is_hip() -> bool: or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None -def _is_neuron() -> bool: - return VLLM_TARGET_DEVICE == "neuron" - - def _is_tpu() -> bool: return VLLM_TARGET_DEVICE == "tpu" @@ -470,25 +465,6 @@ def get_rocm_version(): return None -def get_neuronxcc_version(): - import sysconfig - site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", - "__init__.py") - - # Check if the command was executed successfully - with open(version_file) as fp: - content = fp.read() - - # Extract the version using a regular expression - match = re.search(r"__version__ = '(\S+)'", content) - if match: - # Return the version string - return match.group(1) - else: - raise RuntimeError("Could not find Neuron version in the output") - - def get_nvcc_cuda_version() -> Version: """Get the CUDA version from nvcc. 
@@ -541,12 +517,6 @@ def get_vllm_version() -> str: rocm_version = get_rocm_version() or torch.version.hip if rocm_version and rocm_version != MAIN_CUDA_VERSION: version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" - elif _is_neuron(): - # Get the Neuron version - neuron_version = str(get_neuronxcc_version()) - if neuron_version != MAIN_CUDA_VERSION: - neuron_version_str = neuron_version.replace(".", "")[:3] - version += f"{sep}neuron{neuron_version_str}" elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): @@ -591,8 +561,6 @@ def get_requirements() -> list[str]: requirements = modified_requirements elif _is_hip(): requirements = _read_requirements("rocm.txt") - elif _is_neuron(): - requirements = _read_requirements("neuron.txt") elif _is_tpu(): requirements = _read_requirements("tpu.txt") elif _is_cpu(): @@ -601,7 +569,7 @@ def get_requirements() -> list[str]: requirements = _read_requirements("xpu.txt") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") + "Unsupported platform, please use CUDA, ROCm, or CPU.") return requirements diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index ba8e31a79feb5..b82e839638041 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -287,15 +287,6 @@ def test_prefix_cache_default(): }, "mm-processor-kwargs" ), - ( - '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}', - { - "cast_logits_dtype": "bfloat16", - "sequence_parallel_norm": True, - "sequence_parallel_norm_threshold": 2048, - }, - "override-neuron-config" - ), ]) # yapf: enable def test_composite_arg_parser(arg, expected, option): diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py deleted file mode 100644 index 2d6e5f523cb85..0000000000000 --- a/tests/neuron/1_core/test_activation.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.activation import FastGELU, SiluAndMul -from vllm.platforms import current_platform - - -@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"]) -@pytest.mark.parametrize("num_tokens,d,dtype", [ - (7, 512, torch.half), - (7, 512, torch.float), - (83, 512, torch.half), -]) -@torch.inference_mode() -def test_act_and_mul( - activation: str, - num_tokens: int, - d: int, - dtype: torch.dtype, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device) - if activation == "silu_and_mul": - layer = SiluAndMul() - fn = layer.forward_native - elif activation == "gelu_fast": - layer = FastGELU() - fn = F.gelu - else: - raise NotImplementedError( - f"activation {activation} is not implemented.") - assert x.is_xla, "input tensor under testing is expected to be XLA tensor." 
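    # For reference, SiluAndMul splits the last dimension in half and
    # computes silu(x1) * x2; an equivalent pure-PyTorch form (illustrative):
    #   d = x.shape[-1] // 2
    #   ref = F.silu(x[..., :d]) * x[..., d:]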
- out = layer.to(device=device).forward_neuron(x) - ref_out = fn(x.cpu()) - torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py deleted file mode 100644 index efec56360c142..0000000000000 --- a/tests/neuron/1_core/test_block_table.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import neuronxcc.nki.language as nl -import pytest -import torch -import torch.nn.functional as F -from neuronxcc import nki - -from vllm.attention.ops.nki_flash_attn import ( - load_block_tables, transform_block_tables_for_indirect_load) - - -def is_power_of_2(n): - return n > 0 and (n & (n - 1) == 0) - - -def nki_load_and_transform_block_tables( - block_tables, - num_tiles, - num_blocks_per_tile, - num_head, - head_id, - block_size_tiling_factor, -): - assert is_power_of_2( - num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2" - block_tables_sbuf = load_block_tables(block_tables, num_tiles, - num_blocks_per_tile) - - # we need to pass an Index as head_id - head_id = nl.arange(1)[None, :] + head_id - - block_tables_transposed = transform_block_tables_for_indirect_load( - block_tables_sbuf, block_size_tiling_factor, num_head, head_id) - B_P_SIZE = 128 - assert block_tables_transposed.shape[1] == B_P_SIZE - - out = nl.ndarray( - block_tables_transposed.shape, - dtype=nl.int32, - buffer=nl.shared_hbm, - ) - for i in nl.affine_range(block_tables_transposed.shape[0]): - nl.store(dst=out[i], value=block_tables_transposed[i]) - return out - - -def ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - num_head, - head_id, - block_size_tiling_factor, -): - assert block_tables.numel() == num_tiles * num_blocks_per_tile - block_tables = block_tables.view(num_tiles, num_blocks_per_tile) - B_F_SIZE = 128 - num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE - block_tables = F.pad( - block_tables, - (0, 0, 0, num_tiles_padded - num_tiles), - "constant", - 0, - ) - - block_tables = block_tables * num_head + head_id - block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1) - offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1) - block_tables = block_tables * block_size_tiling_factor + offset - block_tables_transposed = block_tables.view(num_tiles_padded, -1).t() - - num_blocks_per_tile = block_tables_transposed.shape[0] - assert num_blocks_per_tile % B_F_SIZE == 0 - return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE, - B_F_SIZE, num_tiles_padded) - - -@pytest.mark.parametrize( - "q_head_per_kv_head,head_id", - [ - (1, 0), - (3, 1), - ], -) -@pytest.mark.parametrize( - "num_tiles,num_blocks_per_tile", - [ - (1, 1), - (13, 16), - (17, 128), - (35, 512), - (128, 128), - (130, 64), - (280, 256), - (315, 1), - ], -) -@torch.inference_mode() -def test_load_and_transform_block_tables( - monkeypatch: pytest.MonkeyPatch, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - - compiler_flags_str = " ".join([ - "-O1", - "--retry_failed_compilation", - ]) - with monkeypatch.context() as m: - m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) - - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - 
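            # (Illustrative) with num_blocks_per_tile=16 the factor computed
            # below is 128 // 16 = 8, so each block is split eight ways and a
            # single DMA load still spans all 128 SBUF partitions.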
assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py deleted file mode 100644 index 670889ad6b58d..0000000000000 --- a/tests/neuron/1_core/test_cache.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.attention.ops.nki_flash_attn import reshape_and_cache - - -@pytest.mark.parametrize( - "num_tokens, n_kv_head, d_head, num_blocks, block_size", - [ - # Small model configuration (e.g., GPT-2 small) - (32, 12, 64, 4, 128), # Typical sequence processing - (1, 12, 64, 4, 128), # Single token update - (128, 12, 64, 4, 128), # Longer sequence - - # Medium model configuration (e.g., GPT-2 medium) - (64, 16, 96, 8, 256), # Standard batch - (256, 16, 96, 8, 256), # Large batch - - # Large model configuration (e.g., GPT-3 style) - (48, 32, 128, 16, 512), # Typical processing window - (512, 32, 128, 16, 512), # Full context window - - # Edge cases and stress tests - (1024, 8, 32, 32, 32), # Many tokens, small heads - (16, 64, 256, 4, 64), # Few tokens, many heads - (2048, 24, 128, 64, 128), # Large scale test - - # Minimal configurations for debugging - (4, 2, 16, 2, 16), # Tiny test case - (1, 1, 8, 1, 8), # Minimal possible - ]) -def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks, - block_size): - # Set random seed for reproducibility - torch.manual_seed(42) - - # Create CPU tensors for reference implementation - key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt( - torch.tensor(d_head)) - value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt( - torch.tensor(d_head)) - key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head) - value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head) - slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens] - - # Run reference implementation on CPU - block_indices = torch.div(slot_mapping_cpu, - block_size, - rounding_mode="floor") - block_offsets = slot_mapping_cpu % block_size - - for i in range(num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i] - value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i] - - # Create XLA device tensors - device = torch.device('xla') - key = key_cpu.to(device) - value = value_cpu.to(device) - key_cache = torch.zeros_like(key_cache_cpu, device=device) - value_cache = torch.zeros_like(value_cache_cpu, device=device) - slot_mapping = slot_mapping_cpu.to(device) - kv_cache = torch.stack([key_cache, value_cache]) - - # Run vectorized implementation on XLA device - reshape_and_cache(key, value, kv_cache, slot_mapping) - key_cache, 
value_cache = torch.unbind(kv_cache, dim=0) - - # Move results back to CPU for comparison - key_cache_result = key_cache.cpu() - value_cache_result = value_cache.cpu() - - # Assert results match - torch.testing.assert_close(key_cache_result, - key_cache_cpu, - rtol=1e-5, - atol=1e-5) - torch.testing.assert_close(value_cache_result, - value_cache_cpu, - rtol=1e-5, - atol=1e-5) diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py deleted file mode 100644 index c6fce1d1a0630..0000000000000 --- a/tests/neuron/1_core/test_layernorm.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform - - -@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [ - (7, 8, False, torch.half), - (83, 768, False, torch.half), - (83, 768, True, torch.half), - (83, 768, True, torch.bfloat16), - (83, 768, True, torch.float32), -]) -@torch.inference_mode() -def test_rms_norm( - num_tokens: int, - hidden_size: int, - add_residual: bool, - dtype: torch.dtype, -) -> None: - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - layer = RMSNorm(hidden_size).to(dtype=dtype) - layer.weight.data.normal_(mean=1.0, std=0.1) - scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device) - x *= scale - residual = torch.randn_like(x) * scale if add_residual else None - - residual_cpu = residual.cpu() if add_residual else None - ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu) - assert x.is_xla, "input tensor under testing is expected to be XLA tensor." - out = layer.to(device=device)(x, residual) - - # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger - # numerical errors than other operators because they involve reductions. - # Therefore, we use a larger tolerance. 
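    # RMSNorm reference, for comparison (illustrative):
    #   y = x / sqrt(mean(x**2, dim=-1, keepdim=True) + eps) * weight
    # the mean over hidden_size is the reduction where rounding error grows.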
- if add_residual: - assert out[0].is_xla, "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out[0].cpu(), - ref_out[0], - atol=1e-2, - rtol=1e-2) - torch.testing.assert_close(out[1].cpu(), - ref_out[1], - atol=1e-2, - rtol=1e-2) - else: - assert out.is_xla, "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2) diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py deleted file mode 100644 index ce9eadf5a883e..0000000000000 --- a/tests/neuron/1_core/test_logits_processor.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import patch - -import pytest -import torch - -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available - - -class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size, scale=scale) - self.fake_logits = fake_logits.clone() - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) - logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) - return input_tensor, fake_logits, logits_processor - - -RANDOM_SEEDS = list(range(8)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_logits_processors(seed: int): - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - set_random_seed(seed) - torch.set_default_device("cpu") - batch_size = random.randint(1, 256) - input_tensor, fake_logits, logits_processor = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
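    # Concretely (illustrative, assuming the processor sees only the tokens
    # generated so far): step one has len(token_ids) == 0 and forces token 0,
    # step two sees [0] and forces token 1, giving [0, 1, 2, ...].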
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - seq_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - - fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py deleted file mode 100644 index 5f3268810f9fe..0000000000000 --- a/tests/neuron/1_core/test_neuron_model_runner.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os -from unittest.mock import MagicMock - -from vllm.config import VllmConfig -from vllm.engine.arg_utils import EngineArgs -from vllm.platforms import current_platform -from vllm.platforms.neuron import NeuronFramework -from vllm.sampling_params import SamplingParams -from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.worker.neuron_model_runner import NeuronModelRunner - -os.environ[ - 'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value - - -def _create_neuron_model_runner(model: str, *args, - **kwargs) -> NeuronModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - vllm_config = VllmConfig( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - ) - neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config) - return neuron_model_runner - - -def test_update_neuron_sampling_params_not_full_batch(): - os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0" - model_runner = _create_neuron_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - max_num_seqs=2, - ) - assert not model_runner._on_device_sampling_disabled - # Test sampling param updating only when TNx is framework - # NxDI handles sampling parameter updating inside model - if current_platform.use_transformers_neuronx(): - model_mock = MagicMock() - model_runner.model = model_mock - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0.5, - top_k=1, - top_p=0.5), - block_tables={0: [1]}, - ) - ] - - model_runner.prepare_model_input(seq_group_metadata_list) - - # Index neuron sampling parameters based on block_tables indices. - # The first block_id of the sequence 0 is 1, so its parameters are - # placed at index 1. So the sampling parameters will be: - # Index 0: default sampling parameters - # Index 1: sequecne 0's sampling parameters. 
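        # (Index 0 therefore keeps the defaults asserted below: temperature
        # 1.0, top_k of _MAX_NEURON_SAMPLING_TOP_K, and top_p 1.0.)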
- neuron_sampling_params = ( - model_runner.model_config.neuron_sampling_params) - assert neuron_sampling_params.temperature == [1.0, 0.5] - assert neuron_sampling_params.top_k == [ - model_runner._MAX_NEURON_SAMPLING_TOP_K, 1 - ] - assert neuron_sampling_params.top_p == [1.0, 0.5] - model_mock.model.update_generation_config.assert_called_once_with( - neuron_sampling_params) - - -def test_update_neuron_sampling_params_full_batch(): - os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0" - model_runner = _create_neuron_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - max_num_seqs=2, - ) - assert not model_runner._on_device_sampling_disabled - - # Test sampling param updating only when TNx is framework - # NxDI handles sampling parameter updating inside model - if current_platform.use_transformers_neuronx(): - model_mock = MagicMock() - model_runner.model = model_mock - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0.5, - top_k=1, - top_p=0.5), - block_tables={0: [1]}, - ), - SequenceGroupMetadata( - request_id="test_0", - is_prompt=True, - seq_data={1: SequenceData.from_seqs([4, 5, 6])}, - sampling_params=SamplingParams(temperature=0.2, - top_k=2, - top_p=0.2), - block_tables={1: [0]}, - ) - ] - - model_runner.prepare_model_input(seq_group_metadata_list) - - # Index neuron sampling parameters based on block_tables indices. - # The first block_id of the sequence 0 is 1, so its parameters are - # placed at index 1. So the sampling parameters will be: - # Index 0: sequence 1's sampling parameters - # Index 1: sequecne 0's sampling parameters. - neuron_sampling_params = ( - model_runner.model_config.neuron_sampling_params) - assert neuron_sampling_params.temperature == [0.2, 0.5] - assert neuron_sampling_params.top_k == [2, 1] - assert neuron_sampling_params.top_p == [0.2, 0.5] - model_mock.model.update_generation_config.assert_called_once_with( - neuron_sampling_params) diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py deleted file mode 100644 index 0863002695928..0000000000000 --- a/tests/neuron/1_core/test_neuron_quant.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.model_executor.layers.quantization.neuron_quant import ( - NeuronQuantConfig) - - -def test_get_supported_act_dtypes(): - neuron_quant_config = NeuronQuantConfig() - supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() - target_list = ["any_dtype1", "any_dtype2"] - for dtype in target_list: - assert dtype in supported_act_dtypes diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py deleted file mode 100644 index abf7febc2955c..0000000000000 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ /dev/null @@ -1,514 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest -import torch -import torch.nn.functional as F - -from vllm.utils import cdiv - - -class BlockDiagonalCausalFromBottomRightMask: - - @staticmethod - def _from_seqlens(query_lens, seq_lens, block_size=None): - from torch import logical_and, logical_or - - contexted = block_size is None - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - n_queries = sum(query_lens) - 
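        # Worked example (illustrative): query_lens=[2], seq_lens=[4] aligns
        # the causal mask to the bottom-right of a 2x4 grid:
        #   [[1, 1, 1, 0],
        #    [1, 1, 1, 1]]
        # the last query attends to every key, the one before it to all but
        # the final key.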
num_seqs = len(query_lens) - if contexted: - key_lens_blockaligned = seq_lens - else: - n_blocks_per_seq = (context_lens + block_size - 1) // block_size - offset_per_seq = n_blocks_per_seq * block_size - key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() - n_keys = sum(key_lens_blockaligned) - - a = (torch.arange(n_queries).reshape(n_queries, - 1).expand(n_queries, n_keys)) - b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) - q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) - k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) - - prior_mask = torch.zeros(n_queries, n_keys) - new_masks: list[torch.Tensor] = [] - for seq_id in range(num_seqs): - ri = q_cumsum[seq_id] - ci = k_cumsum[seq_id] - nr = query_lens[seq_id] - - if contexted: - nc = seq_lens[seq_id] - a_offset = ci + nc - ri - nr - new_mask = (a + a_offset) >= b - else: - nc = context_lens[seq_id] - a_offset = ci + nc - 1 - new_mask = a_offset >= b - - left_mask = b >= ci - top_mask = a >= ri - bottom_mask = a < (ri + nr) - - new_mask = logical_and( - logical_and(logical_and(new_mask, left_mask), top_mask), - bottom_mask, - ) - prior_mask = logical_or(prior_mask, new_mask) - new_masks = new_masks + [new_mask] - return prior_mask - - @staticmethod - def from_seqlens(query_lens, seq_lens, block_size=None): - contexted = block_size is None - if contexted: - prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, seq_lens) - active_mask = None - else: - prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, seq_lens, block_size) - active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( - query_lens, query_lens) - return prior_mask, active_mask - - -def ref_softmax(x: torch.Tensor, - dim: int, - mixed_precision=False, - return_max_reduce=False): - max_value = torch.amax(x, dim=dim, keepdims=True) - exp = torch.exp(x - max_value) - if mixed_precision: - sum_value = torch.sum(exp.astype(torch.float32), - dim=dim, - keepdims=True).astype(x.dtype) - else: - sum_value = torch.sum(exp, dim=dim, keepdims=True) - if return_max_reduce: - return exp / sum_value, max_value, torch.reciprocal(sum_value) - return exp / sum_value - - -def ref_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - attn_mask: Optional[torch.Tensor] = None, - return_max_reduce: Optional[bool] = False, -) -> torch.Tensor: - scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() - if attn_mask is not None: - masked_score = scaled_qk + attn_mask.float() - if return_max_reduce: - norm_score, cached_max, cached_sum_reciprocal = ref_softmax( - masked_score, dim=-1, return_max_reduce=True) - else: - norm_score = ref_softmax(masked_score, dim=-1) - out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value) - if return_max_reduce: - return ( - out, - cached_max, - cached_sum_reciprocal, - norm_score, - masked_score, - scaled_qk, - ) - else: - return (out, ) - - -def ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, -): - scale = float(1.0 / (head_size**0.5)) - if num_queries_per_kv > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) - - attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens) - - # convert binary mask to -inf values - attn_mask = 
torch.logical_not(attn_mask) - attn_mask = attn_mask.float() * -30000 - - output, *debug_tensors = ref_masked_attention( - query, - key, - value, - scale, - attn_mask, - return_max_reduce=return_max_reduce, - ) - - output = output.unsqueeze(1) - if return_max_reduce: - cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = ( - debug_tensors) - return ( - output, - cached_max, - cached_sum_reciprocal, - lse, - masked_score, - scaled_qk, - ) - else: - return output - - -def sample_inputs( - prefill_batch_size, - decode_batch_size, - min_query_len, - max_query_len, - min_ctx_len, - max_ctx_len, - block_size, - num_heads, - num_kv_heads, - head_size, - dtype, -): - batch_size = prefill_batch_size + decode_batch_size - max_model_len = (max_query_len + max_ctx_len) * 4 - max_block_per_request = max_model_len // block_size - cache_size = (batch_size * max_block_per_request) + 2 - prefill_ctx_lens = torch.randint(min_ctx_len, - max_ctx_len + 1, (prefill_batch_size, ), - dtype=torch.long).tolist() - decode_ctx_lens = torch.randint(min_ctx_len, - max_ctx_len + 1, (decode_batch_size, ), - dtype=torch.long).tolist() - ctx_lens = prefill_ctx_lens + decode_ctx_lens - query_lens = torch.randint( - min_query_len, - max_query_len + 1, - (prefill_batch_size, ), - dtype=torch.long, - ).tolist() + [1 for _ in range(decode_batch_size)] - seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] - - num_tokens = sum(query_lens) - query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - query.uniform_(-1, 1) - torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - - kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) - kv.uniform_(-1, 1) - key, value = kv.unbind(dim=1) - - k_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=dtype) - v_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=dtype) - k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - values = torch.arange(0, cache_size, dtype=torch.long) - values = values[torch.randperm(cache_size)] - block_table = values[:batch_size * max_block_per_request].view( - batch_size, max_block_per_request) - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], - dtype=torch.long), - dim=0) - # copy kv to cache - b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long), - dim=0) - for i in range(batch_size): - for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + - j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + - b_ctx_len[i] + j]) - cur_ctx = 0 - block_id = 0 - while cur_ctx < b_ctx_len[i]: - start_loc = b_seq_start_loc[i] + cur_ctx - if cur_ctx + block_size > b_ctx_len[i]: - end_loc = b_seq_start_loc[i] + b_ctx_len[i] - else: - end_loc = start_loc + block_size - start_slot = block_table[i, block_id] * block_size - end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) - cur_ctx += block_size - block_id += 1 - kv_cache = torch.stack([k_cache, v_cache]) - - return ( - query, - k, - v, - kv_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) - - -def get_active_block_tables(block_tables, query_lens, seq_lens, 
block_size, - num_blocks): - context_lens = seq_lens - query_lens - blocks_per_seq = (context_lens + block_size - 1) // block_size - num_seqs = len(seq_lens) - active_blocks: list[int] = [] - for seq_id in range(num_seqs): - active_blocks = ( - active_blocks + - block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) - return F.pad( - torch.tensor(active_blocks, dtype=torch.int32), - (0, num_blocks - len(active_blocks)), - "constant", - 0, - ) - - -@pytest.mark.parametrize( - "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision", - [ - # Test minimal configurations (small block size) - (1, 199, 1, 512, 4, 2, 8, False - ), # minimal block size, small dimensions - (1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision - - # Test common/medium configurations - (4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads - (4, 12, 32, 2048, 16, 4, 32, - True), # medium size, mixed precision, grouped-query attention (GQA) - - # Test large configurations - (4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size - (4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads - - # Test asymmetric configurations - (2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes - (8, 8, 128, 2048, 24, 2, 48, True), # balanced batches - - # Test edge cases - (1, 128, 16, 1024, 4, 2, 16, False), # large decode batch - (16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch - (4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA) - (4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA) - ]) -@torch.inference_mode() -def test_contexted_kv_attention( - monkeypatch: pytest.MonkeyPatch, - prefill_batch_size: int, - decode_batch_size: int, - num_heads: int, - num_queries_per_kv: int, - head_size: int, - block_size: int, - large_tile_size, - mixed_precision: bool, -) -> None: - - import torch_xla.core.xla_model as xm - - from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc, - reorder_context_mask) - - assert large_tile_size % block_size == 0 - - device = xm.xla_device() - - compiler_flags_str = " ".join([ - "-O1", - "--retry_failed_compilation", - ]) - with monkeypatch.context() as m: - m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 - - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - kv_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) - - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) - - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - - def pad_to_multiple(a, b): - return cdiv(a, b) * b - - def pad_to_next_power_of_2(a): - assert a > 0 - return 2**int(a - 1).bit_length() - - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = 
torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = cdiv(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert ( - context_kv_len % - large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( - 0, - context_kv_len - prior_mask.shape[1], - 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( - 0, - max_num_queries - active_mask.shape[1], - 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], - dim=1) - - attn_mask = reorder_context_mask(attn_mask, large_tile_size, - block_size) - - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - kv_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) - - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) - - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose( - 0, 1)[0, :num_actual_tokens, :, :] - - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py deleted file mode 100644 index a7ac79729986d..0000000000000 --- a/tests/neuron/1_core/test_rotary_embedding.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding -from vllm.platforms import current_platform - - -@pytest.mark.parametrize( - "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [ - (16, False, 32, 32, 1024, True), - (16, False, 32, 128, 1024, True), - (16, True, 32, 32, 1024, True), - (16, True, 32, 128, 1024, True), - (16, False, 32, 
128, 1024, False), - (16, True, 32, 128, 1024, False), - ]) -def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim, - head_size, seq_len, use_key): - import torch_xla.core.xla_model as xm - - device = xm.xla_device() - current_platform.seed_everything(0) - torch.set_default_device("cpu") - - batch_size = 1 - base = 10000 - num_heads = 8 - - rot = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, torch.float32) - - positions = torch.randint(0, - max_position, (batch_size, seq_len), - device="cpu") - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=torch.float32, - device="cpu") - key = torch.randn_like(query) if use_key else None - assert positions.is_cpu, \ - "reference input tensor is expected to be CPU tensor." - ref_query, ref_key = rot.to(device="cpu").forward_native( - positions, query, key) - out_query, out_key = rot.to(device=device).forward_neuron( - positions.to(device=device), query.to(device=device), - key.to(device=device) if key is not None else None) - if use_key: - assert out_query.is_xla and out_key.is_xla, \ - "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out_key.cpu(), - ref_key, - atol=1e-2, - rtol=1e-2) - else: - assert out_key is None, "expected returned key to be None" - assert out_query.is_xla, \ - "output tensor is expected to be XLA tensor" - torch.testing.assert_close(out_query.cpu(), - ref_query, - atol=1e-2, - rtol=1e-2) diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py deleted file mode 100644 index 85a48dae58aaf..0000000000000 --- a/tests/neuron/2_core/test_comm_ops.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools -from typing import Callable -from unittest.mock import patch - -import pytest -import torch -import torch_xla.distributed.xla_multiprocessing as xmp -from typing_extensions import ParamSpec - -from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.utils import get_distributed_init_method, get_open_port - -_P = ParamSpec("_P") - - -def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]: - """Decorator to reinitialize the Neuron Runtime before executing a test. - This is necessary for distributed tests which need to reallocate Neuron - Cores to separate subprocesses. 
- """ - - @functools.wraps(f) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: - runtime = torch.classes.neuron.Runtime() - runtime.initialize() - runtime.unsafe_close() - - f(*args, **kwargs) - runtime.initialize() - - return wrapper - - -def all_gather_test_worker(index, tp_degree, distributed_init_method): - init_distributed_environment(tp_degree, - index, - distributed_init_method, - index, - backend="xla") - ensure_model_parallel_initialized(tp_degree, 1) - - num_dimensions = 3 - tensor_size = list(range(2, num_dimensions + 2)) - total_size = 1 - for s in tensor_size: - total_size *= s - - all_gather_dimension = -1 - all_tensors = [ - torch.arange(total_size, dtype=torch.float32, - device="xla").reshape(tensor_size) * (r + 1) - for r in range(tp_degree) - ] - expected = torch.cat(all_tensors, dim=all_gather_dimension) - t = all_tensors[index % tp_degree] - t = tensor_model_parallel_all_gather(t, all_gather_dimension) - torch.testing.assert_close(t, expected) - - -def all_reduce_test_worker(index, tp_degree, distributed_init_method): - init_distributed_environment(tp_degree, - index, - distributed_init_method, - index, - backend="xla") - ensure_model_parallel_initialized(tp_degree, 1) - - num_elements = 8 - all_tensors = [ - torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1) - for r in range(tp_degree) - ] - expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) - t = all_tensors[index % tp_degree] - t = tensor_model_parallel_all_reduce(t) - torch.testing.assert_close(t, expected) - - -@pytest.mark.parametrize("tp_size", [2]) -@pytest.mark.parametrize("test_target", - [all_reduce_test_worker, all_gather_test_worker]) -@reinitialize_neuron_runtime -def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size, - test_target): - - with patch('torch_xla._XLAC._xla_runtime_is_initialized', - return_value=False): - distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) - - monkeypatch.setenv("VLLM_USE_V1", "1") - monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size)) - monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES", - ','.join(['1' for _ in range(tp_size)])) - - xmp.spawn(test_target, args=(tp_size, distributed_init_method)) diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py deleted file mode 100644 index cac642af03101..0000000000000 --- a/tests/neuron/2_core/test_eagle.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -import os -import shutil -import tempfile - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from vllm import LLM, SamplingParams - - -def patch_eagle_draft_with_lm_head(target_model_id: str, - draft_model_id: str) -> str: - # In NxDI, draft model checkpoint must include lm_head weights from target - # model. 
For more details see https://awsdocs-neuron.readthedocs-hosted.com - # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html - # #eagle-checkpoint-compatibility - final_draft_dir = "/tmp/patched_eagle_draft" - - with tempfile.TemporaryDirectory() as tmp_dir: - target_dir = snapshot_download(repo_id=target_model_id, - local_dir=os.path.join( - tmp_dir, "target")) - draft_dir = snapshot_download(repo_id=draft_model_id, - local_dir=os.path.join(tmp_dir, "draft")) - - lm_head_key = "lm_head.weight" - index_path = os.path.join(target_dir, "model.safetensors.index.json") - with open(index_path) as f: - index = json.load(f) - shard_name = index["weight_map"][lm_head_key] - target_safetensor_path = os.path.join(target_dir, shard_name) - - with safe_open(target_safetensor_path, framework="pt") as f: - target_lm_head = f.get_tensor(lm_head_key) - - draft_path = os.path.join(draft_dir, "pytorch_model.bin") - draft_state_dict = torch.load(draft_path, map_location="cpu") - draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16) - torch.save(draft_state_dict, draft_path) - - shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True) - - return final_draft_dir - - -def test_eagle(): - patched_draft_path = patch_eagle_draft_with_lm_head( - target_model_id="meta-llama/Llama-2-7b-hf", - draft_model_id="yuhuili/EAGLE-llama2-chat-7B") - llm = LLM( - model="meta-llama/Llama-2-7b-hf", - speculative_config={ - "model": patched_draft_path, - "num_speculative_tokens": 5, - "max_model_len": 128 - }, - max_num_seqs=1, - max_model_len=128, - tensor_parallel_size=2, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True, - "fused_qkv": True - }, - ) - prompts = [ - "The president of the United States is", - ] - outputs = llm.generate(prompts, SamplingParams(top_k=1)) - expected_output = " the head of state and head of government of " \ - "the United States. The president direct" - - for output in outputs: - generated_text = output.outputs[0].text - print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") - assert (expected_output == generated_text) - - print("Neuron Eagle speculation test passed.") diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py deleted file mode 100644 index ff59be1725b6c..0000000000000 --- a/tests/neuron/2_core/test_mistral.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm import LLM, SamplingParams - - -def test_mistral(): - llm = LLM(model="mistralai/Mistral-7B-v0.1", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=128, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True - }) - - # Send more prompts than the compiled batch size (4) and request - # varying generation lengths to test accuracy related to Neuron - # specific sequence id sorting. 
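    # Illustrative reading of the comment above: the varying max_tokens make
    # sequences finish at different steps, so batch slots are freed and
    # refilled mid-run, which exercises the Neuron-specific sequence-id
    # sorting.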
- prompts = [ - "The president of the United States is", - "The capital of France is", - "What is Annapurna labs?", - "I believe the meaning of life is", - "Tell me a story about a brave knight", - "Hello, my name is Llama", - ] - - sampling_params = [ - SamplingParams(top_k=1, max_tokens=10), - SamplingParams(top_k=1, max_tokens=20), - SamplingParams(top_k=1, max_tokens=30), - SamplingParams(top_k=1, max_tokens=40), - SamplingParams(top_k=1, max_tokens=50), - SamplingParams(top_k=1, max_tokens=60) - ] - - outputs = llm.generate(prompts, sampling_params) - - expected_outputs = [ - " the most powerful person in the world. He is", - " a city of many faces. It is a city of history, culture, art, " - "fashion, and", - "\n\nAnnapurna Labs is a semiconductor company that was founded " - "in 2013 by Amazon. The company is", - " to be happy.\n\nI believe that happiness is a choice.\n\nI " - "believe that happiness is a state of mind.\n\nI believe that " - "happiness is a journey.\n\nI believe", - " who rescued a princess from a dragon.\n\nTell me a story about" - " a princess who rescued herself from a dragon.\n\nTell me a " - "story about a princess who rescued herself from a dragon and " - "then rescued a knight from", - " and I am a 10 year old male. I am a very friendly and " - "affectionate boy who loves to be around people. I am a very " - "active boy who loves to play and run around. I am a very smart " - "boy who loves to learn new things. I am a very loyal boy" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") - assert (expected_output == generated_text) - - print("Neuron Mistral test passed.") diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py deleted file mode 100644 index 52ca9fe7b6667..0000000000000 --- a/tests/neuron/2_core/test_multi_lora.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from huggingface_hub import snapshot_download - -from vllm import LLM, SamplingParams -from vllm.lora.request import LoRARequest - - -def test_llama_single_lora(): - sql_lora_files = snapshot_download( - repo_id="yard1/llama-2-7b-sql-lora-test") - llm = LLM(model="meta-llama/Llama-2-7b-hf", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=512, - override_neuron_config={ - "sequence_parallel_enabled": False, - "skip_warmup": True, - "lora_modules": [{ - "name": "lora_id_1", - "path": sql_lora_files - }] - }, - enable_lora=True, - max_loras=1, - max_lora_rank=256, - device="neuron") - """For multi-lora requests using NxDI as the backend, only the lora_name - needs to be specified. The lora_id and lora_path are supplied at the LLM - class/server initialization, after which the paths are handled by NxDI""" - lora_req_1 = LoRARequest("lora_id_1", 0, " ") - prompts = [ - "The president of the United States is", - "The capital of France is", - ] - outputs = llm.generate(prompts, - SamplingParams(top_k=1), - lora_request=[lora_req_1, lora_req_1]) - - expected_outputs = [ - " the head of state and head of government of the United States. " - "The president direct", - " a city of contrasts. 
The city is home to the Eiffel Tower" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - assert (expected_output == generated_text) - - -def test_llama_multiple_lora(): - sql_lora_files = snapshot_download( - repo_id="yard1/llama-2-7b-sql-lora-test") - llm = LLM(model="meta-llama/Llama-2-7b-hf", - tensor_parallel_size=2, - max_num_seqs=4, - max_model_len=512, - override_neuron_config={ - "sequence_parallel_enabled": - False, - "skip_warmup": - True, - "lora_modules": [{ - "name": "lora_id_1", - "path": sql_lora_files - }, { - "name": "lora_id_2", - "path": sql_lora_files - }] - }, - enable_lora=True, - max_loras=2, - max_lora_rank=256, - device="neuron") - """For multi-lora requests using NxDI as the backend, only the lora_name - needs to be specified. The lora_id and lora_path are supplied at the LLM - class/server initialization, after which the paths are handled by NxDI""" - lora_req_1 = LoRARequest("lora_id_1", 0, " ") - lora_req_2 = LoRARequest("lora_id_2", 1, " ") - prompts = [ - "The president of the United States is", - "The capital of France is", - ] - outputs = llm.generate(prompts, - SamplingParams(top_k=1), - lora_request=[lora_req_1, lora_req_2]) - - expected_outputs = [ - " the head of state and head of government of the United States. " - "The president direct", - " a city of contrasts. The city is home to the Eiffel Tower" - ] - - for expected_output, output in zip(expected_outputs, outputs): - generated_text = output.outputs[0].text - assert (expected_output == generated_text) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py deleted file mode 100644 index 29fa432017616..0000000000000 --- a/vllm/attention/ops/nki_flash_attn.py +++ /dev/null @@ -1,903 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import neuronxcc.nki.isa as nisa -import neuronxcc.nki.language as nl -import numpy as np -import torch -from neuronxcc import nki -from neuronxcc.nki.language import par_dim - -from vllm.utils import cdiv - - -def is_power_of_2(x): - return x > 0 and (x & (x - 1)) == 0 - - -@nki.jit -def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile): - """ - Load block tables from HBM into SRAM - - `block_tables_hbm` has shape `(num_tiles * num_blocks_per_tile, )`. - In case `num_tiles > B_P_SIZE`, we need further tile `num_tile` dimension. - """ - B_P_SIZE = 128 - - # reshape as `(num_tiles, num_blocks_per_tile)` - assert len(block_tables_hbm.shape) == 1 - (num_total_blocks, ) = block_tables_hbm.shape - assert num_blocks_per_tile * num_tiles == num_total_blocks - block_tables_hbm = block_tables_hbm.reshape( - (num_tiles, num_blocks_per_tile)) - - block_tables_sbuf = nl.zeros( - (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), - dtype=nl.int32, - ) - for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(num_blocks_per_tile)[None, :] - block_tables_sbuf[i, i_p, i_f] = nl.load( - block_tables_hbm[i_p + i * B_P_SIZE, i_f], - dtype=nl.int32, - mask=(i_p + i * B_P_SIZE < num_tiles), - ) - return block_tables_sbuf - - -@nki.jit -def transform_block_tables_for_indirect_load( - block_tables, - block_size_tiling_factor, - num_head, - head_id, -): - """ - This function does two things: - 1. calculate new `block_tables` for a `head_id` after flattening - `num_block`, `num_head`, and `block_size_tiling_factor` dimensions - 2. 
transpose the result so that `block_table` for each tile is mapped to - SBUF Partition dimension for vectorized DMA - - Tiling trick to further improve DMA performance: - Given KV cache shape `(num_block, num_head, block_size, D)`, when loading M - blocks of a given `head_id` from HBM, the load `cache[block_tables, - head_id]` has shape `(M, block_size, D)`. If M < B_P_SIZE = 128, DMA may not - fully utilize hardware parallelization. The solution is to tile `block_size` - into `(block_size_tiling_factor, tiled_block_size)` s.t. `M * - block_size_tiling_factor = B_P_SIZE`. After tiling, KV cache has shape - `(num_block, num_head, block_size_tiling_factor, tiled_block_size, D)`. - - Note: - We don't further tile D dimension as small DMA size also hurts performance. - """ - B_P_SIZE = 128 - num_partitions, num_tiles_per_partition, num_blocks_per_tile = ( - block_tables.shape) - assert num_tiles_per_partition == B_P_SIZE - assert is_power_of_2( - num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2" - - num_loads = cdiv(num_blocks_per_tile, B_P_SIZE) - block_tables_transposed = nl.ndarray( - ( - num_loads, - par_dim(B_P_SIZE), - num_partitions * num_tiles_per_partition, - ), - dtype=nl.int32, - ) - - # prepare iota ahead of time to avoid repeatedly using Gpsimd - if num_head > 1: - head_id = nisa.iota(head_id, dtype=nl.int32).reshape((1, 1)) - head_id = nl.transpose( - head_id.broadcast_to((1, num_tiles_per_partition))) - if num_blocks_per_tile > 1: - head_id = head_id.broadcast_to( - (num_tiles_per_partition, num_blocks_per_tile)) - - if block_size_tiling_factor > 1: - broadcast_shape = ( - num_tiles_per_partition, - num_blocks_per_tile, - block_size_tiling_factor, - ) - offset = nisa.iota(nl.arange(block_size_tiling_factor)[None, None, :], - dtype=nl.int32).broadcast_to(broadcast_shape) - - for partition_id in nl.affine_range(num_partitions): - block_tables_partition = block_tables[partition_id] - if num_head > 1: - # fuse num_block and num_head dimension - block_tables_partition = block_tables_partition * num_head + head_id - - if block_size_tiling_factor > 1: - # need to apply block size tiling trick - assert num_blocks_per_tile * block_size_tiling_factor == B_P_SIZE - block_tables_partition = ((block_tables_partition * - block_size_tiling_factor).reshape( - (num_tiles_per_partition, - num_blocks_per_tile, - 1)).broadcast_to(broadcast_shape)) - new_block_tables = block_tables_partition + offset - new_block_tables = new_block_tables.reshape( - (num_tiles_per_partition, B_P_SIZE)) - else: - new_block_tables = block_tables_partition - - # transpose the block table so that it can be used by vector DGE - for i in nl.affine_range(num_loads): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = (partition_id * num_tiles_per_partition + - nl.arange(num_tiles_per_partition)[None, :]) - block_tables_transposed[i, i_p, i_f] = nl.transpose( - new_block_tables[:, nl.ds(i * B_P_SIZE, B_P_SIZE)]) - return block_tables_transposed - - -@nki.jit -def load_kv_tile_from_cache( - cur_k_tile, - cur_v_tile, - kv_cache, - block_tables, - large_k_tile_idx, - num_blocks_per_large_tile, - tiled_block_size, - B_P_SIZE, - B_D_SIZE, -): - """ - Load KV cache and transform Key and Value into layout required by Matmul - - Vectorized DMA Load layout: - Key and Value: (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE) - - Layout used by attention matmuls: - Key: (par_dim(B_D_SIZE), seqlen_kv) - Value: (seqlen_kv // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE) - equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * 
B_D_SIZE) - """ - # load key cache - num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) - for load_idx in nl.affine_range(num_loads): - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] - loaded = nl.load(kv_cache[0, block_tables[load_idx, i_p, - large_k_tile_idx], i_f]) - if cur_k_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_k_tile.dtype) - # Transpose SBUF tensor using PE - for tb_i in nl.affine_range(tiled_block_size): - cur_k_tile[ - :, - nl.ds( - load_idx * B_P_SIZE * tiled_block_size + tb_i * B_P_SIZE, - B_P_SIZE, - ), - ] = nl.transpose(loaded[:, nl.ds(tb_i * B_D_SIZE, B_D_SIZE)]) - - # load value cache - for load_idx in nl.affine_range(num_loads): - loaded = nl.load(kv_cache[1, block_tables[load_idx, i_p, - large_k_tile_idx], i_f]) - if cur_v_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_v_tile.dtype) - i_p = nl.arange(B_P_SIZE)[:, None] - i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] - cur_v_tile[ - :, - nl.ds( - load_idx * tiled_block_size * B_D_SIZE, - tiled_block_size * B_D_SIZE, - ), - ] = loaded - - -@nki.jit -def transpose_p_local(p_local_transposed, - p_local, - LARGE_TILE_SZ, - B_F_SIZE=512): - for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): - if nisa.get_nc_version() == nisa.nc_version.gen3: - p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), - buffer=nl.sbuf, - dtype=p_local.dtype) - else: - p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), - buffer=nl.psum, - dtype=np.float32) - - for j in nl.affine_range(B_F_SIZE // 128): - j_128_slice = nl.ds(j * 128, 128) - i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) - - if nisa.get_nc_version() == nisa.nc_version.gen3: - p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( - p_local[:, i_j_128_slice]) - else: - p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( - p_local[:, i_j_128_slice]) - - p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( - p_local_t_tmp, dtype=p_local_transposed.dtype) - - -@nki.jit -def _flash_attention_core( - q_local_tile, - k, - v, - o_buffer, - l_buffer, - m_buffer, - kernel_dtype, - acc_type, - tile_mask, - use_causal_mask, - q_tile_idx=None, - initialize=False, - LARGE_TILE_SZ=2048, - B_P_SIZE=128, - B_F_SIZE=512, - B_D_SIZE=128, - qk_res_buffer=None, -): - """ - The flash attention core function to calculate self attention between a tile - of q and a block of K and V. - The q_local_tile has (B_P_SIZE, B_D_SIZE) - The K and V have shape (B_D_SIZE, LARGE_TILE_SZ), whose free dimension will - be split into size B_F_SIZE tiles - - The results are stored in the following three buffers - o_buffer: (B_P_SIZE, d) - l_buffer: (B_P_SIZE, 1) - m_buffer: (B_P_SIZE, 1) - - All IO buffers are in SBUF. 
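# Editor's note: the o/l/m buffers above implement the standard online-softmax
# (flash attention) recurrence. Below is a minimal NumPy sketch of one
# accumulation step, under assumed shapes o:(Br, d), l:(Br,), m:(Br,),
# qk_tile:(Br, Bc), v_tile:(Bc, d); the names are illustrative only and are
# not part of the NKI API.
import numpy as np

def online_softmax_step(o, l, m, qk_tile, v_tile):
    # new running row max over this K/V tile
    m_new = np.maximum(m, qk_tile.max(axis=-1))
    # softmax numerator for the tile, stabilized by the running max
    p = np.exp(qk_tile - m_new[:, None])
    # rescale the previously accumulated output, then accumulate this tile
    o_new = o * np.exp(m - m_new)[:, None] + p @ v_tile
    # running log-sum-exp, matching l_buffer = m + log(sum(exp(qk - m)))
    l_new = m_new + np.log(np.exp(l - m_new) + p.sum(axis=-1))
    return o_new, l_new, m_new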
- """ - num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE - - qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - buffer=nl.sbuf, - dtype=acc_type) - max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), - dtype=acc_type) - for k_i in nl.affine_range(num_k_tile_per_large_tile): - k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) - - if use_causal_mask: - # mask are used to only apply computation to the lower half of the - # matrix, which reduce the arithmetic intensity by up to 50% - multiplication_required_selection = (q_tile_idx * B_P_SIZE - >= k_i * B_F_SIZE) - else: - multiplication_required_selection = True - - if multiplication_required_selection: - qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), - dtype=np.float32, - buffer=nl.psum) # (128, 512) - qk_psum[:, :] = nl.matmul(q_local_tile, - k[:, k_i_b_f_slice], - transpose_x=True) # (p(128), 512) - qk_res_buf[:, k_i_b_f_slice] = nl.where( - tile_mask[:, k_i_b_f_slice], - qk_psum[:, nl.ds(0, B_F_SIZE)], - -9984.0, - dtype=acc_type, - ) - else: - qk_res_buf[:, k_i_b_f_slice] = -9984.0 - - # Calculate max of the current tile - max_local[:, k_i] = nisa.tensor_reduce( - np.max, - qk_res_buf[:, k_i_b_f_slice], - axis=(1, ), - dtype=acc_type, - negate=False, - ) - - if qk_res_buffer is not None: - qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) - - max_ = nisa.tensor_reduce( - np.max, - max_local[:, :], - axis=(1, ), - dtype=acc_type, - negate=False, - ) - - o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), - dtype=o_buffer.dtype) - - if initialize: - m_buffer[:, 0] = nl.copy(max_) - m_current = max_ - else: - m_previous = nl.copy(m_buffer[:, 0]) - m_buffer[:, 0] = nl.maximum(m_previous, max_) # (128,1) - - m_current = m_buffer[:, 0] - # Compute scaling factor - alpha = nisa.activation( - np.exp, - m_previous, - bias=-1 * m_current, - scale=1.0, - ) - o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha) - - p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) - - p_partial_sum = nl.ndarray( - (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), - dtype=acc_type, - ) - - for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): - k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) - - # compute exp(qk - max) - # Compute partial row - tile sum of exp(qk - max)) - # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
- p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( - np.exp, - qk_res_buf[:, k_r_i_reduce_slice], - bias=-1 * m_current, - scale=1.0, - reduce_op=nl.add, - reduce_res=p_partial_sum[:, k_r_i], - dtype=kernel_dtype, - ) - - ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type) - - p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - transpose_p_local( - p_local_transposed=p_local_transposed, - p_local=p_local, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_F_SIZE=B_F_SIZE, - ) - - pv_psum = nl.zeros( - (par_dim(B_P_SIZE), B_D_SIZE), - dtype=np.float32, - buffer=nl.psum, - ) - for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): - pv_psum[:, :] += nl.matmul( - p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], - v[:, nl.ds(k_i * B_D_SIZE, B_D_SIZE)], - transpose_x=True, - ) # (128, 128) (p(Br), d) - - if initialize: - o_buffer[:, :] = nl.copy(pv_psum[:, :]) - l_buffer[:, 0] = nl.add(nl.log(ps), max_) - else: - o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum) - - l_prev = l_buffer[:, 0] - l_exp = nl.add( - nl.exp(nl.subtract(l_prev, m_current)), - ps, - ) - l_buffer[:, 0] = nl.add(m_current, nl.log(l_exp)) - - -@nki.jit -def load_v_tile(v_hbm_tile, cur_v_tile, large_tile_idx, v_i, LARGE_TILE_SZ): - B_P_SIZE = 128 - B_D_SIZE = v_hbm_tile.shape[-1] - loaded = nl.load(v_hbm_tile[ - nl.ds(large_tile_idx * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), - :, - ]) - if cur_v_tile.dtype != loaded.dtype: - loaded = nl.copy(loaded, dtype=cur_v_tile.dtype) - cur_v_tile[:, nl.ds(v_i * B_D_SIZE, B_D_SIZE)] = loaded - - -@nki.jit -def flash_paged_attention( - query, - key, - value, - kv_cache, - block_tables, - mask, - softmax_scale=None, - mixed_precision=True, - LARGE_TILE_SZ=2048, - return_debug_tensors=False, -): - """ - Flash PagedAttention Forward Kernel. - - IO tensor layouts: - - query: shape (1, n_heads, d, seq_q) - - key: shape (1, n_kv_heads, d, seq_k) - - value: shape (1, n_kv_heads, seq_v, d) - - kv_cache: (2, num_blocks, n_kv_heads, block_size, d) - - block_tables: (num_active_blocks, ) - - mask: (seq_q, num_active_blocks * block_size + seq_q) - - o: shape (1, n_heads, seq_q, d) - - - This kernel requires seq_k == seq_v - - We use continuous batching by default, so the batch dimension is - always 1, and different requests are concatenated along sequence - dimension. - - We use paged cache blocks (kv_cache) to store KV cache. - - IO tensor dtypes: - - This kernel assumes all IO tensors have the same dtype except for - block_tables (int32) and mask (int32) - - If mixed_precision is True, then all Tensor Engine operation will be - performed in bfloat16 and accumulation will be performed in float32. - Otherwise the intermediates will be in the same type as the inputs. 
-
-    Compile-time Constants:
-      - softmax_scale: scaling for softmax; if None, defaults to
-        `1.0/(d**0.5)`
-      - mixed_precision: flag to set non-matmul ops in fp32 precision;
-        defaults to `true`. If false, we use the same precision as the
-        input types
-      - LARGE_TILE_SZ: `default=2048`, size of the kv tile used for the
-        attention computation reduction
-
-    GQA support Notes:
-      the spmd grid used to launch the kernel should be over kv_heads
-      instead of nheads
-
-    Example usage:
-      MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d]
-        usage: `flash_fwd[b, h](q, k, v, ...)`
-      GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d]
-        usage: `flash_fwd[b, kv_h](q, k, v, ...)`
-    """
-    B_F_SIZE = 512
-    B_P_SIZE = 128
-    b, h, d, seqlen_q = query.shape
-    B_D_SIZE = d
-    n_tile_q = seqlen_q // B_P_SIZE  # since q will be loaded on tensor engine
-    _, num_blocks, k_h, block_size, _ = kv_cache.shape
-    q_h_per_k_h = h // k_h
-    assert b == 1, f"invalid batch size {b=}"
-    assert d <= 128, f"we do not support head_dim > 128, got head dim {d=}"
-    cache_shape = (2, num_blocks, k_h, block_size, d)
-    assert (tuple(kv_cache.shape) == cache_shape
-            ), f"{kv_cache.shape=} mismatch, expect {cache_shape}"
-    assert key is None or tuple(key.shape) == (
-        1,
-        k_h,
-        d,
-        seqlen_q,
-    ), f"key shape {key.shape} mismatch!"
-    assert value is None or tuple(value.shape) == (
-        1,
-        k_h,
-        seqlen_q,
-        d,
-    ), f"value shape {value.shape} mismatch!"
-
-    assert (
-        nl.program_ndim() == 2
-    ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!"
-    batch_id = nl.program_id(axis=0)
-    head_id = nl.program_id(axis=1)
-
-    (num_active_blocks, ) = block_tables.shape
-    context_kv_len = num_active_blocks * block_size
-    assert (
-        LARGE_TILE_SZ % B_F_SIZE == 0
-    ), f"Need {LARGE_TILE_SZ=} to be divisible by {B_F_SIZE=} in transpose_p"
-    assert (context_kv_len % LARGE_TILE_SZ == 0
-            ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}"
-
-    num_blocks_per_large_tile = LARGE_TILE_SZ // block_size
-    assert is_power_of_2(
-        num_blocks_per_large_tile
-    ), f"{num_blocks_per_large_tile=} is expected to be a power of 2"
-    if seqlen_q > B_F_SIZE:
-        MAX_REDUCTION_TILE = 2048
-        if seqlen_q // 2 > MAX_REDUCTION_TILE:
-            assert (
-                seqlen_q % MAX_REDUCTION_TILE == 0
-            ), f"{seqlen_q=} should be divisible by {MAX_REDUCTION_TILE=}"
-        else:
-            assert (seqlen_q % B_F_SIZE == 0
-                    ), f"{seqlen_q=} should be divisible by {B_F_SIZE=}"
-
-    kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype
-    acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype
-    softmax_scale = softmax_scale or (1.0 / (d**0.5))
-    num_large_k_tile = context_kv_len // LARGE_TILE_SZ
-
-    o = nl.ndarray((b, h, seqlen_q, d),
-                   dtype=query.dtype,
-                   buffer=nl.shared_hbm)
-    hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = (
-        None,
-        None,
-        None,
-        None,
-    )
-    if return_debug_tensors:
-        hbm_l_buffer = nl.ndarray((b, h, seqlen_q),
-                                  dtype=acc_type,
-                                  buffer=nl.shared_hbm)
-        hbm_m_buffer = nl.ndarray((b, h, seqlen_q),
-                                  dtype=acc_type,
-                                  buffer=nl.shared_hbm)
-        hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q),
-                                dtype=acc_type,
-                                buffer=nl.shared_hbm)
-        qk_res_buffer = nl.zeros(
-            (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q),
-            dtype=acc_type,
-            buffer=nl.sbuf,
-            lazy_initialization=True,
-        )
-    block_tables_sbuf = load_block_tables(
-        block_tables_hbm=block_tables,
-        num_tiles=num_large_k_tile,
-        num_blocks_per_tile=num_blocks_per_large_tile,
-    )
-
-    # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
-    if
num_blocks_per_large_tile < B_P_SIZE: - # we checked num_blocks_per_tile is a power of 2 - assert B_P_SIZE % num_blocks_per_large_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_large_tile - # We assume block_size >= block_size_tiling_factor - assert block_size % block_size_tiling_factor == 0 - else: - block_size_tiling_factor = 1 - tiled_block_size = block_size // block_size_tiling_factor - - # Indirect DMA load must be placed along Partition Dimension - block_tables_sbuf = transform_block_tables_for_indirect_load( - block_tables_sbuf, - block_size_tiling_factor=block_size_tiling_factor, - num_head=k_h, - head_id=head_id, - ) - - # Flatten KV cache to be 3D for loading into SBUF - new_cache_shape = ( - 2, - num_blocks * k_h * block_size_tiling_factor, - tiled_block_size * d, - ) - kv_cache = kv_cache.reshape(new_cache_shape) - - # Global Flash Attention accumulators - o_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - l_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - m_buffer = nl.zeros( - (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), - dtype=acc_type, - buffer=nl.sbuf, - lazy_initialization=True, - ) - - for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile): - num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) - cur_k_tile = nl.ndarray( - (par_dim(B_D_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype, - ) - cur_v_tile = nl.ndarray( - (par_dim(B_P_SIZE), num_loads * tiled_block_size * B_D_SIZE), - dtype=kernel_dtype, - ) - load_kv_tile_from_cache( - cur_k_tile=cur_k_tile, - cur_v_tile=cur_v_tile, - kv_cache=kv_cache, - block_tables=block_tables_sbuf, - large_k_tile_idx=large_k_tile_idx, - num_blocks_per_large_tile=num_blocks_per_large_tile, - tiled_block_size=tiled_block_size, - B_P_SIZE=B_P_SIZE, - B_D_SIZE=B_D_SIZE, - ) - - for i in nl.affine_range(n_tile_q): - cur_mask = nl.load(mask[ - nl.ds(i * B_P_SIZE, B_P_SIZE), - nl.ds(large_k_tile_idx * LARGE_TILE_SZ, LARGE_TILE_SZ), - ]) - for i_q_h in nl.affine_range(q_h_per_k_h): - q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) - q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] - q_sbuf_tile = nl.load(q_hbm_tile[:, - nl.ds(i * - B_P_SIZE, B_P_SIZE)]) - if q_sbuf_tile.dtype != kernel_dtype: - q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype) - q_tile[:, :] = q_sbuf_tile * softmax_scale - - _flash_attention_core( - q_local_tile=q_tile, - k=cur_k_tile, - v=cur_v_tile, - o_buffer=o_buffer[i, i_q_h], - l_buffer=l_buffer[i, i_q_h], - m_buffer=m_buffer[i, i_q_h], - kernel_dtype=kernel_dtype, - acc_type=acc_type, - tile_mask=cur_mask, - use_causal_mask=False, - q_tile_idx=i, - initialize=large_k_tile_idx == 0, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_P_SIZE=B_P_SIZE, - B_F_SIZE=B_F_SIZE, - B_D_SIZE=B_D_SIZE, - ) - - # compute attention between input query, key and value - if key is not None and value is not None: - B_F_SIZE = min(seqlen_q, B_F_SIZE) - LARGE_TILE_SZ = seqlen_q - - cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), - dtype=kernel_dtype) - cur_v_tile = nl.ndarray( - (par_dim(B_P_SIZE), LARGE_TILE_SZ // B_P_SIZE * B_D_SIZE), - dtype=kernel_dtype, - ) - - loaded = nl.load(key[batch_id, head_id, :, :]) - if loaded.dtype != kernel_dtype: - loaded = nl.copy(loaded, dtype=kernel_dtype) - cur_k_tile[:, :] = loaded - - v_hbm_tile = value[batch_id, head_id] - for v_i in nl.affine_range(LARGE_TILE_SZ // 
B_P_SIZE): - load_v_tile( - v_hbm_tile=v_hbm_tile, - cur_v_tile=cur_v_tile, - large_tile_idx=0, - v_i=v_i, - LARGE_TILE_SZ=LARGE_TILE_SZ, - ) - - for i in nl.affine_range(n_tile_q): - cur_mask = nl.load(mask[ - nl.ds(i * B_P_SIZE, B_P_SIZE), - nl.ds(context_kv_len, LARGE_TILE_SZ), - ]) - for i_q_h in nl.affine_range(q_h_per_k_h): - - q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) - q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] - q_sbuf_tile = nl.load(q_hbm_tile[:, - nl.ds(i * - B_P_SIZE, B_P_SIZE)]) - if q_sbuf_tile.dtype != kernel_dtype: - q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype) - q_tile[:, :] = q_sbuf_tile * softmax_scale - _flash_attention_core( - q_local_tile=q_tile, - k=cur_k_tile, - v=cur_v_tile, - o_buffer=o_buffer[i, i_q_h], - l_buffer=l_buffer[i, i_q_h], - m_buffer=m_buffer[i, i_q_h], - kernel_dtype=kernel_dtype, - acc_type=acc_type, - tile_mask=cur_mask, - use_causal_mask=True, - q_tile_idx=i, - initialize=False, - LARGE_TILE_SZ=LARGE_TILE_SZ, - B_P_SIZE=B_P_SIZE, - B_F_SIZE=B_F_SIZE, - B_D_SIZE=B_D_SIZE, - qk_res_buffer=(qk_res_buffer[i, i_q_h] - if qk_res_buffer is not None else None), - ) - - # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # - for i_q_h in nl.affine_range(q_h_per_k_h): - for i in nl.affine_range(n_tile_q): - out = nl.multiply( - o_buffer[i, i_q_h], - nl.exp(m_buffer[i, i_q_h] - l_buffer[i, i_q_h]), - dtype=kernel_dtype, - ) - - nl.store( - o[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - :, - ], - out, - ) - # maximum and summation statistics - if return_debug_tensors: - nl.store( - hbm_m_buffer[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - ], - m_buffer[i, i_q_h, :, :], - ) - nl.store( - hbm_l_buffer[ - batch_id, - head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), - ], - l_buffer[i, i_q_h], - ) - nl.store( - hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], - qk_res_buffer[batch_id, i_q_h, :, :], - ) - - if return_debug_tensors: - return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res - return o - - -def reorder_context_mask(mask, LARGE_TILE_SZ, block_size): - """ - Reorder the mask to make it compatible with the flash attention kernel. - - We vectorize KV cache read to improve DMA utilization. However, the layout - that maximizes DMA bandwidth changes the order tokens are consumed. - - The token layout (inner 2 dimensions) after vectorized load is (B_P_SIZE, - tiled_block_size) in a tile of `B_P_SIZE * tiled_block_size` tokens. And - each step the engine consumes a column (rather than a row) of B_P_SIZE - tokens. Therefore, the tokens are visited in a strided way. - - To make sure mask matches the order tokens are consumed, we need to properly - transpose mask. 
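# Editor's note: a scaled-down, self-contained sketch of the reordering the
# function below performs, using stand-in sizes (B_P_SIZE=4 instead of 128,
# one large tile of 8 context tokens, block_size=2); the helper name and the
# constants are illustrative only.
import torch

def reorder_demo():
    total_query_len, context_kv_len = 2, 8
    B_P_SIZE, LARGE_TILE_SZ, block_size = 4, 8, 2
    num_tiled_blocks = max(B_P_SIZE, LARGE_TILE_SZ // block_size)  # 4
    tiled_block_size = LARGE_TILE_SZ // num_tiled_blocks  # 2, so reorder
    # token indices 0..7 stand in for the mask columns of one large tile
    mask = torch.arange(context_kv_len).repeat(total_query_len, 1)
    m = mask.view(total_query_len, context_kv_len // LARGE_TILE_SZ,
                  num_tiled_blocks // B_P_SIZE, B_P_SIZE, tiled_block_size)
    out = m.transpose(3, 4).reshape(total_query_len, context_kv_len)
    # each row becomes [0, 2, 4, 6, 1, 3, 5, 7]: the engine consumes a
    # column of B_P_SIZE tokens per step, hence the strided order
    return out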
- """ - total_query_len, total_seq_len = mask.shape - context_kv_len = total_seq_len - total_query_len - - B_P_SIZE = 128 - assert (LARGE_TILE_SZ - >= B_P_SIZE), f"{LARGE_TILE_SZ=} must be larger than {B_P_SIZE=}" - num_tiled_blocks = max(B_P_SIZE, LARGE_TILE_SZ // block_size) - tiled_block_size = LARGE_TILE_SZ // num_tiled_blocks - if tiled_block_size > 1: - # Mask reordering is needed when tiled_block_size > 1 - device = mask.device - mask = mask.cpu() - context_mask = mask[:, :context_kv_len] - context_mask = context_mask.view( - total_query_len, - context_kv_len // LARGE_TILE_SZ, - num_tiled_blocks // B_P_SIZE, - B_P_SIZE, - tiled_block_size, - ) - context_mask = context_mask.transpose(3, 4).reshape( - total_query_len, context_kv_len) - new_mask = mask[:, context_kv_len:] - return torch.concat([context_mask, new_mask], dim=1).to(device) - else: - return mask - - -def flash_attn_varlen_nkifunc( - query, - key, - value, - kv_cache, - block_table, - attn_mask, - n_kv_head=None, - head_size=None, - LARGE_TILE_SZ=2048, - mixed_precision=True, -): - """ - Compute flash paged attention for variable length sequences. - - This function is a wrapper around the flash attention NKI kernel. It takes - in the following arguments: - - query: (1, n_heads, d, seq_q) - - key: (1, n_kv_heads, d, seq_k) - - value: (1, n_kv_heads, seq_v, d) - - kv_cache: (2, n_blocks, n_kv_heads, block_size, d) - - block_tables: (n_active_blocks, ) - - attn_mask: (seq_q, n_active_blocks * block_size + seq_q) - - Notes: - - attn_mask must be reordered outside using `reorder_context_mask` - - Key/value cache layout must be (n_blocks, n_kv_heads, block_size, d) - for better DMA throughput - """ - if n_kv_head is None: - n_kv_head = kv_cache.shape[2] - assert kv_cache.shape[0] == 2 - assert kv_cache.shape[2] == n_kv_head - if head_size is None: - head_size = kv_cache.shape[-1] - - kwargs = dict( - query=query, - key=key, - value=value, - kv_cache=kv_cache, - block_tables=block_table, - mask=attn_mask, - softmax_scale=1.0 / (head_size**0.5), - mixed_precision=mixed_precision, - LARGE_TILE_SZ=LARGE_TILE_SZ, - ) - - o = flash_paged_attention[1, n_kv_head](**kwargs) - return o - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, -) -> None: - """ - Writes key-value pairs to the KV cache at specified positions. 
- - Args: - key (torch.Tensor): Key tensor with shape - (num_tokens, n_kv_head, d_head) - value (torch.Tensor): Value tensor with shape - (num_tokens, n_kv_head, d_head) - kv_cache (torch.Tensor): Key/value cache tensor with shape - (2, num_blocks, n_kv_head, block_size, d_head) - slot_mapping (torch.Tensor): Mapping tensor indicating cache positions - with shape (num_tokens) - - Returns: - None: Updates the kv_cache tensor in-place - """ - block_size = kv_cache.size(3) - n_kv_head = key.size(1) - - # Calculate indices with explicit floor division - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_offsets = slot_mapping % block_size - - # Create the head indices tensor - head_indices = torch.arange(n_kv_head, device=key.device) - - # Update caches using index_put_ - kv_cache.index_put_( - (torch.tensor([0], device=key.device), block_indices[:, None], - head_indices[None, :], block_offsets[:, None]), key) - - kv_cache.index_put_( - (torch.tensor([1], device=key.device), block_indices[:, None], - head_indices[None, :], block_offsets[:, None]), value) diff --git a/vllm/collect_env.py b/vllm/collect_env.py index ee43ad12e8a5e..5b3883a482f25 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -54,7 +54,6 @@ SystemEnv = namedtuple( 'is_xnnpack_available', 'cpu_info', 'rocm_version', # vllm specific field - 'neuron_sdk_version', # vllm specific field 'vllm_version', # vllm specific field 'vllm_build_flags', # vllm specific field 'gpu_topo', # vllm specific field @@ -275,15 +274,6 @@ def get_rocm_version(run_lambda): r'HIP version: (\S+)') -def get_neuron_sdk_version(run_lambda): - # Adapted from your install script - try: - result = run_lambda(["neuron-ls"]) - return result if result[0] == 0 else 'N/A' - except Exception: - return 'N/A' - - def get_vllm_version(): from vllm import __version__, __version_tuple__ @@ -306,10 +296,9 @@ def get_vllm_version(): def summarize_vllm_build_flags(): # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. - return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + return 'CUDA Archs: {}; ROCm: {}'.format( os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', - 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', ) @@ -601,7 +590,6 @@ def get_env_info(): conda_packages = get_conda_packages(run_lambda) rocm_version = get_rocm_version(run_lambda) - neuron_sdk_version = get_neuron_sdk_version(run_lambda) vllm_version = get_vllm_version() vllm_build_flags = summarize_vllm_build_flags() gpu_topo = get_gpu_topo(run_lambda) @@ -635,7 +623,6 @@ def get_env_info(): is_xnnpack_available=is_xnnpack_available(), cpu_info=get_cpu_info(run_lambda), rocm_version=rocm_version, - neuron_sdk_version=neuron_sdk_version, vllm_version=vllm_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, @@ -702,7 +689,6 @@ env_info_fmt += """ vLLM Info ============================== ROCM Version : {rocm_version} -Neuron SDK Version : {neuron_sdk_version} vLLM Version : {vllm_version} vLLM Build Flags: {vllm_build_flags} diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 41322f4f2a25d..063af69f41dad 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -461,11 +461,6 @@ class ModelConfig: DP (which is controlled by `--data-parallel-size`). 
This is only supported on a per-model basis and falls back to `"weights"` if the encoder does not support DP.""" - override_neuron_config: dict[str, Any] = field(default_factory=dict) - """Initialize non-default neuron config or override default neuron config - that are specific to Neuron devices, this argument will be used to - configure the neuron config that can not be gathered from the vllm - arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" @@ -785,10 +780,6 @@ class ModelConfig: if not self.skip_tokenizer_init: self._verify_tokenizer_mode() - if (not current_platform.is_neuron() and self.override_neuron_config): - raise ValueError( - "`override_neuron_config` is only supported on Neuron.") - # Avoid running try_verify_and_update_config multiple times self.config_updated = False @@ -1696,13 +1687,7 @@ class ModelConfig: """ For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to True to enable cross-attention - Neuron needs all multimodal data to be in the decoder and does not - need to explicitly enable cross-attention """ - if (current_platform.is_neuron() - and self.hf_config.model_type == "mllama"): - return False - return is_encoder_decoder(self.hf_config) @property @@ -1871,7 +1856,7 @@ class LoadConfig: self.ignore_patterns = ["original/**/*"] -Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] +Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @config @@ -1927,9 +1912,7 @@ class DeviceConfig: self.device_type = self.device.type # Some device types require processing inputs on CPU - if self.device_type in ["neuron"]: - self.device = torch.device("cpu") - elif self.device_type in ["tpu"]: + if self.device_type in ["tpu"]: self.device = None else: # Set device with device type @@ -3941,7 +3924,6 @@ class VllmConfig: f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " f"tokenizer_mode={self.model_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa f"tokenizer_revision={self.model_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 6f8f962fe7cad..5cc630b72846d 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -33,9 +33,8 @@ class CacheConfig: """Configuration for the KV cache.""" block_size: SkipValidation[BlockSize] = None # type: ignore - """Size of a contiguous cache block in number of tokens. This is ignored on - neuron devices and set to `--max-model-len`. On CUDA devices, only block - sizes up to 32 are supported. On HPU devices, block size defaults to 128. + """Size of a contiguous cache block in number of tokens. On CUDA devices, + only block sizes up to 32 are supported. This config has no static default. 
If left unspecified by the user, it will be set in `Platform.check_and_update_config()` based on the current diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index fb8e30996ea33..3a74b5fb7e64f 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -377,10 +377,7 @@ class ParallelConfig: from vllm.executor import ray_utils backend: DistributedExecutorBackend = "mp" ray_found = ray_utils.ray_is_available() - if current_platform.is_neuron(): - # neuron uses single process to control multiple devices - backend = "uni" - elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: backend = "uni" elif (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py deleted file mode 100644 index 5b61a1687a016..0000000000000 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import torch - -from vllm.distributed.device_communicators.base_device_communicator import ( - DeviceCommunicatorBase) -from vllm.platforms import current_platform - -if current_platform.is_neuron(): - import torch_xla.core.xla_model as xm - - -class NeuronCommunicator(DeviceCommunicatorBase): - - def all_reduce(self, x: torch.Tensor) -> torch.Tensor: - return xm.all_reduce(xm.REDUCE_SUM, x) - - def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: - assert dim == -1, "Neuron only supports dim=-1 for all-gather." - return xm.all_gather(x, dim=dim) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d96654ecfa466..fdd25a2f9ce2f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -419,8 +419,6 @@ class EngineArgs: scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls - override_neuron_config: dict[str, Any] = \ - get_field(ModelConfig, "override_neuron_config") override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config compilation_config: CompilationConfig = \ @@ -561,8 +559,6 @@ class EngineArgs: help=model_kwargs["hf_token"]["help"]) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) - model_group.add_argument("--override-neuron-config", - **model_kwargs["override_neuron_config"]) model_group.add_argument("--override-pooler-config", **model_kwargs["override_pooler_config"]) model_group.add_argument("--logits-processor-pattern", @@ -992,7 +988,6 @@ class EngineArgs: mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, mm_encoder_tp_mode=self.mm_encoder_tp_mode, - override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, diff --git a/vllm/envs.py b/vllm/envs.py index 5e1585f3c3f06..50783eeb95a42 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -236,7 +236,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), - # rocm, neuron, cpu] + # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), diff --git 
a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 6b5a107396c92..e7eb8247d5efd 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -73,11 +73,6 @@ class CustomOp(nn.Module): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_neuron(self, *args, **kwargs): - # By default, we assume that Neuron ops are compatible with the - # PyTorch-native implementation. - return self.forward_native(*args, **kwargs) - def forward_oot(self, *args, **kwargs): # By default, we assume that OOT ops are compatible with the # PyTorch-native implementation. @@ -105,8 +100,6 @@ class CustomOp(nn.Module): return self.forward_tpu elif current_platform.is_xpu(): return self.forward_xpu - elif current_platform.is_neuron(): - return self.forward_neuron elif current_platform.is_out_of_tree(): return self.forward_oot else: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index fac37ef75b638..319fa938d400e 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -95,13 +95,6 @@ class SiluAndMul(CustomOp): self.op(out, x) return out - def forward_neuron(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - x_reshaped = x.view(-1, x.shape[-1]) - s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d]) - result = s * x_reshaped[:, d:] - return result.view(*x.shape[:-1], d) - @CustomOp.register("mul_and_silu") class MulAndSilu(CustomOp): diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index d73fcf368f261..8cac47b5a39a3 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,7 +26,6 @@ QuantizationMethods = Literal[ "bitsandbytes", "hqq", "experts_int8", - "neuron_quant", "ipex", "quark", "moe_wna16", @@ -108,7 +107,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config from .mxfp4 import Mxfp4Config - from .neuron_quant import NeuronQuantConfig from .petit import PetitNvFp4Config from .ptpc_fp8 import PTPCFp8Config from .rtn import RTNConfig @@ -135,7 +133,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "ptpc_fp8": PTPCFp8Config, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, - "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, "quark": QuarkConfig, "moe_wna16": MoeWNA16Config, diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py deleted file mode 100644 index 8040236663dd1..0000000000000 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from importlib.util import find_spec -from typing import Any, Optional - -from torch.nn import Module - -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - -SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] - - -class AlwaysSupportedDtypes(list): - - def __contains__(self, item): - return True - - -class NeuronQuantConfig(QuantizationConfig): - """Int8 Quantization Config class for Neuron Backend.""" - - def 
__init__( - self, - dequant_dtype: str = "f16", - quantize_method: str = "vector_dynamic", - ) -> None: - super().__init__() - self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8") - if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST: - raise ValueError( - f"Neuron quantization datatype {self.quant_dtype} is not valid," - f" the quantization datatype should match one of the below " - f"types {SUPPORTED_QUANT_DTYPE_LIST}") - self.dequant_dtype = dequant_dtype - self.quantize_method = quantize_method - - def get_name(self) -> QuantizationMethods: - return "neuron_quant" - - def get_supported_act_dtypes(self) -> list[str]: - # Neuron implements custom handling logic for quantization support - return AlwaysSupportedDtypes() - - @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError( - "This function should not be called with Neuron Backend") - - @staticmethod - def get_config_filenames() -> list[str]: - return [] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig": - quantize_method = cls.get_from_keys(config, ["quantize_method"]) - dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) - return cls(dequant_dtype=dequant_dtype, - quantize_method=quantize_method) - - def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: - if find_spec("transformers_neuronx") is not None: - return self.get_quantization_config() - else: - raise NotImplementedError( - "Neuron Quantization is only supported through" - " transformers_neuronx.") - - def get_quantization_config(self): - from transformers_neuronx.config import QuantizationConfig - return QuantizationConfig(quant_dtype=self.quant_dtype, - dequant_dtype=self.dequant_dtype, - quantize_method=self.quantize_method) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 10fce857a8ae2..be25e90abf821 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -7,7 +7,7 @@ import torch from vllm.model_executor.custom_op import CustomOp -from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch +from .common import apply_rotary_emb_torch @CustomOp.register("rotary_embedding") @@ -149,87 +149,6 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache, self.is_neox_style) return query, key - def forward_neuron( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - - def _apply_rotary_emb_neuron( - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, - ) -> torch.Tensor: - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - # x1 = x[..., ::2] - - # x2 = x[..., 1::2] - d = x.shape[-1] // 2 - x_reshaped = x.view(-1, x.shape[-1]) - x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d) - x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d) - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - if offsets is not None: - positions = positions + offsets - - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - - positions = positions.flatten() - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions) - cos, sin = 
cos_sin.chunk(2, dim=-1) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - if key is not None: - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - - if self.rotary_dim == self.head_size: - query = apply_rotary_emb_dispatch(query, cos, sin, - self.is_neox_style) - query = query.reshape(query_shape) - if key is not None: - key = apply_rotary_emb_dispatch(key, cos, sin, - self.is_neox_style) - key = key.reshape(key_shape) - else: - head_size = query.shape[-1] - query_reshaped = query.view(-1, head_size) - query_pass = query_reshaped[:, self.rotary_dim:].view( - *query.shape[:-1], head_size - self.rotary_dim) - query_rot = query_reshaped[:, :self.rotary_dim].view( - *query.shape[:-1], self.rotary_dim) - query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin, - self.is_neox_style) - query = torch.cat((query_rot, query_pass), - dim=-1).reshape(query_shape) - - if key is not None: - key_reshaped = key.view(-1, head_size) - key_pass = key_reshaped[:, self.rotary_dim:].view( - *key.shape[:-1], head_size - self.rotary_dim) - key_rot = key_reshaped[:, :self.rotary_dim].view( - *key.shape[:-1], self.rotary_dim) - key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin, - self.is_neox_style) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py deleted file mode 100644 index ee484e9a7b0a4..0000000000000 --- a/vllm/model_executor/model_loader/neuron.py +++ /dev/null @@ -1,476 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for selecting and loading Neuron models in transformers-neuronx -framework.""" -import ast -import copy -import importlib -import os -from typing import Optional - -import torch -import torch.nn as nn -from transformers import PretrainedConfig - -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig) -from vllm.logprobs import Logprob -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import get_quantization_config -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput - -TORCH_DTYPE_TO_NEURON_AMP = { - "auto": "f32", - "half": "f16", - "float16": "f16", - "bfloat16": "bf16", - "float": "f32", - "float32": "f32", - torch.float16: "f16", - torch.bfloat16: "bf16", - torch.float32: "f32", -} - -# Models supported by Neuron. 
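# Editor's note on the mapping below: each entry is keyed by the HF
# architecture name and holds a tuple of (transformers_neuronx module path,
# NeuronX sampling class name, HF model class name);
# _get_model_architecture() matches a config's architectures against these
# keys, and load_weights() imports the module and class dynamically.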
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { - "LlamaForCausalLM": ("transformers_neuronx.llama.model", - "LlamaForSampling", "LlamaForCausalLM"), - "MistralForCausalLM": ("transformers_neuronx.mistral.model", - "MistralForSampling", "MistralForCausalLM") -} - - -class NeuronCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - on_device_sampling_disabled: bool = False) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - - self.on_device_sampling_disabled = on_device_sampling_disabled - if self.on_device_sampling_disabled: - # Use default sampler - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - ) -> torch.Tensor: - logits = self.model(input_ids, - cache_ids=positions, - start_ids=input_block_ids) - return logits - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - - if self.on_device_sampling_disabled: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - # On-device sampling outputs the token ids directly. - sampled_token_ids = logits.flatten() - next_tokens = [] - sample_idx = 0 - for seq_group in sampling_metadata.seq_groups: - samples = [] - for seq_id in seq_group.seq_ids: - token_id = sampled_token_ids[sample_idx].item() - samples.append( - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)})) - sample_idx += 1 - next_tokens.append( - CompletionSequenceGroupOutput(samples=samples, - prompt_logprobs=None)) - - return SamplerOutput(outputs=next_tokens) - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - - self.model = neuronx_model_cls.from_pretrained(model_name_or_path, - **kwargs) - self.model.to_neuron() - - -class NeuronSpeculationCausalLM(nn.Module): - """A Neuron-optimized causal language model with speculative decoding.""" - - SPECULATION_TERMINATION_ID = -1 - - def __init__(self, speculation_model) -> None: - super().__init__() - self.model = speculation_model - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - ) -> torch.Tensor: - tokens, counts = self.model.speculative_iteration( - input_ids, positions, input_block_ids) - - # Mark the end of accepted speculative tokens for each sequence with the - # speculation termination id. - batch_size, steps = tokens.shape - mask = torch.arange(steps).expand(batch_size, -1) >= counts - tokens[mask] = self.SPECULATION_TERMINATION_ID - - return tokens - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - batch_size, num_steps = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - # Organize input tensors by step instead of by sequence. 
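# Editor's note, an illustration of the transpose below: with batch_size=2
# and num_steps=3, sequence-major rows [[a0, a1, a2], [b0, b1, b2]] become
# step-major rows [[a0, b0], [a1, b1], [a2, b2]], so that each step can be
# emitted as one SamplerOutput covering all sequences.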
- accepted_token_ids_by_step = logits.transpose(0, 1) - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - sampler_output_list = [] - for step_index in range(num_steps): - if all(token_id == self.SPECULATION_TERMINATION_ID - for token_id in accepted_token_ids_by_step[step_index]): - break - step_output_token_ids = [] - for sequence_index in range(batch_size): - token_id = accepted_token_ids_by_step[step_index][ - sequence_index] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_ids[sequence_index], - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - return sampler_output_list - - -def _get_model_architecture(config: PretrainedConfig) -> str: - architectures = getattr(config, "architectures", []) - for arch in architectures: - if arch in _NEURON_SUPPORTED_MODELS: - return arch - raise ValueError( - f"Model architectures {architectures} are not supported on Neuron " - f"for now. Supported architectures: " - f"{list(_NEURON_SUPPORTED_MODELS.keys())}") - - -def _get_buckets(env: str, default_value: list[int]) -> list[int]: - env_value = os.getenv(env) - if env_value is None: - return default_value - buckets_remove_empty = filter( - lambda x: x is not None and len(x.strip()) > 0, env_value.split(",")) - buckets_int = map(int, buckets_remove_empty) - buckets_list = list(buckets_int) - return buckets_list - - -def _get_default_neuron_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): - """Generate a neuron config based on vllm config args.""" - from transformers_neuronx.config import ContinuousBatchingConfig - from transformers_neuronx.constants import LAYOUT_BSH - - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - quant_config = dict( - dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - quantize_method="vector_dynamic") - neuron_quantization_config_builder = lambda quant: get_quantization_config( - quant).from_config(quant_config).get_quant_method(None, "") - # TODO: Add Paged attention config to the default neuron arguments. 
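# Editor's note on the builder above: for a quantized model,
# get_quantization_config(quant) resolves to NeuronQuantConfig, from_config()
# injects the dequant dtype and quantize_method derived from the vllm config,
# and get_quant_method() materializes the transformers_neuronx
# QuantizationConfig that is passed as `quant` in default_neuron_args below.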
- default_neuron_args = dict( - collectives_layout=LAYOUT_BSH, - attention_layout=LAYOUT_BSH, - fuse_qkv=True, - quant=neuron_quantization_config_builder(model_config.quantization) - if model_config.quantization else None, - continuous_batching=continuous_batching_config, - weight_tiling=bool(model_config.quantization), - on_device_generation=_get_neuron_on_device_generation_config( - model_config)) - return default_neuron_args - - -def _get_default_neuron_config_for_speculation( - model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): - """Generate a neuron config for speculative decoding based on - vllm config args.""" - from transformers_neuronx.config import ContinuousBatchingConfig - from transformers_neuronx.constants import LAYOUT_BSH - - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - - default_neuron_args = dict(collectives_layout=LAYOUT_BSH, - attention_layout=LAYOUT_BSH, - fuse_qkv=True, - on_device_embedding=True, - continuous_batching=continuous_batching_config, - on_device_generation=copy.deepcopy( - model_config.neuron_sampling_params)) - return default_neuron_args - - -def _get_neuron_on_device_generation_config(model_config: ModelConfig): - if not _is_neuron_on_device_sampling_disabled(model_config): - return copy.deepcopy(model_config.neuron_sampling_params) - return None - - -def _is_neuron_on_device_sampling_disabled(model_config: ModelConfig) -> bool: - return not getattr(model_config, "neuron_sampling_params", None) - - -def _get_neuron_config_after_override(default_neuron_config, - overridden_neuron_config): - from transformers_neuronx.config import (ContinuousBatchingConfig, - GenerationConfig, - KVCacheQuantizationConfig, - NeuronConfig, QuantizationConfig, - SparseAttnConfig) - - sparse_attn = overridden_neuron_config.pop("sparse_attn", {}) - if sparse_attn: - overridden_neuron_config["sparse_attn"] = SparseAttnConfig( - **sparse_attn) - - kv_cache_quant = overridden_neuron_config.pop("kv_cache_quant", {}) - if kv_cache_quant: - overridden_neuron_config["kv_cache_quant"] = KVCacheQuantizationConfig( - **kv_cache_quant) - - continuous_batching = overridden_neuron_config.pop("continuous_batching", - {}) - if continuous_batching: - overridden_neuron_config[ - "continuous_batching"] = ContinuousBatchingConfig( - **continuous_batching) - - quant = overridden_neuron_config.pop("quant", {}) - if quant: - overridden_neuron_config["quant"] = QuantizationConfig(**quant) - - on_device_generation = overridden_neuron_config.pop( - "on_device_generation", {}) - if on_device_generation: - overridden_neuron_config["on_device_generation"] = GenerationConfig( - **on_device_generation) - default_neuron_config.update(overridden_neuron_config) - return NeuronConfig(**default_neuron_config) - - -def get_neuron_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig) -> nn.Module: - """Initializes a neuron-optimized model for inference.""" - # Create a model instance. 
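# Editor's note on the flow below: construct the vLLM-side wrapper, merge the
# default neuron config with any user-provided override_neuron_config,
# resolve bucket sizes from the NEURON_CONTEXT_LENGTH_BUCKETS /
# NEURON_TOKEN_GEN_BUCKETS env vars (falling back to max_model_len), then
# compile and load the transformers_neuronx model via load_weights().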
- model = NeuronCausalLM( - model_config.hf_config, - _is_neuron_on_device_sampling_disabled(model_config)) - - default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config) - - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - model.load_weights(model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - return model.eval() - - -def get_neuron_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized speculation model for inference. - - This method is only applicable for speculation with a standalone draft model - """ - from transformers_neuronx.fused_speculation import FusedSpeculativeDecoder - - # For Eagle SD, we need to pass in additional parameters in neuron config. - is_eagle = getattr(speculation_config.draft_model_config.hf_config, - "is_eagle", False) - - # Create target model instance. - target_model = NeuronCausalLM(model_config.hf_config) - - default_neuron_config_args = _get_default_neuron_config_for_speculation( - model_config, parallel_config, scheduler_config) - if is_eagle: - default_neuron_config_args['is_eagle_target'] = True - - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - target_model.load_weights( - model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - target_model.eval() - - # Create draft model instance. - draft_model = NeuronCausalLM( - speculation_config.draft_model_config.hf_config) - - default_draft_neuron_config_args = ( - _get_default_neuron_config_for_speculation( - speculation_config.draft_model_config, parallel_config, - scheduler_config)) - if is_eagle: - default_draft_neuron_config_args['is_eagle_draft'] = True - default_draft_neuron_config_args['has_pre_attention_norm'] = False - - draft_neuron_config = _get_neuron_config_after_override( - default_draft_neuron_config_args, - speculation_config.draft_model_config.override_neuron_config) - - draft_model.load_weights(speculation_config.draft_model_config.model, - tp_degree=speculation_config. - draft_parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[ - speculation_config.draft_model_config.dtype], - neuron_config=draft_neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - draft_model.eval() - - num_speculative_tokens = speculation_config.num_speculative_tokens - # Create speculation model instance. 
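# Editor's note on the call below: FusedSpeculativeDecoder pairs the compiled
# draft and target models so that, per device invocation, the draft proposes
# up to num_speculative_tokens tokens and the target verifies them; the
# wrapper's speculative_iteration() then returns the accepted tokens and
# their counts, as consumed in NeuronSpeculationCausalLM.forward above.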
- speculation_model = FusedSpeculativeDecoder(draft_model.model, - target_model.model, - num_speculative_tokens) - speculation_model.to_neuron() - - return NeuronSpeculationCausalLM(speculation_model) - - -def get_neuron_eagle_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized EAGLE speculation model for inference.""" - from transformers_neuronx.eagle_speculation import EagleSpeculativeDecoder - - # Create target model instance. - target_model = NeuronCausalLM(model_config.hf_config) - - default_neuron_config_args = _get_default_neuron_config_for_speculation( - model_config, parallel_config, scheduler_config) - default_neuron_config_args['is_eagle_target'] = True - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", - [scheduler_config.max_model_len]) - n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS", - [scheduler_config.max_model_len]) - - target_model.load_weights( - model_config.model, - tp_degree=parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - neuron_config=neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - target_model.eval() - - # Create draft model instance. - draft_model = NeuronCausalLM( - speculation_config.draft_model_config.hf_config) - - default_draft_neuron_config_args = ( - _get_default_neuron_config_for_speculation( - speculation_config.draft_model_config, parallel_config, - scheduler_config)) - default_draft_neuron_config_args['is_eagle_draft'] = True - default_draft_neuron_config_args['has_pre_attention_norm'] = False - draft_neuron_config = _get_neuron_config_after_override( - default_draft_neuron_config_args, - speculation_config.draft_model_config.override_neuron_config) - - draft_model.load_weights(speculation_config.draft_model_config.model, - tp_degree=speculation_config. 
- draft_parallel_config.tensor_parallel_size, - amp=TORCH_DTYPE_TO_NEURON_AMP[ - speculation_config.draft_model_config.dtype], - neuron_config=draft_neuron_config, - context_length_estimate=context_length_estimates, - n_positions=n_positions, - batch_size=scheduler_config.max_num_seqs) - - draft_model.eval() - - token_tree: dict[int, list[int]] = ast.literal_eval( - speculation_config.speculative_token_tree) - - speculation_model = EagleSpeculativeDecoder(draft_model.model, - target_model.model, - token_tree=token_tree) - speculation_model.to_neuron() - - return NeuronSpeculationCausalLM(speculation_model) diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py deleted file mode 100644 index 34bf43fe7b57c..0000000000000 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ /dev/null @@ -1,685 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for selecting and loading Neuron models in -neuronx-distributed-inference framework.""" -# Disabling yapf because yapf and isort have conflicts for the below imports -# yapf: disable -import copy -import hashlib -import importlib -import multiprocessing -import os -import shutil -from typing import Optional - -import torch -import torch.nn as nn -from neuronx_distributed_inference.models.config import ( - FusedSpecNeuronConfig, OnDeviceSamplingConfig) -from neuronx_distributed_inference.models.mllama.utils import ( - create_vision_mask) -from neuronx_distributed_inference.modules.lora_serving import ( - LoraServingConfig) -from neuronx_distributed_inference.utils.hf_adapter import ( - load_pretrained_config) -from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig - -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig) -from vllm.logger import init_logger -from vllm.logprobs import Logprob -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput - -# yapf: enable -logger = init_logger(__name__) - -TORCH_DTYPE_TO_NEURON_AMP = { - "auto": "float32", - "half": "float16", - "float16": "float16", - "bfloat16": "bfloat16", - "float": "float32", - "float32": "float32", - torch.float16: "float16", - torch.bfloat16: "bfloat16", - torch.float32: "float32", -} - -# Models supported by Neuronx distributed for inference. 
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = { - "LlamaForCausalLM": - ("neuronx_distributed_inference.models.llama.modeling_llama", - "NeuronLlamaForCausalLM"), - "MistralForCausalLM": - ("neuronx_distributed_inference.models.llama.modeling_llama", - "NeuronLlamaForCausalLM"), - "DbrxForCausalLM": - ("neuronx_distributed_inference.models.dbrx.modeling_dbrx", - "NeuronDbrxForCausalLM"), - "MixtralForCausalLM": - ("neuronx_distributed_inference.models.mixtral.modeling_mixtral", - "NeuronMixtralForCausalLM"), - "MllamaForConditionalGeneration": - ("neuronx_distributed_inference.models.mllama.modeling_mllama", - "NeuronMllamaForCausalLM"), -} - - -class NeuronCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - ) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - prev_hidden: Optional[torch.Tensor] = None, - adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor: - # sort block ids sequentially for perf/neuron support reasons - sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - output = self.model(input_ids, - attention_mask=None, - position_ids=positions, - seq_ids=sorted_input_block_ids, - sampling_params=sampling_params, - prev_hidden=prev_hidden, - adapter_ids=adapter_ids) - # on-device sampling - if self.config.neuron_config.on_device_sampling_config: - output = output.hidden_states - else: - output = output.logits[:, -1, :] - - restored_indices = torch.argsort(sorted_indices) - if input_block_ids.shape[0] != 1: - output = torch.index_select(output, 0, restored_indices) - - return output - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - # on-device sampling - if self.config.neuron_config.on_device_sampling_config: - batch_size = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - assert len(seq_ids) == list(batch_size)[0], "batch size mismatch" - # Organize input tensors by step instead of by sequence. 
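The sorted-batch round trip used in NeuronCausalLM.forward above (sort the seq ids, run the compiled model, then undo the permutation) is easiest to see in isolation; a minimal sketch:

    import torch

    ids = torch.tensor([2, 0, 1])
    vals = torch.tensor([20.0, 0.0, 10.0])
    sorted_ids, order = torch.sort(ids)            # sorted_ids go to the model as seq_ids
    model_out = torch.index_select(vals, 0, order) # inputs reordered to match
    restored = torch.index_select(model_out, 0, torch.argsort(order))
    assert torch.equal(restored, vals)             # caller's order recovered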
- accepted_token_ids_by_step = logits.flatten() - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - step_output_token_ids = [] - for i, seq_id in enumerate(seq_ids): - token_id = accepted_token_ids_by_step[i] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - return SamplerOutput(outputs=step_output_token_ids) - else: - return self.sampler(logits, sampling_metadata) - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - self.config.neuron_config = neuron_config - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - try: - self.model = neuronx_model_cls(compiled_model_path) - override_neuron_config = kwargs["override_neuron_config"] - for k, v in override_neuron_config.items(): - setattr(self.model.config.neuron_config, k, v) - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError) as e: - logger.warning("Exception: %s", e) - logger.warning("Failed to load the model from %s, Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - self.model = neuronx_model_cls(model_name_or_path, config) - self.model.compile(compiled_model_path) - self.model.load(compiled_model_path) - - -class NeuronMllamaForCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - on_device_sampling_disabled: bool = False) -> None: - super().__init__() - # has_image is the only multimodal input that is used in - # token-generation - # This is a cache (on CPU) that saves has_image data per sequence id - # The number of entries in this cache is <= Batch-Size - self.has_image_cache: dict[int, torch.Tensor] = {} - self.config = config - self.logits_processor = LogitsProcessor( - config.get_text_config().vocab_size, logits_as_input=True) - - self.on_device_sampling_disabled = on_device_sampling_disabled - if self.on_device_sampling_disabled: - # Use default sampler - self.sampler = Sampler() - - # Lazy initialized - self.model: nn.Module - self.is_reorder_needed: bool = True - - def read_from_has_image_cache(self, seq_ids: torch.Tensor): - has_image_list = [] - for index in range(len(seq_ids)): - seq_id = seq_ids[index].item() - if 
seq_id in self.has_image_cache: - has_image_list.append(self.has_image_cache[seq_id]) - else: - has_image_list.append(torch.tensor([0])) - return torch.tensor(has_image_list) - - def write_to_has_image_cache(self, seq_ids: torch.Tensor, - has_image: torch.Tensor): - for index in range(len(seq_ids)): - seq_id = seq_ids[index].item() - if index < len(has_image): - self.has_image_cache[seq_id] = has_image[index] - else: - self.has_image_cache[seq_id] = torch.zeros(1) - - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, - seq_ids: torch.Tensor, pixel_values: torch.Tensor, - aspect_ratios: torch.Tensor, num_chunks: torch.Tensor, - has_image: torch.Tensor, sampling_params) -> torch.Tensor: - - # We update the has_image cache during prefill - # and read the has_image cache during decode - if input_ids.shape[-1] > 1: # prefill - self.write_to_has_image_cache(seq_ids, has_image) - else: - has_image = self.read_from_has_image_cache(seq_ids) - bs = input_ids.shape[0] - num_chunks = torch.zeros((bs, 1)) - aspect_ratios = torch.zeros((bs, 1, 2)) - - input_block_ids = seq_ids - origin_input_block_ids = seq_ids - if self.is_reorder_needed: - # sort block ids sequentially for perf/neuron support reasons - input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - pixel_values = torch.index_select(pixel_values, 0, sorted_indices) - aspect_ratios = torch.index_select(aspect_ratios, 0, - sorted_indices) - num_chunks = torch.index_select(num_chunks, 0, sorted_indices) - has_image = torch.index_select(has_image, 0, sorted_indices) - - self.vision_mask = create_vision_mask(input_ids, self.vision_token_id) - output = self.model( - input_ids.to(torch.int32), - attention_mask=None, - position_ids=positions.to(torch.int32), - seq_ids=seq_ids.flatten().to(torch.int32), - pixel_values=pixel_values.to( - self.config.vision_config.torch_dtype), - aspect_ratios=aspect_ratios.to(torch.int32), - vision_mask=self.vision_mask.to(torch.int32), - sampling_params=sampling_params, - num_chunks=num_chunks.to(torch.int32), - has_image=has_image.to(torch.int32), - ) - if self.config.neuron_config.on_device_sampling_config: - output = output.hidden_states - else: - output = output.logits[:, -1, :] - - if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1: - restored_indices = torch.argsort(sorted_indices) - output = torch.index_select(output, 0, restored_indices) - return output - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample(self, hidden_states, sampling_metadata): - if not self.on_device_sampling_disabled: - with torch.profiler.record_function("sample"): - hidden_states = hidden_states.flatten() - res = [] - sample_idx = 0 - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - samples = [] - for seq_id in seq_ids: - token_id = hidden_states[sample_idx].item() - samples.append( - SequenceOutput( - parent_seq_id=seq_id, - output_token=token_id, - logprobs={token_id: Logprob(token_id)})) - sample_idx += 1 - res.append( - CompletionSequenceGroupOutput(samples=samples, - prompt_logprobs=None)) - next_tokens = SamplerOutput(outputs=res) - else: - next_tokens = self.sampler(None, hidden_states, 
sampling_metadata) - return next_tokens - - def load_weights(self, model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - self.config.neuron_config = neuron_config - logger.info("neuron_config buckets: %s", - self.config.neuron_config.buckets) - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - try: - self.model = neuronx_model_cls(compiled_model_path) - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - self.vision_token_id = tokenizer( - "<|image|>", add_special_tokens=False).input_ids[0] - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError): - logger.warning("Failed to load the model from %s, Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - self.model = neuronx_model_cls(model_name_or_path, config) - - logger.info("\nCompiling and saving model to %s", model_name_or_path) - - p = multiprocessing.Process(target=compile_model, - args=(self, compiled_model_path)) - p.start() - p.join() - - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.save_pretrained(compiled_model_path) - logger.info("Successfully compiled and saved the model in %s", - compiled_model_path) - - # Read "<|image|>" token_id from the tokenizer - self.vision_token_id = tokenizer("<|image|>", - add_special_tokens=False).input_ids[0] - logger.info("\nLoading model from compiled checkpoint...") - self.model.load(compiled_model_path) - - -def compile_model(neuron_model, traced_model_path): - neuron_model.model.compile(traced_model_path) - - -class NeuronSpeculationCausalLM(nn.Module): - """A Neuron-optimized causal language model with speculative decoding.""" - - def __init__( - self, - config: PretrainedConfig, - ) -> None: - super().__init__() - self.config = config - self.logits_processor = LogitsProcessor(config.vocab_size, - logits_as_input=True) - # Lazy initialized - self.model: nn.Module - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - ) -> torch.Tensor: - # sort block ids sequentially for perf/neuron support reasons - sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) - input_ids = torch.index_select(input_ids, 0, sorted_indices) - positions = torch.index_select(positions, 0, sorted_indices) - sampling_params = torch.index_select(sampling_params, 0, - sorted_indices) - - output = 
self.model(input_ids, - attention_mask=None, - position_ids=positions, - seq_ids=sorted_input_block_ids, - sampling_params=sampling_params) - restored_indices = torch.argsort(sorted_indices) - - # CTX encoding - if (positions[:, 0]).sum().item() == 0: - output = output.fused_outputs[0][:, 0:1] - if input_block_ids.shape[0] != 1: - output = torch.index_select(output, 0, restored_indices) - return output - - # Fused Spec (Generation) - accepted_tokens_with_padding = output.fused_outputs[0] - next_pos_ids = output.fused_outputs[-1] - generated_token_counts = next_pos_ids - positions - - assert torch.any(generated_token_counts == 0).item() is False, \ - "NxDI model generated no output for one or more sequences." - - batch_size, steps = accepted_tokens_with_padding.shape - mask = torch.arange(steps).expand(batch_size, - -1) >= generated_token_counts - accepted_tokens_with_padding[mask] = -1 - - if input_block_ids.shape[0] != 1: - accepted_tokens_with_padding = torch.index_select( - accepted_tokens_with_padding, 0, restored_indices) - - return accepted_tokens_with_padding - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - batch_size, num_steps = logits.shape - seq_ids = [ - seq_id for sg in sampling_metadata.seq_groups - for seq_id in sg.seq_ids - ] - # Organize input tensors by step instead of by sequence. - accepted_token_ids_by_step = logits.transpose(0, 1) - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - sampler_output_list = [] - for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): - break - step_output_token_ids = [] - for sequence_index in range(batch_size): - token_id = accepted_token_ids_by_step[step_index][ - sequence_index] - step_output_token_ids.append( - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_ids[sequence_index], - output_token=token_id, - logprobs={token_id: Logprob(token_id)}) - ], - prompt_logprobs=None)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - return sampler_output_list - - def load_weights(self, model_name_or_path: str, - draft_model_name_or_path: str, **kwargs): - arch = _get_model_architecture(self.config) - neuronx_module_path, neuronx_model_cls_name = ( - _NEURON_SUPPORTED_MODELS[arch]) - neuronx_module = importlib.import_module(neuronx_module_path) - neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) - neuron_config = neuronx_model_cls.get_neuron_config_cls()( - **kwargs['neuron_config']) - config = neuronx_model_cls.get_config_cls()( - neuron_config, - load_config=load_pretrained_config(model_name_or_path)) - - draft_neuron_config = copy.deepcopy(config.neuron_config) - if not config.neuron_config.enable_eagle_speculation: - draft_neuron_config.speculation_length = 0 - draft_neuron_config.trace_tokengen_model = True - draft_neuron_config.enable_fused_speculation = False - if getattr(config.neuron_config, "draft_model_modules_to_not_convert", - None): - draft_neuron_config.modules_to_not_convert = ( - draft_neuron_config.draft_model_modules_to_not_convert) - if config.neuron_config.enable_eagle_speculation: - draft_neuron_config.is_eagle_draft = True - draft_neuron_config.sequence_parallel_enabled = False - draft_config = neuronx_model_cls.get_config_cls()( - draft_neuron_config, - load_config=load_pretrained_config(draft_model_name_or_path)) - fused_spec_config = (FusedSpecNeuronConfig( - neuronx_model_cls._model_cls, 
- draft_config=draft_config, - draft_model_path=draft_model_name_or_path)) - config.fused_spec_config = fused_spec_config - self.config.neuron_config = neuron_config - - hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'), - usedforsecurity=False).hexdigest() - if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None: - compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS") - elif os.path.exists(model_name_or_path): - compiled_model_path = os.path.join(model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - else: - compiled_model_path = os.path.join("local-models", - model_name_or_path, - "neuron-compiled-artifacts", - hashed_config) - shutil.rmtree(compiled_model_path, ignore_errors=True) - try: - self.model = neuronx_model_cls(compiled_model_path) - override_neuron_config = kwargs["override_neuron_config"] - for k, v in override_neuron_config.items(): - setattr(self.model.config.neuron_config, k, v) - self.model.load(compiled_model_path) - return - except (FileNotFoundError, ValueError) as e: - logger.warning("Exception: %s", e) - logger.warning("Failed to load the model from %s Recompiling...", - compiled_model_path) - if not os.path.exists(model_name_or_path): - hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - saved_path = os.path.join("local-models", model_name_or_path) - hf_model.save_pretrained(saved_path) - model_name_or_path = saved_path - if not os.path.exists(draft_model_name_or_path): - if draft_model_name_or_path != model_name_or_path: - hf_model = AutoModelForCausalLM.from_pretrained( - draft_model_name_or_path) - saved_path = os.path.join("local-models", - draft_model_name_or_path) - hf_model.save_pretrained(saved_path) - draft_model_name_or_path = saved_path - else: - draft_model_name_or_path = model_name_or_path - config.fused_spec_config.draft_model_path = draft_model_name_or_path - self.model = neuronx_model_cls(model_name_or_path, config) - self.model.compile(compiled_model_path) - self.model.load(compiled_model_path) - - -def _get_model_architecture(config: PretrainedConfig) -> str: - architectures = getattr(config, "architectures", []) - for arch in architectures: - if arch in _NEURON_SUPPORTED_MODELS: - return arch - raise ValueError( - f"Model architectures {architectures} are not supported on Neuron " - f"for now. 
Supported architectures: " - f"{list(_NEURON_SUPPORTED_MODELS.keys())}") - - -def _get_default_neuron_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_serving_config: LoraServingConfig): - """Generate a neuron config based on vllm config args.""" - on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True, - deterministic=False) - batch_size = scheduler_config.max_num_seqs - - neuron_config = dict( - tp_degree=parallel_config.tensor_parallel_size, - ctx_batch_size=1, - batch_size=batch_size, - max_context_length=scheduler_config.max_model_len, - seq_len=scheduler_config.max_model_len, - enable_bucketing=True, - is_continuous_batching=True, - quantized=False, - torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - padding_side="right", - on_device_sampling_config=on_device_sampling_config, - sequence_parallel_enabled=True, - lora_serving_config=lora_serving_config) - return neuron_config - - -def _get_default_speculation_config(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Generate a neuron config for speculative decoding based on vllm config - args.""" - neuron_config = dict( - tp_degree=parallel_config.tensor_parallel_size, - ctx_batch_size=1, - batch_size=scheduler_config.max_num_seqs, - max_context_length=scheduler_config.max_model_len, - seq_len=scheduler_config.max_model_len, - speculation_length=speculation_config.num_speculative_tokens, - trace_tokengen_model=False, - enable_fused_speculation=True, - enable_bucketing=True, - is_continuous_batching=True, - quantized=False, - torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], - on_device_sampling_config=dict( - top_k=1, - do_sample=False, - )) - return neuron_config - - -def _get_neuron_config_after_override(default_neuron_config, - overridden_neuron_config): - """Update default neuron config values with override args""" - overridden_neuron_config = overridden_neuron_config or {} - default_neuron_config.update(overridden_neuron_config) - return default_neuron_config - - -def get_neuron_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_serving_config: LoraServingConfig) -> nn.Module: - """Initializes a neuron-optimized model for inference.""" - model_arch = _get_model_architecture(model_config.hf_config) - if model_arch == "MllamaForConditionalGeneration": - model = NeuronMllamaForCausalLM(model_config.hf_config) - else: - model = NeuronCausalLM(model_config.hf_config) - default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config, lora_serving_config) - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - override_neuron_config = model_config.override_neuron_config - model.load_weights(model_config.model, - neuron_config=neuron_config, - override_neuron_config=override_neuron_config) - return model.eval() - - -def get_neuron_speculation_model(model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - speculation_config: SpeculativeConfig): - """Initializes a neuron-optimized speculation model for inference. - - This model handles speculation using both a draft model and an EAGLE draft. 
- """ - model = NeuronSpeculationCausalLM(model_config.hf_config) - default_neuron_config_args = _get_default_speculation_config( - model_config, parallel_config, scheduler_config, speculation_config) - neuron_config = _get_neuron_config_after_override( - default_neuron_config_args, model_config.override_neuron_config) - - override_neuron_config = model_config.override_neuron_config - model.load_weights(model_config.model, - speculation_config.draft_model_config.model, - neuron_config=neuron_config, - override_neuron_config=override_neuron_config) - return model.eval() diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 56edb8629e45b..9b64817da648c 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -169,37 +169,12 @@ def cpu_platform_plugin() -> Optional[str]: return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None -def neuron_platform_plugin() -> Optional[str]: - tnx_installed = False - nxd_installed = False - logger.debug("Checking if Neuron platform is available.") - try: - import transformers_neuronx # noqa: F401 - tnx_installed = True - logger.debug("Confirmed Neuron platform is available because" - " transformers_neuronx is found.") - except ImportError: - pass - - try: - import neuronx_distributed_inference # noqa: F401 - nxd_installed = True - logger.debug("Confirmed Neuron platform is available because" - " neuronx_distributed_inference is found.") - except ImportError: - pass - - is_neuron = tnx_installed or nxd_installed - return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None - - builtin_platform_plugins = { 'tpu': tpu_platform_plugin, 'cuda': cuda_platform_plugin, 'rocm': rocm_platform_plugin, 'xpu': xpu_platform_plugin, 'cpu': cpu_platform_plugin, - 'neuron': neuron_platform_plugin, } diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index cb620542b89f3..fdd3764d2c35d 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -73,7 +73,6 @@ class PlatformEnum(enum.Enum): TPU = enum.auto() XPU = enum.auto() CPU = enum.auto() - NEURON = enum.auto() OOT = enum.auto() UNSPECIFIED = enum.auto() @@ -164,9 +163,6 @@ class Platform: def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU - def is_neuron(self) -> bool: - return self._enum == PlatformEnum.NEURON - def is_out_of_tree(self) -> bool: return self._enum == PlatformEnum.OOT diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py deleted file mode 100644 index cb8ac8db669fe..0000000000000 --- a/vllm/platforms/neuron.py +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import enum -import os -from functools import lru_cache -from typing import TYPE_CHECKING, Optional - -from vllm import envs -from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS - -from .interface import Platform, PlatformEnum - -if TYPE_CHECKING: - from vllm.config import VllmConfig -else: - VllmConfig = None - -logger = init_logger(__name__) - - -class NeuronFramework(enum.Enum): - TRANSFORMERS_NEURONX = "transformers-neuronx" - NEURONX_DISTRIBUTED_INFERENCE = "neuronx-distributed-inference" - - -class NeuronPlatform(Platform): - _enum = PlatformEnum.NEURON - device_name: str = "neuron" - device_type: str = "neuron" - ray_device_key: str = "neuron_cores" - supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"] - dist_backend: str = "gloo" - device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" - - 
@classmethod - def get_device_name(cls, device_id: int = 0) -> str: - return "neuron" - - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - - @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = \ - "vllm.worker.neuron_worker.NeuronWorker" - - if parallel_config.world_size > 1: - parallel_config.distributed_executor_backend = "uni" - - if vllm_config.cache_config and vllm_config.model_config: - # neuron needs block_size = max_model_len - vllm_config.cache_config.block_size = \ - vllm_config.model_config.max_model_len # type: ignore - - if vllm_config.model_config and vllm_config.model_config.use_mla: - logger.info( - "MLA is enabled on a non-GPU platform; forcing chunked " - "prefill and prefix caching to be disabled.") - vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False - vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) - - @classmethod - def is_pin_memory_available(cls) -> bool: - logger.warning("Pin memory is not supported on Neuron.") - return False - - @classmethod - def get_device_communicator_cls(cls) -> str: - if envs.VLLM_USE_V1: - return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator" # noqa - else: - return Platform.get_device_communicator_cls() - - @classmethod - def use_all_gather(cls) -> bool: - return True - - @classmethod - @lru_cache - def is_neuronx_distributed_inference(cls) -> bool: - try: - import neuronx_distributed_inference - except ImportError: - neuronx_distributed_inference = None - return neuronx_distributed_inference is not None - - @classmethod - @lru_cache - def is_transformers_neuronx(cls) -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None - - def get_neuron_framework_to_use(self): - """Return the specified framework if corresponding installations are - available. - - If no framework is specified, use neuronx-distributed-inference by - default. - If that's unavailable, check and switch to transformers-neuronx. - """ - if not self.is_neuron(): - raise AssertionError( - f"Neuron Framework unavailable for platform: {self}") - - tnx_installed = self.is_transformers_neuronx() - nxd_installed = self.is_neuronx_distributed_inference() - - specified_framework = os.environ.get("VLLM_NEURON_FRAMEWORK") - tnx_framework = NeuronFramework.TRANSFORMERS_NEURONX.value - nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE.value - if specified_framework == tnx_framework and tnx_installed: - return self.TRANSFORMERS_NEURONX - - if ((specified_framework == nxd_framework and nxd_installed) - or (specified_framework is None and nxd_installed)): - return NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE - - if specified_framework is None and tnx_installed: - return NeuronFramework.TRANSFORMERS_NEURONX - - return None - - def use_neuronx_distributed(self): - """ - Return True if the framework determined in get_neuron_framework_to_use() - is NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE, False otherwise. This - is used to select the Neuron model framework and framework-specific - configuration to apply during model compilation. 
- """ - nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE - return self.get_neuron_framework_to_use() == nxd_framework - - def use_transformers_neuronx(self): - """ - Return True if the framework determined in get_neuron_framework_to_use() - is NeuronFramework.TRANSFORMERS_NEURONX, False otherwise. This is used - to select the Neuron model framework and framework-specific - configuration to apply during model compilation. - """ - return self.get_neuron_framework_to_use( - ) == NeuronFramework.TRANSFORMERS_NEURONX diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py deleted file mode 100644 index 8317b9abff0cd..0000000000000 --- a/vllm/worker/neuron_model_runner.py +++ /dev/null @@ -1,455 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union - -import torch -from torch import nn - -from vllm.config import DeviceConfig, VllmConfig -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuron import get_neuron_model -from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs -from vllm.platforms import current_platform -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - - -@dataclass(frozen=True) -class ModelInputForNeuron(ModelRunnerInputBase): - """ - Used by the NeuronModelRunner. - """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - input_block_ids: Optional[torch.Tensor] = None - sampling_metadata: SamplingMetadata = None - multi_modal_kwargs: BatchedTensorInputs = None - adapter_ids: Optional[str] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - return { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "input_block_ids": self.input_block_ids, - "sampling_metadata": self.sampling_metadata, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForNeuron": - return ModelInputForNeuron( - input_tokens=tensor_dict["input_tokens"], - input_positions=tensor_dict["input_positions"], - input_block_ids=tensor_dict["input_block_ids"], - sampling_metadata=tensor_dict["sampling_metadata"], - multi_modal_kwargs=tensor_dict["multi_modal_kwargs"], - ) - - -class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): - """A model runner for AWS Neuron hardware""" - - # NEURON has an upper limit on the top_k - _MAX_NEURON_SAMPLING_TOP_K = 256 - - def __init__( - self, - vllm_config: VllmConfig, - ): - ModelRunnerBase.__init__(self, vllm_config) - - if (self.model_config is not None - and self.model_config.get_sliding_window()): - logger.warning("Sliding window is not supported on Neuron. 
" - "The model will run without sliding window.") - self.device_config = (self.device_config if self.device_config - is not None else DeviceConfig()) - self.lora_config = vllm_config.lora_config - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - # Lazy initialization. - self.model: nn.Module # initialize after load_model. - - # Once NEURON_ON_DEVICE_SAMPLING_DISABLED is set to a non-zero value, - # turn off on-device sampling. - self._on_device_sampling_disabled = int( - os.getenv("NEURON_ON_DEVICE_SAMPLING_DISABLED", "0")) - - # NEURON needs to update sampling parameters when request IDs change - # across batches. This variable stores the previous batch's request IDs - # to determine if an update is needed. - self._previous_batch_request_ids: List[str] = [] - - if not self._on_device_sampling_disabled: - self._init_neuron_sampling() - - def _init_neuron_sampling(self) -> None: - if current_platform.use_transformers_neuronx(): - from transformers_neuronx.config import GenerationConfig - else: - from transformers import GenerationConfig - logger.warning( - "On-device sampling is turned on in Neuron by default, only " - "top_k, top_p, and temperature are current supported sampling " - "parameters. To turn off the on-device sampling, please set " - "the environment variable NEURON_ON_DEVICE_SAMPLING_DISABLED=1.") - self.model_config.neuron_sampling_params = GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K) - - def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - seq_len = len(prompt_tokens) - seq_lens.append(seq_len) - - input_tokens.append(prompt_tokens) - input_positions.append(list(range(seq_len))) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - mm_kwargs = seq_group_metadata.multi_modal_data - if mm_kwargs: - mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs) - multi_modal_kwargs_list.append(mm_kwargs) - - max_seq_len = max(seq_lens) - assert max_seq_len > 0 - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_block_ids = 
torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - context_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - context_lens.append(seq_len) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - max_len=1, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=1, - dtype=torch.long, - device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - input_block_ids = torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - return input_tokens, input_positions, input_block_ids - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: - return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - multi_modal_kwargs = None - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. 
- seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - if current_platform.use_transformers_neuronx( - ) and not self._on_device_sampling_disabled: - # Once the request IDs are changed in current iteration, we will - # update the on-device sampling parameters. - current_batch_request_ids = [ - seq_group_meta_data.request_id - for seq_group_meta_data in seq_group_metadata_list - ] - if current_batch_request_ids != self._previous_batch_request_ids: - self._update_neuron_sampling_params(seq_group_metadata_list) - self._previous_batch_request_ids = current_batch_request_ids - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs) - - def _update_neuron_sampling_params( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): - # Update Neuron sampling parameters (GenerationConfig in Neuron) - current_sampling_params = self.model_config.neuron_sampling_params - assert current_sampling_params is not None, ( - f"Failed to update sampling_params, " - f"current sampling params is {current_sampling_params}") - - is_update_needed = False - - top_k = current_sampling_params.top_k - top_p = current_sampling_params.top_p - temperature = current_sampling_params.temperature - - # The index of a sequence's sampling parameters in neuron is equal to - # its index in `input_block_ids`. - for seq_group_metadata in seq_group_metadata_list: - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - - seq_group_top_k = sampling_params.top_k - seq_group_top_p = sampling_params.top_p - seq_group_temperature = sampling_params.temperature - - for seq_id in seq_ids: - index = seq_group_metadata.block_tables[seq_id][0] - if (top_k[index] != seq_group_top_k - or top_p[index] != seq_group_top_p - or temperature[index] != seq_group_temperature): - is_update_needed = True - - top_k[index] = seq_group_top_k - top_p[index] = seq_group_top_p - temperature[index] = seq_group_temperature - - # update_generation_config is only available in transformers-neuronx - if is_update_needed and current_platform.use_transformers_neuronx(): - self.model.model.update_generation_config(current_sampling_params) - - def _convert_to_neuron_sampling_params( - self, sampling_params: SamplingParams) -> Tuple[int, float, float]: - # Returns the top_k, top_p and temperature parameters for neuron. 
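# Examples of the conversion implemented below: temperature == 0.0 is
# treated as greedy and becomes (1, 1.0, 1.0); a disabled top_k of -1 is
# clamped to the Neuron cap, so (-1, 0.9, 0.8) -> (256, 0.9, 0.8).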
- top_k = sampling_params.top_k - top_p = sampling_params.top_p - temperature = sampling_params.temperature - - if temperature == 0.0: - # Enable greedy sampling on zero temperature - return (1, 1.0, 1.0) - if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K: - top_k = self._MAX_NEURON_SAMPLING_TOP_K - - return (top_k, top_p, temperature) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - # extract top_k, top_p and temperature from model_input for neuron - # forward call - sampling_params = (torch.tensor([[ - seq_group.sampling_params.top_k, seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature - ] for seq_group in model_input.sampling_metadata.seq_groups])) - - if current_platform.use_neuronx_distributed(): - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - adapter_ids=model_input.adapter_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - elif current_platform.use_transformers_neuronx(): - # [TODO] validate on-device sampling - # The model signature may need change for on-device sampling - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - # Compute the logits only if the on-device sampling is turned off as - # on-device sampling outputs the token ids. - if self._on_device_sampling_disabled: - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - else: - logits = hidden_states - - # Sample the next token. 
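For reference, the sampling_params tensor assembled at the top of execute_model above is a [batch, 3] stack with one (top_k, top_p, temperature) row per sequence group; illustrative values for a batch of two:

    import torch

    sampling_params = torch.tensor([[256, 1.0, 1.0],   # default-like row
                                    [40, 0.9, 0.7]])   # hypothetical request
    assert sampling_params.shape == (2, 3)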
- output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return [output] - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - def process_multi_modal_data_neuron(self, mm_data): - # this is a no-op for NeuronModelRunner - return mm_data - - def remove_all_loras(self): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def add_lora(self, lora_request: LoRARequest): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py deleted file mode 100644 index 3e4512a639083..0000000000000 --- a/vllm/worker/neuron_worker.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A Neuron worker class.""" -import os -from typing import List, Optional, Set, Tuple - -import torch.distributed - -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.platforms import current_platform -from vllm.platforms.neuron import NeuronFramework -from vllm.sequence import ExecuteModelRequest -from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class NeuronWorker(LocalOrDistributedWorkerBase): - """A worker class that executes the model on a group of neuron cores. - """ - - model_runner: NeuronModelRunner - - def __init__(self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - self.lora_config = vllm_config.lora_config - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - neuron_framework = current_platform.get_neuron_framework_to_use() - if neuron_framework == NeuronFramework.TRANSFORMERS_NEURONX: - self.model_runner = self.get_tnx_model_runner(vllm_config) - elif neuron_framework == NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE: - self.model_runner = self.get_neuronx_distributed_model_runner( - vllm_config) - else: - raise NotImplementedError( - "Specified framework" + - f" {os.environ.get('VLLM_NEURON_FRAMEWORK')}" + - " is either not installed or not supported." 
+ - " Supported frameworks: " + - "[transformers-neuronx, neuronx-distributed-inference]") - - def get_tnx_model_runner(self, vllm_config): - assert (self.lora_config - is None), ("LoRA is not supported for TransformersNeuronX " - "framework.") - if self.speculative_config is not None: - raise NotImplementedError( - "Speculative decoding is not supported for TransformersNeuronX" - ) - return NeuronModelRunner(vllm_config=vllm_config) - - def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - if self.speculative_config is not None: - assert (self.lora_config is None), ( - "LoRA is not supported for Speculative Decoding") - raise NotImplementedError( - "Speculative decoding is not supported for NeuronxDistributed") - return NeuronxDistributedModelRunner(vllm_config=vllm_config) - - def init_device(self) -> None: - self.init_distributed_environment() - - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks. - - Swapping is not yet supported, so always return num_cpu_blocks=0. - - We configure num_gpu_blocks to be equal to max_num_seqs. - """ - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 - - # Swap not yet supported with Neuron backend. - num_cpu_blocks = 0 - - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. - """ - - # Different values are not tested. - assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - @property - def do_metadata_broadcast(self) -> bool: - return False - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return None - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - return WorkerInput(num_seq_groups=len( - execute_model_req.seq_group_metadata_list), ) - - def execute_worker(self, worker_input: WorkerInput) -> None: - pass - - def get_cache_block_size_bytes(self) -> int: - """Determine the size in bytes of a cache block. - - This is required for speculative decoding; it is not yet implemented. - """ - raise NotImplementedError - - def init_distributed_environment(self): - """Neuron uses transformers-neuronx for tensor parallelism. 
- - vLLM still needs the environment initialized when TP/PP > 1 - """ - init_distributed_environment( - world_size=1, - rank=self.rank, - local_rank=self.local_rank, - distributed_init_method=self.distributed_init_method, - backend=current_platform.dist_backend, - ) - - ensure_model_parallel_initialized( - 1, - 1, - ) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.list_loras() diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py deleted file mode 100644 index 2a0f4e77c99e5..0000000000000 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ /dev/null @@ -1,294 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set - -import torch -from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import ( - get_all_supported_aspect_ratios) -from neuronx_distributed_inference.modules.generation.sampling import ( - prepare_sampling_params) -from neuronx_distributed_inference.modules.lora_serving import ( - LoraCheckpoint, LoraServingConfig) - -from vllm.config import VllmConfig -from vllm.entrypoints.openai.serving_models import LoRAModulePath -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuronx_distributed import ( - _get_model_architecture, get_neuron_model) -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - -logger = init_logger(__name__) - - -class NeuronxDistributedModelRunner(NeuronModelRunner): - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.lora_checkpoint = None - self.model = None - self.lora_serving_config = None - - @staticmethod - def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]): - if not lora_modules: - return None - return {_.get("name"): _.get("path") for _ in lora_modules} - - def _get_nxdi_lora_config(self): - override_neuron_config = self.model_config.override_neuron_config - lora_modules = override_neuron_config.pop("lora_modules", None) - target_modules = override_neuron_config.pop("target_modules", None) - lora_ckpt_paths = self._get_lora_paths_strings(lora_modules) - if 
self.lora_config.max_loras < len(lora_ckpt_paths): - raise ValueError( - "Number of LoRAs (%s) exceeds maximum " - "allowed (%s)", len(lora_ckpt_paths), - self.lora_config.max_loras) - - return LoraServingConfig( - max_loras=self.lora_config.max_loras, - max_lora_rank=self.lora_config.max_lora_rank, - target_modules=target_modules, - lora_ckpt_paths=lora_ckpt_paths, - ) - - def load_model(self) -> None: - # Update LoRA config - if self.lora_config is not None: - self.lora_serving_config = self._get_nxdi_lora_config() - self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config) - self.model = get_neuron_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - lora_serving_config=self.lora_serving_config) - - def get_nxd_sampling_params(self, sampling_metadata): - if self.model.config.neuron_config.on_device_sampling_config: - max_topk = (self.model.config.neuron_config. - on_device_sampling_config.global_topk) - else: - max_topk = self.model.config.vocab_size - - top_k = [1] * self.scheduler_config.max_num_seqs - top_p = [1.0] * self.scheduler_config.max_num_seqs - temperature = [1.0] * self.scheduler_config.max_num_seqs - - for index, sequenceGroupToSample in enumerate( - sampling_metadata.seq_groups): - top_k[index] = (sequenceGroupToSample.sampling_params.top_k - if sequenceGroupToSample.sampling_params.top_k > 0 - else max_topk) - top_p[index] = sequenceGroupToSample.sampling_params.top_p - temperature[index] = ( - sequenceGroupToSample.sampling_params.temperature) - - sampling_params = prepare_sampling_params( - batch_size=self.scheduler_config.max_num_seqs, - top_k=top_k, - top_p=top_p, - temperature=temperature) - return sampling_params - - def get_multi_modal_data_neuron(self, input_images): - raise NotImplementedError("need to restore multi-modal support") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - if _get_model_architecture( - self.model.config) != "MllamaForConditionalGeneration": - return super().execute_model(model_input, kv_caches, - intermediate_tensors, num_steps) - - sampling_params = self.get_nxd_sampling_params( - model_input.sampling_metadata) - - if model_input.multi_modal_kwargs.get('pixel_values') is not None: - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - pixel_values=model_input.multi_modal_kwargs.get( - 'pixel_values'), - aspect_ratios=model_input.multi_modal_kwargs.get( - 'aspect_ratios'), - sampling_params=sampling_params, - num_chunks=model_input.multi_modal_kwargs.get('num_chunks'), - has_image=model_input.multi_modal_kwargs.get( - 'has_image').squeeze(1), - ) - else: - bs = model_input.input_tokens.shape[0] if (model_input.input_tokens - is not None) else 1 - empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560], - dtype=torch.bfloat16) - empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64) - num_chunks = torch.zeros((bs, 1), dtype=torch.int32) - has_image = torch.zeros([bs], dtype=torch.int32) - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - 
pixel_values=empty_pixel_values, - aspect_ratios=empty_aspect_ratios, - sampling_params=sampling_params, - num_chunks=num_chunks, - has_image=has_image, - ) - - output = self.model.sample( - hidden_states=hidden_states, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def process_multi_modal_data_neuron(self, mm_data): - # Neuron uses aspect_ratios instead of aspect_ratio_ids - all_supported_aspect_ratios = get_all_supported_aspect_ratios( - self.model.config.vision_config.max_num_tiles) - aspect_ratio_ids = mm_data.get("aspect_ratio_ids") - mm_data["aspect_ratios"] = torch.tensor( - all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0) - - # Neuron's num_chunks is HF's num_tiles - mm_data["num_chunks"] = mm_data.get("num_tiles") - - # Input has an image if it has pixel_values - bs = mm_data["num_chunks"].shape[0] - pixel_values = mm_data.get("pixel_values") - if pixel_values is not None and not torch.all(pixel_values == 0): - mm_data["has_image"] = torch.ones(bs) - - else: - mm_data["has_image"] = torch.zeros(bs) - return mm_data - - def _get_lora_adapter_ids(self, seq_group_metadata_list): - # set LoRA adapter IDs for multi-lora serving - batch_size = len(seq_group_metadata_list) - if self.lora_checkpoint is not None: - # "0" indicates NxDI to use the base model for inference - adapter_ids = ["0"] * batch_size - for idx, seq_group_metadata in enumerate(seq_group_metadata_list): - if seq_group_metadata.lora_request is not None: - adapter_ids[ - idx] = seq_group_metadata.lora_request.lora_name - - # convert adapter_ids from strings to integers - adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices( - adapter_ids, batch_size) - else: - adapter_ids = torch.zeros((batch_size), dtype=torch.int32) - - return adapter_ids - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. 
- seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs, - adapter_ids=lora_adapter_ids) - - def remove_all_loras(self): - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def add_lora(self, lora_request: LoRARequest): - logger.warning( - "Adding LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config. If you supplied " - "the parameter, you can ignore this warning. Ignoring" - "lora request: ", lora_request) - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") From 558f0907dc67c804c5821c92f4d64ed43d10489b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 7 Sep 2025 09:18:59 +0800 Subject: [PATCH 005/584] [attention][DCP] use AttentionImpl.need_to_return_lse_for_decode (#24372) Signed-off-by: youkaichao --- vllm/attention/backends/abstract.py | 26 ++++++++++++++++++++++ vllm/v1/attention/backends/mla/common.py | 4 ---- vllm/v1/attention/backends/mla/flashmla.py | 2 ++ vllm/v1/worker/gpu_model_runner.py | 15 ++++++++----- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 0b9c625533cb7..0217bff6adafa 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -257,6 +257,32 @@ class AttentionLayer(Protocol): class AttentionImpl(ABC, Generic[T]): + # Whether the attention impl can return the softmax lse for decode. + # Some features like decode context parallelism require the softmax lse. 
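+    # Backends that can compute the lse during decode (e.g. FlashMLAImpl)
+    # opt in by overriding this class attribute to True.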
+ can_return_lse_for_decode: bool = False + + # some attention backends might not always want to return lse + # even if they can return lse (for efficiency reasons) + need_to_return_lse_for_decode: bool = False + + dcp_world_size: int + dcp_rank: int + + def __new__(cls, *args, **kwargs): + # use __new__ so that all subclasses will call this + self = super().__new__(cls) + try: + from vllm.distributed.parallel_state import get_dcp_group + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 + self.need_to_return_lse_for_decode = self.dcp_world_size > 1 \ + and self.can_return_lse_for_decode + return self + @abstractmethod def __init__( self, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 090ebf93840d8..ec1216a16bc46 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1592,10 +1592,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): # recorect dcp attn_out with lse. if self.dcp_world_size > 1: - assert lse is not None, ( - "For a mla backend want to enable" - "DCP, it is mandatory that the corresponding decode attn" - "kernel return the softmax lse.") attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group()) # v_up projection diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 11c91b8a0650e..1824bbadb6a1a 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -133,6 +133,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): + can_return_lse_for_decode: bool = True + def __init__( self, num_heads: int, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 563872f8d68f3..0224f3944f005 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -56,7 +56,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, cdiv, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up, supports_dynamo) -from vllm.v1.attention.backends.mla.flashmla import FlashMLABackend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, create_fast_prefill_custom_backend, @@ -3405,10 +3404,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): copy_kv_blocks) if self.dcp_world_size > 1: - assert self.attn_groups[0][0].backend is FlashMLABackend, ( - "DCP only support flashmla now." 
- "For a mla backend want to enable DCP, it is mandatory that the" - "corresponding decode attn kernel return the softmax lse.") + layer_names = self.attn_groups[0][0].layer_names + layers = get_layers_from_vllm_config(self.vllm_config, + AttentionLayerBase, + layer_names) + for layer in layers.values(): + assert layer.impl.need_to_return_lse_for_decode, ( + "DCP requires attention impls to return" + " the softmax lse for decode, but the impl " + f"{layer.impl.__class__.__name__} " + "does not return the softmax lse for decode.") def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: """ From 37a6fa95fd29e95de6ceaf7ccc249c4523be6c75 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sat, 6 Sep 2025 20:07:31 -0700 Subject: [PATCH 006/584] Migrate Qwen2 inputs to TensorSchema (#23475) Signed-off-by: Benji Beck Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../models/qwen2_5_omni_thinker.py | 52 ++--- vllm/model_executor/models/qwen2_5_vl.py | 177 +++++++++++------- vllm/model_executor/models/qwen2_audio.py | 48 +++-- vllm/model_executor/models/qwen2_vl.py | 166 +++++++++------- 4 files changed, 268 insertions(+), 175 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 29563540a7942..d05eb76cdf6fd 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -25,7 +25,7 @@ from collections.abc import Iterable, Mapping, Sequence from copy import copy from functools import partial -from typing import Any, Callable, Optional, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union import torch import torch.nn as nn @@ -41,15 +41,13 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs, Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs) from vllm.model_executor.models.qwen2_audio import ( - Qwen2AudioFeatureInputs, Qwen2AudioProcessingInfo, - _get_feat_extract_output_lengths) + Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths) from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -66,9 +64,9 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens +from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP) +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -81,6 +79,26 @@ except (ImportError, ModuleNotFoundError): logger = init_logger(__name__) +class Qwen2_5OmniAudioFeatureInputs(TensorSchema): + """ + Dimensions: + - na: Number of audios + - 
nmb: Number of mel bins + - msl: Maximum sequence length + - tsl: Total sequence length + """ + type: Literal["audio_features"] + input_features: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("nmb", "tsl"), + ] + + feature_attention_mask: Annotated[ + torch.Tensor, + TensorShape("na", "msl"), + ] + + def create_qwen2_5_omni_thinker_field_factory( spatial_merge_size: int ) -> Callable[[Mapping[str, torch.Tensor]], Mapping[str, @@ -536,7 +554,7 @@ class Qwen2_5OmniConditionalGenerationMixin: return torch.concat(mm_input, dim=dim) def _parse_and_validate_audio_input( - self, **kwargs: object) -> Optional[Qwen2AudioFeatureInputs]: + self, **kwargs: object) -> Optional[Qwen2_5OmniAudioFeatureInputs]: input_audio_features = kwargs.pop('input_audio_features', None) audio_feature_lengths = kwargs.pop('audio_feature_lengths', None) feature_attention_mask = kwargs.pop('feature_attention_mask', None) @@ -550,7 +568,8 @@ class Qwen2_5OmniConditionalGenerationMixin: if not isinstance(input_audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio input features. " f"Got type: {type(input_audio_features)}") - return Qwen2AudioFeatureInputs( + return Qwen2_5OmniAudioFeatureInputs( + type="audio_features", input_features=input_audio_features, audio_feature_lengths=audio_feature_lengths, feature_attention_mask=feature_attention_mask) @@ -633,7 +652,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _process_audio_input( self, - audio_input: Qwen2AudioFeatureInputs, + audio_input: Qwen2_5OmniAudioFeatureInputs, audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: @@ -660,8 +679,8 @@ class Qwen2_5OmniConditionalGenerationMixin: feature_lens=audio_feature_lengths, aftercnn_lens=audio_feat_lengths, ) - audio_features = audio_outputs.last_hidden_state - return audio_features.split(audio_output_lengths.tolist()) + return audio_outputs.last_hidden_state.split( + audio_output_lengths.tolist()) def _process_image_input( self, @@ -707,7 +726,7 @@ class Qwen2_5OmniConditionalGenerationMixin: dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder, ) class Qwen2_5OmniThinkerForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, + nn.Module, SupportsMultiModal, SupportsPP, Qwen2_5OmniConditionalGenerationMixin): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -800,15 +819,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_mm_mapping(self) -> MultiModelKeys: - """Get module prefix for multimodal models to filter LoRA modules.""" - return MultiModelKeys.from_string_field( - language_model="language_model", - connector=[], # No explicit connector in this model - tower_model=["visual", - "audio_tower"], # Exclude vision and audio towers - ) - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0a89f86fc7389..afef86fbaa027 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -27,7 +27,7 @@ """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping from functools import lru_cache, partial -from typing import Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Callable, Literal, Optional, Union import torch import torch.nn as nn @@ -64,6 +64,7 @@ from 
vllm.multimodal.utils import run_dp_sharded_mrope_vision_model
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP, SupportsQuant)
@@ -80,84 +81,125 @@ logger = init_logger(__name__)

 # === Vision Inputs === #


-class Qwen2_5_VLImagePixelInputs(TypedDict):
+class Qwen2_5_VLImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: Number of patches
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
+
+    Historical context:
+        - pixel_values shape: (num_patches, num_channels * patch_size *
+          patch_size)
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
+    """
     type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """Shape:
-    `(num_patches, num_channels * patch_size * patch_size)`
+
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "cps"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
+
+
+class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
     """
-
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size
+        - ni: Number of images
+
+    Historical context:
+        - image_embeds shape: (num_image_features, hidden_size)
+        - num_image_features varies based on the number and resolution of the
+          images.
+        - hidden_size must match the hidden size of language model backbone.
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
     """
-
-
-class Qwen2_5_VLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
-    image_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all images' features.
-        Each tensor holds an image's features.
-    - `torch.Tensor`: A tensor holding all images' features
-        (concatenation of all images' feature tensors).
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-        the number and resolution of the images.
-    - `hidden_size` must match the hidden size of language model backbone.
-    """
+    image_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]

-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]


 Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,
                               Qwen2_5_VLImageEmbeddingInputs]


-class Qwen2_5_VLVideoPixelInputs(TypedDict):
+class Qwen2_5_VLVideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: Number of patches
+        - nv: Number of videos
+        - ctps: Number of channels * temporal_patch_size * patch_size *
+          patch_size
+
+    Historical context:
+        - pixel_values_videos shape: (num_patches, num_channels *
+          temporal_patch_size * patch_size * patch_size)
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
+        - second_per_grid_ts: The video time interval (in seconds) for each
+          grid along the temporal dimension in the 3D position IDs. Returned
+          when `videos` is not `None`.
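+        - second_per_grid_ts is squeezed from 2-D to 1-D during input
+          validation when needed, so it matches the (num_videos,) shape
+          declared below.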
+ """ type: Literal["pixel_values_videos"] - pixel_values_videos: torch.Tensor - """Shape: - `(num_patches, - num_channels * temporal_patch_size * patch_size * patch_size)` + + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("np", "ctps"), + ] + + video_grid_thw: Annotated[ + torch.Tensor, + TensorShape("nv", 3), + ] + + second_per_grid_ts: Annotated[ + Optional[torch.Tensor], + TensorShape("nv"), + ] + + +class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): """ - - video_grid_thw: torch.Tensor - """Shape: `(num_videos, 3)` - - This should be in `(grid_t, grid_h, grid_w)` format. + Dimensions: + - nf: Number of video features + - hs: Hidden size + - nv: Number of videos + + Historical context: + - video_embeds shape: (num_video_features, hidden_size) + - num_video_features varies based on the number and resolution of the + videos. + - hidden_size must match the hidden size of language model backbone. + - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) + format """ - - second_per_grid_ts: torch.Tensor - """ - The video time interval (in seconds) for each grid along the temporal - dimension in the 3D position IDs. Returned when `videos` is not `None`. - """ - - -class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] - video_embeds: torch.Tensor - """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all videos' features. - Each tensor holds an video's features. - - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). - Tensor shape: `(num_image_features, hidden_size)` - - `num_image_features` varies based on - the number and resolution of the videos. - - `hidden_size` must match the hidden size of language model backbone. - """ + video_embeds: Annotated[ + torch.Tensor, + TensorShape("nf", "hs"), + ] - video_grid_thw: torch.Tensor - """Shape: `(num_videos, 3)` - This should be in `(grid_t, grid_h, grid_w)` format. - """ + video_grid_thw: Annotated[ + torch.Tensor, + TensorShape("nv", 3), + ] Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs, @@ -936,10 +978,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw, "image grid_thw") - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of image pixel values. " - f"Got type: {type(pixel_values)}") - return Qwen2_5_VLImagePixelInputs(type="pixel_values", pixel_values=pixel_values, image_grid_thw=image_grid_thw) @@ -950,9 +988,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw, "image grid_thw") - if not isinstance(image_embeds, torch.Tensor): - raise ValueError("Incorrect type of image embeddings. 
" - f"Got type: {type(image_embeds)}") return Qwen2_5_VLImageEmbeddingInputs( type="image_embeds", image_embeds=image_embeds, @@ -973,7 +1008,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values_videos, "video pixel values") video_grid_thw = self._validate_and_reshape_mm_tensor( video_grid_thw, "video grid_thw") - + if second_per_grid_ts is not None and second_per_grid_ts.ndim == 2: + second_per_grid_ts = second_per_grid_ts.squeeze(-1) return Qwen2_5_VLVideoPixelInputs( type="pixel_values_videos", pixel_values_videos=pixel_values_videos, @@ -987,9 +1023,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_grid_thw = self._validate_and_reshape_mm_tensor( video_grid_thw, "video grid_thw") - if not isinstance(video_embeds, torch.Tensor): - raise ValueError("Incorrect type of video embeddings. " - f"Got type: {type(video_embeds)}") return Qwen2_5_VLVideoEmbeddingInputs( type="video_embeds", video_embeds=video_embeds, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 86b4a9a018c76..54ec7b8627488 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,7 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Literal, Optional, Union import torch import torch.nn as nn @@ -47,6 +47,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, @@ -54,21 +55,38 @@ from .utils import (AutoWeightsLoader, init_vllm_registered_model, # # === Audio Inputs === # -class Qwen2AudioFeatureInputs(TypedDict): - type: Literal["audio_features"] - input_features: torch.Tensor - """Shape: `(num_audios, num_mel_bins, 3000)`""" - - feature_attention_mask: torch.Tensor - """Shape: `(num_audios, 3000)`""" - - -class Qwen2AudioEmbeddingInputs(TypedDict): - type: Literal["audio_embeds"] - audio_embeds: list[torch.Tensor] - """Shape: `(num_audio_features, hidden_size)` - `hidden_size` must match the hidden size of language model backbone. 
+class Qwen2AudioFeatureInputs(TensorSchema): """ + Dimensions: + - na: Number of audios + - nmb: Number of mel bins + """ + type: Literal["audio_features"] + input_features: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("na", "nmb", 3000), + ] + + feature_attention_mask: Annotated[ + torch.Tensor, + TensorShape("na", 3000), + ] + + +class Qwen2AudioEmbeddingInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size + - naf: Number of audio features + - hs: Hidden size (must match the hidden size of language model + backbone) + """ + type: Literal["audio_embeds"] = "audio_embeds" + + audio_embeds: Annotated[ + list[torch.Tensor], + TensorShape("bn", "naf", "hs"), + ] Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs] diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b708719e4f9b8..f00b214b1ef18 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -26,7 +26,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union import torch import torch.nn as nn @@ -70,6 +70,7 @@ from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -86,78 +87,119 @@ _MAX_FRAMES_PER_VIDEO = 16 # === Vision Inputs === # -class Qwen2VLImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: torch.Tensor - """Shape: - `(num_patches, num_channels * patch_size * patch_size)` +class Qwen2VLImagePixelInputs(TensorSchema): """ - - image_grid_thw: torch.Tensor - """Shape: `(num_images, 3)` - This should be in `(grid_t, grid_h, grid_w)` format. - """ - - -class Qwen2VLImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - image_embeds: torch.Tensor - """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all images' features. - Each tensor holds an image's features. - - `torch.Tensor`: A tensor holding all images' features - (concatenation of all images' feature tensors). + Dimensions: + - np: The total number of patches over each image over each prompt in + the batch + - ni: Number of images + - cps: Number of channels * patch_size * patch_size - Tensor shape: `(num_image_features, hidden_size)` - - `num_image_features` varies based on - the number and resolution of the images. - - `hidden_size` must match the hidden size of language model backbone. + Historical context: + - pixel_values shape: (num_patches, num_channels * patch_size * + patch_size) + - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w) + format """ + type: Literal["pixel_values"] - image_grid_thw: torch.Tensor - """Shape: `(num_images, 3)` - This should be in `(grid_t, grid_h, grid_w)` format. 
+ pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", "cps"), + ] + + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("ni", 3), + ] + + +class Qwen2VLImageEmbeddingInputs(TensorSchema): """ + Dimensions: + - nf: Number of image features + - hs: Hidden size + - ni: Number of images + + Historical context: + - image_embeds shape: (num_image_features, hidden_size) + - num_image_features varies based on the number and resolution of the + images. + - hidden_size must match the hidden size of language model backbone. + - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w) + format + """ + type: Literal["image_embeds"] + + image_embeds: Annotated[ + torch.Tensor, + TensorShape("nf", "hs"), + ] + + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("ni", 3), + ] Qwen2VLImageInputs = Union[Qwen2VLImagePixelInputs, Qwen2VLImageEmbeddingInputs] -class Qwen2VLVideoPixelInputs(TypedDict): - type: Literal["pixel_values_videos"] - pixel_values_videos: torch.Tensor - """Shape: - `(num_patches, - num_channels * temporal_patch_size * patch_size * patch_size)` +class Qwen2VLVideoPixelInputs(TensorSchema): """ - - video_grid_thw: torch.Tensor - """Shape: `(num_videos, 3)` - - This should be in `(grid_t, grid_h, grid_w)` format. - """ - - -class Qwen2VLVideoEmbeddingInputs(TypedDict): - type: Literal["video_embeds"] - video_embeds: torch.Tensor - """Supported types: - - list[`torch.Tensor`]: A list of tensors holding all videos' features. - Each tensor holds an video's features. - - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + Dimensions: + - np: The total number of patches over each video over each prompt in + the batch + - ctps: Number of channels * temporal_patch_size * patch_size * + patch_size + - nv: Number of videos - Tensor shape: `(num_image_features, hidden_size)` - - `num_image_features` varies based on - the number and resolution of the videos. - - `hidden_size` must match the hidden size of language model backbone. + Historical context: + - pixel_values_videos shape: (num_patches, num_channels * + temporal_patch_size * patch_size * patch_size) + - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) + format """ + type: Literal["pixel_values_videos"] - video_grid_thw: torch.Tensor - """Shape: `(num_videos, 3)` - This should be in `(grid_t, grid_h, grid_w)` format. + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("np", "ctps"), + ] + + video_grid_thw: Annotated[ + torch.Tensor, + TensorShape("nv", 3), + ] + + +class Qwen2VLVideoEmbeddingInputs(TensorSchema): """ + Dimensions: + - nf: Number of video features + - hs: Hidden size + - nv: Number of videos + + Historical context: + - video_embeds shape: (num_video_features, hidden_size) + - num_video_features varies based on the number and resolution of the + videos. + - hidden_size must match the hidden size of language model backbone. 
+ - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) + format + """ + type: Literal["video_embeds"] + + video_embeds: Annotated[ + torch.Tensor, + TensorShape("nf", "hs"), + ] + + video_grid_thw: Annotated[ + torch.Tensor, + TensorShape("nv", 3), + ] Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, @@ -1126,10 +1168,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw, "image grid_thw") - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of image pixel values. " - f"Got type: {type(pixel_values)}") - return Qwen2VLImagePixelInputs(type="pixel_values", pixel_values=pixel_values, image_grid_thw=image_grid_thw) @@ -1140,9 +1178,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw, "image grid_thw") - if not isinstance(image_embeds, torch.Tensor): - raise ValueError("Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", image_embeds=image_embeds, image_grid_thw=image_grid_thw) @@ -1174,9 +1209,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_grid_thw = self._validate_and_reshape_mm_tensor( video_grid_thw, "video grid_thw") - if not isinstance(video_embeds, torch.Tensor): - raise ValueError("Incorrect type of video embeddings. " - f"Got type: {type(video_embeds)}") return Qwen2VLVideoEmbeddingInputs(type="video_embeds", video_embeds=video_embeds, video_grid_thw=video_grid_thw) From e67597545b0162e8f2e0d4f8186fb2e546c04128 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Sat, 6 Sep 2025 23:10:40 -0400 Subject: [PATCH 007/584] [CI][Fix] deterministic seed for flaky CI runs on structured outputs (#24380) Signed-off-by: Aaron Pham --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index fb49db8f1611d..c10b1abb2b3b7 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -41,9 +41,8 @@ EAGLE_SPEC_CONFIG = { PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), - #FIXME: This test is flaky on CI thus disabled - #("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", - # None), + ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", + None), ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), @@ -123,6 +122,7 @@ def test_structured_output( guided_decoding_backend=guided_decoding_backend, guided_decoding_disable_any_whitespace=(guided_decoding_backend in {"xgrammar", "guidance"}), + seed=120, tokenizer_mode=tokenizer_mode, speculative_config=speculative_config) From 77aec83b8c6e6c15a9b5c333a531c29eff0b61fc Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Sun, 7 Sep 2025 11:12:05 +0800 Subject: [PATCH 008/584] [Benchmark] add benchmark for custom activation op (#23908) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zjy0516 Signed-off-by: 
Jiangyun Zhu Co-authored-by: Luka Govedič --- benchmarks/kernels/benchmark_activation.py | 104 +++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 benchmarks/kernels/benchmark_activation.py diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py new file mode 100644 index 0000000000000..93edbcc9391fc --- /dev/null +++ b/benchmarks/kernels/benchmark_activation.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# benchmark custom activation op performance +import itertools + +import torch + +import vllm.model_executor.layers.activation # noqa F401 +from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + +batch_size_range = [1, 16, 32, 64, 128] +seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +intermediate_size = [3072, 9728, 12288] +configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) + + +def benchmark_activation( + batch_size: int, + seq_len: int, + intermediate_size: int, + provider: str, + func_name: str, + dtype: torch.dtype, +): + device = "cuda" + num_tokens = batch_size * seq_len + dim = intermediate_size + current_platform.seed_everything(42) + torch.set_default_device(device) + + if func_name == "gelu_and_mul": + layer = CustomOp.op_registry[func_name](approximate="none") + elif func_name == "gelu_and_mul_tanh": + layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh") + elif func_name == "fatrelu_and_mul": + threshold = 0.5 + layer = CustomOp.op_registry[func_name](threshold) + else: + layer = CustomOp.op_registry[func_name]() + + x = torch.randn(num_tokens, dim, dtype=dtype, device=device) + compiled_layer = torch.compile(layer.forward_native) + + if provider == "custom": + fn = lambda: layer(x) + elif provider == "compiled": + fn = lambda: compiled_layer(x) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + fn, quantiles=[0.5, 0.2, 0.8] + ) + return ms, max_ms, min_ms + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the custom activation op.") + parser.add_argument( + "--func-name", + type=str, + choices=[ + "mul_and_silu", + "silu_and_mul", + "gelu_and_mul", + "gelu_and_mul_tanh", + "fatrelu_and_mul", + "swigluoai_and_mul", + "gelu_new", + "gelu_fast", + "quick_gelu", + ], + default="silu_and_mul", + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + args = parser.parse_args() + assert args + + func_name = args.func_name + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + perf_report = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len", "intermediate_size"], + x_vals=configs, + line_arg="provider", + line_vals=["custom", "compiled"], + line_names=["Custom OP", "Compiled"], + styles=[("blue", "-"), ("green", "-")], + ylabel="ms", + plot_name=f"{func_name}-op-performance", + args={}, + ) + ) + + perf_report( + lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation( + batch_size, seq_len, intermediate_size, provider, func_name, dtype + ) + ).run(print_data=True) From 75334956c2a88dc7d9d73243bed26a75c8a68b85 Mon Sep 17 00:00:00 2001 From: "Saman A. 
Pour" Date: Sat, 6 Sep 2025 20:18:54 -0700 Subject: [PATCH 009/584] QWEN3 Thinking Fused MoE kernels Optimization configs (#24330) Signed-off-by: Saman Keon --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 36 ++--- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 4 files changed, 456 insertions(+), 18 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e5059358c91e3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..db1b6e98df469 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json index b9dc2d71f6dcf..1bbb8aa613996 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -9,16 +9,16 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "4": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 @@ -26,15 +26,15 @@ "8": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "16": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 @@ -42,7 +42,7 @@ "24": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 @@ -53,12 +53,12 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "48": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 @@ -82,10 +82,10 @@ "128": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "256": { "BLOCK_SIZE_M": 16, @@ -98,8 +98,8 @@ "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, @@ -107,7 +107,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, @@ -123,15 +123,15 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4 }, "3072": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8fb4947d62ab2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} From 81c53ef55c01203a2be39f97acfc9f40abe3686b Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Sat, 6 Sep 2025 20:30:41 -0700 Subject: [PATCH 010/584] [Misc] collect flashinfer version in collect_env.py (#24378) Signed-off-by: Ye (Charlotte) Qi --- vllm/collect_env.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 5b3883a482f25..0291f64e84f0a 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -74,6 +74,7 @@ DEFAULT_CONDA_PATTERNS = { "zmq", "nvidia", "pynvml", + "flashinfer-python", } DEFAULT_PIP_PATTERNS = { @@ -89,6 +90,7 @@ DEFAULT_PIP_PATTERNS = { "zmq", "nvidia", "pynvml", + "flashinfer-python", } From 62f66be1f74378e5a22e266ad161023c324cf4f8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 7 Sep 2025 13:19:46 +0800 Subject: [PATCH 011/584] [Bugfix] Fix Qwen3-coder moe tuned config (#24072) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 6 +- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 86 +++++++++---------- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 710d30adfd846..6259aa0dd6290 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -678,7 +678,11 @@ def main(args: argparse.Namespace): is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over 
{len(search_space)} configurations...") - + if use_deep_gemm: + raise ValueError( + "Tuning with --use-deep-gemm is not supported as it only tunes Triton " + "kernels. Please remove the flag." + ) start = time.time() configs = _distribute( "tune", diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json index 307c9240938c5..c7998718dab4c 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -18,18 +18,18 @@ "4": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "8": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "16": { "BLOCK_SIZE_M": 16, @@ -58,7 +58,7 @@ "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 @@ -74,73 +74,73 @@ "96": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "128": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 2 + "num_stages": 4 }, "256": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, "num_stages": 4 }, "1024": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "2048": { - "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 5 + "num_warps": 4, + "num_stages": 3 }, - "3072": { - "BLOCK_SIZE_M": 128, + "2048": { + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 }, "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 5 + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 } } From 105d3d62efde2d750b985020bbcea888f94a139b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 7 Sep 2025 01:12:36 -0700 Subject: [PATCH 012/584] [TPU] Remove TopKTopPSampler dependency for TPU sampler 
(#24391) Signed-off-by: Woosuk Kwon --- tests/v1/tpu/test_topk_topp_sampler.py | 8 ++- vllm/v1/sample/ops/topk_topp_sampler.py | 53 +---------------- vllm/v1/sample/tpu/sampler.py | 75 +++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 57 deletions(-) diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index ca5c067b364e0..05751badc7619 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -6,8 +6,12 @@ import pytest import torch from vllm.platforms import current_platform -from vllm.v1.sample.ops.topk_topp_sampler import (apply_top_k_top_p, - apply_top_k_top_p_tpu) +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p + +# isort: off +from vllm.v1.sample.tpu.sampler import (apply_top_k_top_p as + apply_top_k_top_p_tpu) +# isort: on if not current_platform.is_tpu(): pytest.skip("This test needs a TPU.", allow_module_level=True) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 7bd4a5a380ac0..cc5653b10ec1d 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -73,10 +73,8 @@ class TopKTopPSampler(nn.Module): self.forward = self.forward_native else: self.forward = self.forward_native - if current_platform.is_tpu(): - self.apply_top_k_top_p = apply_top_k_top_p_tpu - else: - self.apply_top_k_top_p = apply_top_k_top_p + + self.apply_top_k_top_p = apply_top_k_top_p def forward_native( self, @@ -125,53 +123,6 @@ class TopKTopPSampler(nn.Module): return flashinfer_sample(logits.contiguous(), k, p, generators), None -def apply_top_k_top_p_tpu( - logits: torch.Tensor, - k: torch.Tensor, - p: torch.Tensor, -) -> torch.Tensor: - """ - Apply top-k and top-p optimized for TPU. - - This algorithm avoids using torch.scatter which is extremely slow on TPU. - This is achieved by finding a "cut-off" element in the original logit, and - after thresholding the logit using this cut-off, the remaining elements - shall constitute the top-p set. - - Note: in the case of tie (i.e. multipple cut-off elements present in the - logit), all tie elements are included in the top-p set. In other words, - this function does not break ties. Instead, these tie tokens have equal - chance of being chosen during final sampling, so we can consider the tie - being broken then. - """ - probs = logits.softmax(dim=-1) - probs_sort, _ = probs.sort(dim=-1, descending=False) - - if k is not None: - top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, ) - top_k_count = top_k_count.unsqueeze(dim=1) - top_k_cutoff = probs_sort.gather(-1, top_k_count) - - # Make sure the no top-k rows are no-op. 
- no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1) - top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf")) - - elements_to_discard = probs < top_k_cutoff - logits.masked_fill_(elements_to_discard, -float("inf")) - - if p is not None: - cumprob = torch.cumsum(probs_sort, dim=-1) - top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) - top_p_mask[:, -1] = False # at least one - - top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) - top_p_cutoff = probs_sort.gather(-1, top_p_count) - elements_to_discard = probs < top_p_cutoff - logits.masked_fill_(elements_to_discard, -float("inf")) - - return logits - - def apply_top_k_top_p( logits: torch.Tensor, k: Optional[torch.Tensor], diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index e84136e3a6d07..17b83a4ba074c 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampler layer implementing TPU supported operations.""" +from typing import Optional + import torch import torch.nn as nn from vllm.v1.outputs import LogprobsTensors, SamplerOutput -from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata _SAMPLING_EPS = 1e-5 @@ -17,7 +18,6 @@ class Sampler(nn.Module): def __init__(self): # TODO(houseroad): Add support for logprobs_mode. super().__init__() - self.topk_topp_sampler = TopKTopPSampler() def forward( self, @@ -65,13 +65,17 @@ class Sampler(nn.Module): logits = self.apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. - random_sampled, _ = self.topk_topp_sampler( + logits = apply_top_k_top_p( logits, - sampling_metadata.generators, sampling_metadata.top_k, sampling_metadata.top_p, ) + # Random sample. + probs = logits.softmax(dim=-1, dtype=torch.float32) + random_sampled = self.random_sample(probs, + sampling_metadata.generators) + sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS, greedy_sampled, random_sampled) return sampled @@ -144,3 +148,66 @@ class Sampler(nn.Module): # Apply mask using boolean indexing (xla friendly) logits.masked_fill_(~valid_token_mask, -float("inf")) return logits + + def random_sample( + self, + probs: torch.Tensor, + generators: dict[int, torch.Generator], + ) -> torch.Tensor: + q = torch.empty_like(probs) + # NOTE(woosuk): To batch-process the requests without their own seeds, + # which is the common case, we first assume that every request does + # not have its own seed. Then, we overwrite the values for the requests + # that have their own seeds. + q.exponential_() + if generators: + for i, generator in generators.items(): + q[i].exponential_(generator=generator) + return probs.div_(q).argmax(dim=-1).view(-1) + + +def apply_top_k_top_p( + logits: torch.Tensor, + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + """ + Apply top-k and top-p optimized for TPU. + + This algorithm avoids using torch.scatter which is extremely slow on TPU. + This is achieved by finding a "cut-off" element in the original logit, and + after thresholding the logit using this cut-off, the remaining elements + shall constitute the top-p set. + + Note: in the case of a tie (i.e. multiple cut-off elements present in the + logit), all tie elements are included in the top-p set. In other words, + this function does not break ties. Instead, these tie tokens have equal + chance of being chosen during final sampling, so we can consider the tie + being broken then. 
+ """ + probs = logits.softmax(dim=-1) + probs_sort, _ = probs.sort(dim=-1, descending=False) + + if k is not None: + top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, ) + top_k_count = top_k_count.unsqueeze(dim=1) + top_k_cutoff = probs_sort.gather(-1, top_k_count) + + # Make sure the no top-k rows are no-op. + no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1) + top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf")) + + elements_to_discard = probs < top_k_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + if p is not None: + cumprob = torch.cumsum(probs_sort, dim=-1) + top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) + top_p_mask[:, -1] = False # at least one + + top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) + top_p_cutoff = probs_sort.gather(-1, top_p_count) + elements_to_discard = probs < top_p_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + return logits From 0661cb9df390a47dda0a5c201cf712dd2d943666 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Sun, 7 Sep 2025 01:26:48 -0700 Subject: [PATCH 013/584] Add renderer-based prompt processing for embedding and classification endpoints (#24356) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- tests/entrypoints/openai/test_truncation.py | 14 ++---- tests/entrypoints/test_renderer.py | 17 +++++++ .../openai/serving_classification.py | 13 +++--- vllm/entrypoints/openai/serving_embedding.py | 45 ++++++++----------- vllm/entrypoints/openai/serving_engine.py | 19 ++++---- vllm/entrypoints/renderer.py | 9 +++- 6 files changed, 60 insertions(+), 57 deletions(-) diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py index 121c0413e1af7..6bdf5ce7c4a6c 100644 --- a/tests/entrypoints/openai/test_truncation.py +++ b/tests/entrypoints/openai/test_truncation.py @@ -73,17 +73,11 @@ async def test_zero_truncation_size(client: openai.AsyncOpenAI): "truncate_prompt_tokens": truncation_size } - with pytest.raises(openai.BadRequestError) as err: - await client.post(path="embeddings", cast_to=object, body={**kwargs}) + response = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) - assert err.value.status_code == 400 - error_details = err.value.response.json()["error"] - - assert error_details["type"] == "BadRequestError" - assert "This model's maximum context length is" in error_details["message"] - assert "tokens in the input for embedding generation" in error_details[ - "message"] - assert "Please reduce the length of the input" in error_details["message"] + assert response["usage"]["prompt_tokens"] == truncation_size @pytest.mark.asyncio diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py index 54b5271ba67a6..1d80ea6cb4917 100644 --- a/tests/entrypoints/test_renderer.py +++ b/tests/entrypoints/test_renderer.py @@ -130,6 +130,23 @@ class TestRenderPrompt: assert call_args.kwargs["truncation"] is True assert call_args.kwargs["max_length"] == 50 + @pytest.mark.asyncio + async def test_truncation_negative(self, renderer, mock_async_tokenizer): + # Test that negative truncation uses model's max_model_len + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 7592, 2088]) # Truncated to max_model_len + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + results = await renderer.render_prompt(prompt_or_prompts="Hello world", + max_length=200, + truncate_prompt_tokens=-1) + + assert len(results) == 1 + call_args = 
mock_async_tokenizer.call_args + assert call_args.kwargs["truncation"] is True + assert call_args.kwargs["max_length"] == 100 # model's max_model_len + @pytest.mark.asyncio async def test_token_truncation_last_elements(self, renderer): # Test that token truncation keeps the last N elements diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index b4fdc36390319..98b7a206fa0cb 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -54,14 +54,11 @@ class ClassificationMixin(OpenAIServing): ctx.tokenizer = await self.engine_client.get_tokenizer( ctx.lora_request) - ( - ctx.request_prompts, - ctx.engine_prompts, - ) = await self._preprocess_completion( - ctx.request, - ctx.tokenizer, - ctx.request.input, - ) + renderer = self._get_renderer(ctx.tokenizer) + ctx.engine_prompts = await renderer.render_prompt( + prompt_or_prompts=ctx.request.input, + max_length=self.max_model_len, + truncate_prompt_tokens=ctx.request.truncate_prompt_tokens) return None diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index c6d3509afda74..c375f9e7c5064 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -24,7 +24,6 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, OpenAIServing, - RequestPrompt, ServeContext, TextTokensPrompt) # yapf: enable @@ -79,11 +78,12 @@ class EmbeddingMixin(OpenAIServing): tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request ) + renderer = self._get_renderer(tokenizer) if isinstance(ctx.request, EmbeddingChatRequest): ( _, - ctx.request_prompts, + _, ctx.engine_prompts, ) = await self._preprocess_chat( ctx.request, @@ -98,13 +98,18 @@ class EmbeddingMixin(OpenAIServing): add_special_tokens=ctx.request.add_special_tokens, ) else: - (ctx.request_prompts, - ctx.engine_prompts) = await self._preprocess_completion( - ctx.request, - tokenizer, - ctx.request.input, - add_special_tokens=ctx.request.add_special_tokens, - ) + # Set max_length based on chunked processing capability + if self._should_use_chunked_processing(ctx.request): + max_length = None + else: + max_length = self.max_embed_len or self.max_model_len + + ctx.engine_prompts = await renderer.render_prompt( + prompt_or_prompts=ctx.request.input, + max_length=max_length, + truncate_prompt_tokens=ctx.request.truncate_prompt_tokens, + add_special_tokens=ctx.request.add_special_tokens, + ) return None except (ValueError, TypeError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -286,7 +291,6 @@ class EmbeddingMixin(OpenAIServing): self, ctx: EmbeddingServeContext, engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt], - request_prompt: RequestPrompt, pooling_params: PoolingParams, trace_headers: Optional[Mapping[str, str]], prompt_index: int, @@ -295,7 +299,7 @@ class EmbeddingMixin(OpenAIServing): request_id_item = f"{ctx.request_id}-{prompt_index}" self._log_inputs(request_id_item, - request_prompt, + engine_prompt, params=pooling_params, lora_request=ctx.lora_request) @@ -353,20 +357,14 @@ class EmbeddingMixin(OpenAIServing): return self.create_error_response( "Engine prompts not available") - if ctx.request_prompts is None: - return self.create_error_response( - "Request prompts not available") - max_pos_embeddings = 
self._get_max_position_embeddings() for i, engine_prompt in enumerate(ctx.engine_prompts): - request_prompt = ctx.request_prompts[i] - # Check if this specific prompt needs chunked processing - if self._is_text_tokens_prompt(request_prompt): + if self._is_text_tokens_prompt(engine_prompt): # Cast to TextTokensPrompt since we've verified # prompt_token_ids - text_tokens_prompt = cast(TextTokensPrompt, request_prompt) + text_tokens_prompt = cast(TextTokensPrompt, engine_prompt) if (len(text_tokens_prompt["prompt_token_ids"]) > max_pos_embeddings): # Use chunked processing for this prompt @@ -382,8 +380,7 @@ class EmbeddingMixin(OpenAIServing): Union[EngineTokensPrompt, EngineEmbedsPrompt], engine_prompt) generator = await self._create_single_prompt_generator( - ctx, engine_prompt_typed, request_prompt, pooling_params, - trace_headers, i) + ctx, engine_prompt_typed, pooling_params, trace_headers, i) generators.append(generator) from vllm.utils import merge_async_iterators @@ -419,10 +416,6 @@ class EmbeddingMixin(OpenAIServing): if not use_chunked: return await super()._collect_batch(ctx=ctx) - if ctx.request_prompts is None: - return self.create_error_response( - "Request prompts not available") - if ctx.result_generator is None: return self.create_error_response( "Result generator not available") @@ -538,7 +531,7 @@ class EmbeddingMixin(OpenAIServing): data=final_embedding) # Get original prompt token IDs for this prompt - original_prompt = ctx.request_prompts[prompt_idx] + original_prompt = ctx.engine_prompts[prompt_idx] if not self._is_text_tokens_prompt(original_prompt): return self.create_error_response( f"Chunked prompt {prompt_idx} is not a " diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index a218f6882f8ca..1a2236de4fa42 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -368,23 +368,20 @@ class OpenAIServing: for i, engine_prompt in enumerate(ctx.engine_prompts): request_id_item = f"{ctx.request_id}-{i}" - if ctx.request_prompts is None: - return self.create_error_response( - "Request prompts not available") - - self._log_inputs( - request_id_item, - ctx.request_prompts[i], - params=pooling_params, - lora_request=ctx.lora_request, - ) - # Mypy has an existing bug related to inferring the variance of # TypedDicts with `builtins.enumerate`: # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 engine_prompt = cast( Union[EngineTokensPrompt, EngineEmbedsPrompt], engine_prompt) + + self._log_inputs( + request_id_item, + engine_prompt, + params=pooling_params, + lora_request=ctx.lora_request, + ) + generator = self.engine_client.encode( engine_prompt, pooling_params, diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 29200dda8998a..d3f3a8cfa5aa9 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -108,10 +108,15 @@ class CompletionRenderer(BaseRenderer): for detailed parameter documentation. """ if truncate_prompt_tokens is not None: - if max_length is not None: - assert 0 <= truncate_prompt_tokens <= max_length if truncate_prompt_tokens == 0: return [] + if truncate_prompt_tokens < 0: + truncate_prompt_tokens = self.model_config.max_model_len + if max_length is not None and truncate_prompt_tokens > max_length: + raise ValueError( + f"truncate_prompt_tokens ({truncate_prompt_tokens}) " + f"cannot be greater than max_length ({max_length}). 
" + f"Please select a smaller truncation size.") # Parse and batch the input prompts batch_inputs = parse_and_batch_prompt(prompt_or_prompts) From 2e5d21378db8226b04bb76f3e7119a9ddd0239df Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 7 Sep 2025 09:38:35 -0700 Subject: [PATCH 014/584] Skip MM Encoder for non-first PP ranks (#24387) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0224f3944f005..549c5dd2bbb2c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1620,14 +1620,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order - if self.supports_mm_inputs: + if self.supports_mm_inputs and get_pp_group().is_first_rank: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) - else: - mm_embeds = [] - if self.supports_mm_inputs and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. From 795b6951cd6a18c2d78bcb1d188f112357d19262 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Sun, 7 Sep 2025 17:30:27 -0700 Subject: [PATCH 015/584] Add @luccafong to codeowner for spec decode (#24397) Signed-off-by: Lu Fang --- .github/CODEOWNERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 682b27ac8986e..b6b3e184bff25 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -28,7 +28,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett -/vllm/v1/spec_decode @benchislett +/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/triton_attn.py @tdoublep # Test ownership @@ -70,6 +70,9 @@ mkdocs.yaml @hmellor /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow /vllm/model_executor/models/qwen* @sighingnow +# MTP-specific files +/vllm/model_executor/models/deepseek_mtp.py @luccafong + # Mistral-specific files /vllm/model_executor/models/mistral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten From 86173ad593ff2d0c699230c4c2cd3ee5715d6e5d Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Sun, 7 Sep 2025 18:27:12 -0700 Subject: [PATCH 016/584] [Kernel] Support decode context parallelism on Blackwell with CUTLASS MLA (#24385) Signed-off-by: Ming Yang Signed-off-by: youkaichao Co-authored-by: youkaichao --- .../attention/mla/sm100_cutlass_mla_kernel.cu | 17 +++++++--- csrc/torch_bindings.cpp | 8 ++--- tests/kernels/test_cutlass_mla_decode.py | 32 +++++++++++++------ vllm/_custom_ops.py | 6 ++-- vllm/v1/attention/backends/mla/cutlass_mla.py | 32 +++++++++++++------ 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index 820bf81dd1a02..c60f1823b8a1d 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -36,6 +36,7 @@ limitations under 
the License. #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040 void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -99,6 +100,7 @@ struct MlaSm100 { template typename T::Fmha::Arguments args_from_options( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options( stride_PT, page_count_total, page_size}, - {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, + {static_cast(out.data_ptr()), + stride_O, + static_cast(lse.defined() ? lse.data_ptr() : nullptr), + stride_LSE}, hw_info, // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will @@ -181,6 +186,7 @@ typename T::Fmha::Arguments args_from_options( template void runMla( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -192,7 +198,7 @@ void runMla( cudaStream_t stream) { using MlaSm100Type = MlaSm100; typename MlaSm100Type::Fmha fmha; - auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); + auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); CUTLASS_CHECK(fmha.can_implement(arguments)); @@ -214,6 +220,7 @@ void runMla( void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -234,13 +241,13 @@ void sm100_cutlass_mla_decode( DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] { if (in_dtype == at::ScalarType::Half) { runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::BFloat16) { runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else { TORCH_CHECK(false, "Unsupported input data type of MLA"); } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 95fb5b197f534..d3f50d1076cb0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -516,10 +516,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // SM100 CUTLASS MLA decode ops.def( - "sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," - " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," - " Tensor page_table, Tensor workspace, float " - "scale," + "sm100_cutlass_mla_decode(Tensor! out, Tensor! 
lse, Tensor q_nope," + " Tensor q_pe, Tensor kv_c_and_k_pe_cache," + " Tensor seq_lens, Tensor page_table," + " Tensor workspace, float scale," " int num_kv_splits) -> ()"); // conditionally compiled so impl in source file diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 85984324b1967..820dac0e6cec9 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random +from typing import Optional import pytest import torch @@ -14,14 +15,20 @@ from vllm.triton_utils import triton def cal_diff(x: torch.Tensor, y: torch.Tensor, name: str, - use_fp8: bool = False) -> None: + use_fp8: bool = False, + diff_threshold: Optional[float] = None) -> None: x, y = x.double(), y.double() cos_diff = 1 - 2 * (x * y).sum().item() / max( (x * x + y * y).sum().item(), 1e-12) - if (use_fp8): - assert cos_diff < 1e-4 + if diff_threshold is not None: + # directly compare the cos_diff with the threshold + assert cos_diff < diff_threshold else: - assert cos_diff < 1e-5 + # use the default threshold + if (use_fp8): + assert cos_diff < 1e-4 + else: + assert cos_diff < 1e-5 CUTLASS_MLA_UNSUPPORTED_REASON = \ @@ -118,11 +125,13 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, dtype=torch.uint8) out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype) - - ops.sm100_cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache_flat, - cache_seqlens, block_table, workspace, - scale, 1) - return out_ans[:, :h_q].contiguous() + output_lse = torch.empty((b, MAX_HEADS), + dtype=torch.float32, + device=q_nope.device) + ops.sm100_cutlass_mla_decode(out_ans, output_lse, q_nope, q_pe, + kv_cache_flat, cache_seqlens, block_table, + workspace, scale, 1) + return out_ans[:, :h_q].contiguous(), output_lse[:, :h_q].contiguous() def scaled_dot_product_attention(query, key, value, is_causal=False): query = query.float() @@ -165,11 +174,14 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, lse[i] = lse_i return out, lse - out_cutlass = cutlass_mla() + out_cutlass, lse_cutlass = cutlass_mla() out_torch, lse_torch = ref_mla() # Extract the single token (s_q=1) slice to match cutlass output shape out_torch_slice = out_torch[:, 0, :, :] # [b, h_q, dv] + lse_torch_slice = lse_torch[:, 0, :] # [b, h_q] cal_diff(out_cutlass, out_torch_slice, "out", use_fp8) + # lse has larger numerical error, so use a larger threshold + cal_diff(lse_cutlass, lse_torch_slice, "lse", use_fp8, diff_threshold=1e-3) t = triton.testing.do_bench(cutlass_mla) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 545f4cb48bf47..6e9a8df0a56a2 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1833,13 +1833,13 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, return out -def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, - q_pe: torch.Tensor, +def sm100_cutlass_mla_decode(out: torch.Tensor, lse: torch.Tensor, + q_nope: torch.Tensor, q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, seq_lens: torch.Tensor, page_table: torch.Tensor, workspace: torch.Tensor, scale: float, num_kv_splits: int) -> torch.Tensor: - torch.ops._C.sm100_cutlass_mla_decode(out, q_nope, q_pe, + torch.ops._C.sm100_cutlass_mla_decode(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, scale, num_kv_splits) diff --git 
a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 95dce8d8e2eef..6017445402eca 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -76,6 +76,7 @@ g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): + can_return_lse_for_decode: bool = True def __init__( self, @@ -138,7 +139,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): workspace: torch.Tensor, sm_scale: float, num_kv_splits: int, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, torch.Tensor]: assert (q_nope.ndim == 3 ), f"q_nope must be a 3D tensor, but got {q_nope.ndim}" assert ( @@ -193,9 +194,13 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): dtype = (torch.bfloat16 if is_quantized_kv_cache(self.kv_cache_dtype) else q_nope.dtype) out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype) + lse = (torch.empty( + (B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device) + if self.need_to_return_lse_for_decode else torch.Tensor()) ops.sm100_cutlass_mla_decode( out, + lse, q_nope, q_pe, kv_c_and_k_pe_cache, @@ -205,7 +210,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): sm_scale, num_kv_splits, ) - return out[:, :H].contiguous() + returned_lse = lse[:, :H].contiguous( + ) if self.need_to_return_lse_for_decode else lse + return out[:, :H].contiguous(), returned_lse def _sm100_forward_decode( self, @@ -213,7 +220,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None @@ -226,13 +233,18 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): q_nope = q_nope.clone() q_pe = q_pe.clone() - o = self._sm100_cutlass_mla_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata.decode.seq_lens, - attn_metadata.decode.block_table, - self._workspace.get_buf(), - self.scale, self._num_kv_splits) + o, lse = self._sm100_cutlass_mla_decode( + q_nope, + q_pe, + kv_c_and_k_pe_cache, + attn_metadata.decode.seq_lens, + attn_metadata.decode.block_table, + self._workspace.get_buf(), + self.scale, + self._num_kv_splits, + ) - return o + return o, (lse if self.need_to_return_lse_for_decode else None) # TODO: Currently we leave it here only for backup in case something is # wrong with the new SM100 CUTLASS MLA kernel @@ -286,4 +298,4 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): attn_metadata), None return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata), None + attn_metadata) From 67841317d11653febdb321d7748f3d7e0c242d64 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Mon, 8 Sep 2025 10:07:16 +0800 Subject: [PATCH 017/584] [xpu] upgrade ipex/python3.12 for xpu (#23830) Signed-off-by: Yan Ma --- docker/Dockerfile.xpu | 33 ++++++++++++------- .../installation/gpu/xpu.inc.md | 19 +++++------ requirements/xpu.txt | 7 ++-- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 65d2e5036b783..ef422352509a9 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -1,12 +1,10 @@ FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base -RUN rm /etc/apt/sources.list.d/intel-graphics.list +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | 
gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + add-apt-repository -y ppa:kobuk-team/intel-graphics RUN apt clean && apt-get update -y && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get install -y python3.10 python3.10-distutils && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ ffmpeg \ @@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \ libgl1 \ lsb-release \ numactl \ - python3.10-dev \ - wget + wget \ + vim \ + python3.12 \ + python3.12-dev \ + python3-pip +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 +RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing + +RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh +RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc +SHELL ["bash", "-c"] +CMD ["bash", "-c", "source /root/.bashrc && exec bash"] WORKDIR /workspace/vllm COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt COPY requirements/common.txt /workspace/vllm/requirements/common.txt +# suppress the python externally managed environment error +RUN python3 -m pip config set global.break-system-packages true + RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ -r requirements/xpu.txt @@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope -ENV VLLM_USAGE_SOURCE production-docker-image \ - TRITON_XPU_PROFILE 1 +RUN --mount=type=cache,target=/root/.cache/pip \ + pip uninstall oneccl oneccl-devel -y + # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md index b77c4e00cf0c4..ed1dc0418cf7e 100644 --- a/docs/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -3,13 +3,16 @@ vLLM initially supports basic model inference and serving on Intel GPU platform. !!! warning - There are no pre-built wheels or images for this device, so you must build vLLM from source. + There are no pre-built wheels for this device, so you need to build vLLM from source. Alternatively, you can use pre-built images, which are based on released vLLM versions. # --8<-- [end:installation] # --8<-- [start:requirements] - Supported Hardware: Intel Data Center GPU, Intel ARC GPU -- OneAPI requirements: oneAPI 2025.0 +- OneAPI requirements: oneAPI 2025.1 +- Python: 3.12 +!!! warning + The provided IPEX wheel is specific to Python 3.12, so this version is a MUST. # --8<-- [end:requirements] # --8<-- [start:set-up-using-python] @@ -24,7 +27,7 @@ Currently, there are no pre-built XPU wheels. 
# --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later. +- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later. - Second, install Python packages for vLLM XPU backend building: ```bash pip install -v -r requirements/xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -!!! note - - FP16 is the default data type in the current XPU backend. The BF16 data - type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. - # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] -Currently, there are no pre-built XPU images. +Currently, we release prebuilt XPU images on Docker [Hub](https://hub.docker.com/r/intel/vllm/tags) based on released vLLM versions. For more information, please refer to the release [notes](https://github.com/intel/ai-containers/blob/main/vllm). # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] docker run -it \ # --8<-- [end:build-image-from-source] # --8<-- [start:supported-features] -XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following: +XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on a single node with mp as the backend. For example, a reference execution looks like the following: ```bash python -m vllm.entrypoints.openai.api_server \ --model=facebook/opt-13b \ --dtype=bfloat16 \ --max_model_len=1024 \ - --distributed-executor-backend=ray \ + --distributed-executor-backend=mp \ --pipeline-parallel-size=2 \ -tp=8 ``` diff --git a/requirements/xpu.txt b/requirements/xpu.txt index c44a2a9c74e50..74f5b05b2382a 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -11,10 +11,9 @@ jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.60.0 # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding nixl==0.3.0 # for PD disaggregation ---extra-index-url=https://download.pytorch.org/whl/xpu torch==2.8.0+xpu torchaudio torchvision -pytorch-triton-xpu ---extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -intel-extension-for-pytorch==2.8.10+xpu +--extra-index-url=https://download.pytorch.org/whl/xpu + +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl From b3d7e3c845d3f4905e9a95776c7d605c743f0ea1 Mon Sep 17 00:00:00 2001 From: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Date: Sun, 7 Sep 2025 19:34:31 -0700 Subject: [PATCH 018/584] [Sampler] Support returning all prompt logprobs (#23868) Signed-off-by: Xingyu Liu Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Cyrus Leung --- tests/v1/sample/test_logprobs.py | 12 ++++++++++-- vllm/sampling_params.py | 11 +++++++---- vllm/v1/engine/processor.py | 28 ++++++++++++++++++---------- vllm/v1/worker/gpu_input_batch.py | 5 +++-- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index e835c029634ce..570e330208a39 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -430,7 +430,7 @@ def test_zero_logprobs(vllm_model, example_prompts, def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): - """Engine should return all vocabulary logprobs + """Engine should return all vocabulary logprobs and prompt logprobs Args: example_prompts: list of example prompts (test fixture) @@ -444,16 +444,24 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): # 2 other llms alive during whole session gpu_memory_utilization=0.15, max_model_len=256) + sampling_params_logprobs_all = SamplingParams(max_tokens=5, - logprobs=-1) + logprobs=-1, + prompt_logprobs=-1) results_logprobs_all = runner.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_all) vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() + for i in range(len(results_logprobs_all)): logprobs = results_logprobs_all[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_all[i].prompt_logprobs assert logprobs is not None for logprob in logprobs: assert len(logprob) == vocab_size + assert prompt_logprobs is not None + assert prompt_logprobs[0] is None + for prompt_logprob in prompt_logprobs[1:]: + assert len(prompt_logprob) == vocab_size @pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index c7b4ba34c602e..fe93e906064e4 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -165,7 +165,8 @@ class SamplingParams( the sampled token, so there may be up to `logprobs+1` elements in the response. When set to -1, return all `vocab_size` log probabilities.""" prompt_logprobs: Optional[int] = None - """Number of log probabilities to return per prompt token.""" + """Number of log probabilities to return per prompt token. + When set to -1, return all `vocab_size` log probabilities.""" # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. 
@@ -409,9 +410,11 @@ class SamplingParams( and self.logprobs < 0): raise ValueError( f"logprobs must be non-negative or -1, got {self.logprobs}.") - if self.prompt_logprobs is not None and self.prompt_logprobs < 0: - raise ValueError(f"prompt_logprobs must be non-negative, got " - f"{self.prompt_logprobs}.") + if (self.prompt_logprobs is not None and self.prompt_logprobs != -1 + and self.prompt_logprobs < 0): + raise ValueError( + f"prompt_logprobs must be non-negative or -1, got " + f"{self.prompt_logprobs}.") if (self.truncate_prompt_tokens is not None and (self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1)): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 1aa117ded4ed8..baade243140d5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -65,19 +65,27 @@ class Processor: ) -> None: max_logprobs = self.model_config.max_logprobs if max_logprobs == -1: - return + max_logprobs = self.model_config.get_vocab_size() + # Validate sample logprobs. - if params.logprobs and (params.logprobs == -1 - or params.logprobs > max_logprobs): - raise ValueError( - f"Requested sample logprobs of {params.logprobs}, " - f"which is greater than max allowed: {max_logprobs}") + if params.logprobs: + num_logprobs = params.logprobs + if num_logprobs == -1: + num_logprobs = self.model_config.get_vocab_size() + if num_logprobs > max_logprobs: + raise ValueError( + f"Requested sample logprobs of {num_logprobs}, " + f"which is greater than max allowed: {max_logprobs}") # Validate prompt logprobs. - if params.prompt_logprobs and params.prompt_logprobs > max_logprobs: - raise ValueError( - f"Requested prompt logprobs of {params.prompt_logprobs}, " - f"which is greater than max allowed: {max_logprobs}") + if params.prompt_logprobs: + num_prompt_logprobs = params.prompt_logprobs + if num_prompt_logprobs == -1: + num_prompt_logprobs = self.model_config.get_vocab_size() + if num_prompt_logprobs > max_logprobs: + raise ValueError( + f"Requested prompt logprobs of {num_prompt_logprobs}, " + f"which is greater than max allowed: {max_logprobs}") def _validate_sampling_params( self, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 83fc821b84946..bf9b16575e609 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -360,8 +360,9 @@ class InputBatch: if sampling_params.logprobs == -1 else sampling_params.logprobs) if sampling_params.prompt_logprobs is not None: - self.num_prompt_logprobs[ - req_id] = sampling_params.prompt_logprobs + self.num_prompt_logprobs[req_id] = ( + self.vocab_size if sampling_params.prompt_logprobs == -1 + else sampling_params.prompt_logprobs) if sampling_params.allowed_token_ids: self.has_allowed_token_ids.add(req_id) From 3a3e91bdfe86adcb8fceb1cb8c5f883878fc65b4 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 7 Sep 2025 19:51:59 -0700 Subject: [PATCH 019/584] [CI/Build] Disable flaky test_structured_output tests (#24404) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index c10b1abb2b3b7..126d8ce8c8e00 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -46,12 +46,12 @@ 
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), + #FIXME: These tests are flaky on CI and thus disabled. Tracking in Issue #24402 + # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None), + # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), + #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG), - #FIXME: This test is flaky on CI thus disabled - #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", NGRAM_SPEC_CONFIG), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG), From 3bca396f79cf3ba248ee369fe73533255006b2b7 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Sun, 7 Sep 2025 20:31:35 -0700 Subject: [PATCH 020/584] [CI/Build] Fix local image inputs in test_pixtral.py (#24401) Signed-off-by: Chenheli Hua Co-authored-by: Roger Wang --- .../multimodal/generation/test_pixtral.py | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index f95dbc7547ecc..a4e21aface41f 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -187,27 +187,19 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str, name_1="output") -@pytest.fixture -def prompt(request, local_asset_server) -> TextPrompt: - names = request.param - urls = [local_asset_server.url_for(n) for n in names] - return _create_engine_inputs_hf(urls) - - @pytest.mark.parametrize( - "prompt,expected_ranges", - [ - pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), - pytest.param(IMG_URLS[1:4], [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ]) - ], -) -def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt, + "image_urls,expected_ranges", + [(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), + (IMG_URLS[1:4], [ + PlaceholderRange(offset=11, length=266), + PlaceholderRange(offset=277, length=1056), + PlaceholderRange(offset=1333, length=418) + ])]) +def test_multi_modal_placeholders(vllm_runner, image_urls: list[str], expected_ranges: list[PlaceholderRange], - monkeypatch) -> None: + local_asset_server, monkeypatch) -> None: + local_image_urls = [local_asset_server.url_for(u) for u in image_urls] + prompt = _create_engine_inputs_hf(local_image_urls) # This placeholder checking test only works with V0 engine # where `multi_modal_placeholders` is returned with `RequestOutput` From 8c892b1831a8a35fbf5a89fedfaae0ab9bef2dd0 Mon Sep 17 00:00:00 2001 From: Al-Ekram Elahee Hridoy Date: Sun, 7 Sep 2025 23:33:52 -0600 Subject: [PATCH 021/584] [Doc] Fix UTF-8 encoding issues in documentation generation on Windows (#24361) Signed-off-by: alekramelaheehridoy Signed-off-by: alekramelaheehridoy Co-authored-by: alekramelaheehridoy --- docs/mkdocs/hooks/generate_argparse.py | 3 ++- docs/mkdocs/hooks/generate_examples.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git 
a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 051a2d904406d..91454ec272b81 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -165,6 +165,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Generate documentation for each parser for stem, parser in parsers.items(): doc_path = ARGPARSE_DOC_DIR / f"{stem}.md" - with open(doc_path, "w") as f: + # Specify encoding for building on Windows + with open(doc_path, "w", encoding="utf-8") as f: f.write(parser.format_help()) logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR)) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 881df791698e2..ac2101daac2ef 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -106,7 +106,8 @@ class Example: def determine_title(self) -> str: if not self.is_code: - with open(self.main_file) as f: + # Specify encoding for building on Windows + with open(self.main_file, encoding="utf-8") as f: first_line = f.readline().strip() match = re.match(r'^#\s+(?P.+)$', first_line) if match: @@ -174,6 +175,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): doc_path = EXAMPLE_DOC_DIR / example.category / example_name if not doc_path.parent.exists(): doc_path.parent.mkdir(parents=True) - with open(doc_path, "w+") as f: + # Specify encoding for building on Windows + with open(doc_path, "w+", encoding="utf-8") as f: f.write(example.generate()) logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) From 61aa4b2901fa2a1376bcd781da253d83526e3090 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Mon, 8 Sep 2025 14:07:00 +0800 Subject: [PATCH 022/584] [P/D] Add a shutdown method to the Connector API (#22699) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- vllm/distributed/kv_transfer/__init__.py | 7 ++++--- .../kv_transfer/kv_connector/v1/base.py | 8 ++++++++ .../distributed/kv_transfer/kv_transfer_state.py | 7 +++++++ vllm/executor/executor_base.py | 2 +- vllm/v1/core/sched/scheduler.py | 2 ++ vllm/v1/executor/multiproc_executor.py | 16 +++++++++------- vllm/v1/worker/gpu_worker.py | 3 +++ .../v1/worker/kv_connector_model_runner_mixin.py | 8 +++++++- vllm/v1/worker/tpu_worker.py | 3 +++ vllm/worker/worker_base.py | 8 ++++++++ 10 files changed, 52 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index fa9b7e4f14c02..cf58e7914972c 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_transfer_state import ( - KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, - has_kv_transfer_group, is_v1_kv_transfer_group) + KVConnectorBaseType, ensure_kv_transfer_initialized, + ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group, + is_v1_kv_transfer_group) __all__ = [ "get_kv_transfer_group", "has_kv_transfer_group", "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", - "KVConnectorBaseType" + "ensure_kv_transfer_shutdown", "KVConnectorBaseType" ] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 2804003f5a708..f3f493144d283 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -226,6 +226,14 @@ class KVConnectorBase_V1(ABC): """ return None, None + def shutdown(self): + """ + Shutdown the connector. This is called when the worker process + is shutting down to ensure that all the async operations are + completed and the connector is cleaned up properly. + """ + return None + # ============================== # Scheduler-side methods # ============================== diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 5e0f64fca220c..d5747bed92771 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -64,3 +64,10 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: config=vllm_config, role=KVConnectorRole.WORKER) else: raise ValueError("V0 is no longer supported") + + +def ensure_kv_transfer_shutdown() -> None: + global _KV_CONNECTOR_AGENT + if _KV_CONNECTOR_AGENT is not None: + _KV_CONNECTOR_AGENT.shutdown() + _KV_CONNECTOR_AGENT = None diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 813232cd19281..a3c1d79a58b26 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -231,7 +231,7 @@ class ExecutorBase(ABC): def shutdown(self) -> None: """Shutdown the executor.""" - return + self.collective_rpc("shutdown") def __del__(self): self.shutdown() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 31f7e9c70f8b3..2d40e96632c95 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1188,6 +1188,8 @@ class Scheduler(SchedulerInterface): def shutdown(self) -> None: if self.kv_event_publisher: self.kv_event_publisher.shutdown() + if self.connector is not None: + self.connector.shutdown() ######################################################################## # KV Connector Related Methods diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ef6303495c245..c3d6c20e22e2a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing -import os import pickle import queue import signal @@ -507,6 +506,7 @@ class WorkerProc: return cast(list[WorkerProcHandle], ready_proc_handles) def shutdown(self): + self.worker.shutdown() self.rpc_broadcast_mq = None self.worker_response_mq = None destroy_model_parallel() @@ -536,7 +536,7 @@ class WorkerProc: # tuple[Connection, Connection] reader, ready_writer = kwargs.pop("ready_pipe") death_pipe = kwargs.pop("death_pipe", None) - + shutdown_event = threading.Event() # Start death monitoring thread if death_pipe is provided if death_pipe is not None: @@ -548,7 +548,7 @@ class WorkerProc: # Parent process has exited, terminate this worker logger.info("Parent process exited, terminating worker") # Send signal to self to trigger clean shutdown - os.kill(os.getpid(), signal.SIGTERM) + shutdown_event.set() except Exception as e: logger.warning("Death monitoring error: %s", e) @@ -576,7 +576,7 @@ class WorkerProc: ready_writer.close() ready_writer = None - worker.worker_busy_loop() + worker.worker_busy_loop(cancel=shutdown_event) except Exception: # NOTE: if an Exception arises in busy_loop, we send @@ -586,6 +586,8 @@ class 
WorkerProc: if ready_writer is not None: logger.exception("WorkerProc failed to start.") + elif shutdown_event.is_set(): + logger.info("WorkerProc shutting down.") else: logger.exception("WorkerProc failed.") @@ -637,11 +639,11 @@ class WorkerProc: output = self.async_output_queue.get() self.enqueue_output(output) - def worker_busy_loop(self): + def worker_busy_loop(self, cancel: Optional[threading.Event] = None): """Main busy loop for Multiprocessing Workers""" while True: - method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue() - + method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue( + cancel=cancel) try: if isinstance(method, str): func = getattr(self.worker, method) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6a3bc5d46df27..726f59603437f 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -601,6 +601,9 @@ class Worker(WorkerBase): self.model_runner.save_tensorized_model( tensorizer_config=tensorizer_config, ) + def shutdown(self) -> None: + self.model_runner.ensure_kv_transfer_shutdown() + def init_worker_distributed_environment( vllm_config: VllmConfig, diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index e2ffa2f12fda5..67bb967d2edfa 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -9,7 +9,8 @@ from typing import Generator # noqa: UP035 from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig -from vllm.distributed.kv_transfer import (get_kv_transfer_group, +from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown, + get_kv_transfer_group, has_kv_transfer_group) from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.forward_context import get_forward_context, set_forward_context @@ -42,6 +43,11 @@ class KVConnectorModelRunnerMixin: # Do this here to save a collective_rpc. kv_connector.start_load_kv(get_forward_context()) + @staticmethod + def ensure_kv_transfer_shutdown() -> None: + if has_kv_transfer_group(): + ensure_kv_transfer_shutdown() + @staticmethod def maybe_wait_for_kv_save() -> None: if has_kv_transfer_group(): diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 3f4e3ecbd4e26..fc72b954df9cf 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -330,6 +330,9 @@ class TPUWorker: ensure_kv_transfer_initialized(vllm_config) + def shutdown(self) -> None: + self.model_runner.ensure_kv_transfer_shutdown() + if USE_TPU_COMMONS: from tpu_commons.worker import TPUWorker as TPUCommonsWorker diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index a1fa7f2cf7a2e..aa76d21f0fcaa 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -129,6 +129,10 @@ class WorkerBase: """Get vocabulary size from model configuration.""" return self.model_config.get_vocab_size() + def shutdown(self) -> None: + """Clean up resources held by the worker.""" + return + class DelegateWorkerBase(WorkerBase): """ @@ -519,6 +523,10 @@ class WorkerWrapperBase: from vllm.utils import init_cached_hf_modules init_cached_hf_modules() + def shutdown(self) -> None: + if self.worker is not None: + self.worker.shutdown() + def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: """ Adjust the rpc_rank based on the given mapping. 
From 8a4660260624eb34c365da34e4962e6ed15bbc0b Mon Sep 17 00:00:00 2001 From: Chatcharin Sangbutsarakum <67754293+what-in-the-nim@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:10:54 +0700 Subject: [PATCH 023/584] [Model] Remove unnecessary CUDA sync of GLM-4.1V image and video preprocess (#24332) Signed-off-by: Win <chatcharinsang@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/glm4_1v.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index fd5fecac67d67..055cab901361f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1429,6 +1429,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, self, image_input: Glm4vImageInputs) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"].type(self.visual.dtype) @@ -1443,13 +1444,15 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = self.visual(pixel_values, grid_thw=grid_thw.tolist()) merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + return image_embeds.split(sizes) def _process_video_input( self, video_input: Glm4vVideoInputs) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"].type(self.visual.dtype) @@ -1466,8 +1469,9 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw=grid_thw.tolist()) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} From 60f0843ef8fb4b0c4e6788acc042873a0a2ea2a1 Mon Sep 17 00:00:00 2001 From: Chatcharin Sangbutsarakum <67754293+what-in-the-nim@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:11:12 +0700 Subject: [PATCH 024/584] [Model] Remove unnecessary CUDA sync of Qwen2VL image and video preprocess (#24334) Signed-off-by: Win <chatcharinsang@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/qwen2_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f00b214b1ef18..90a1ad2a658ab 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1218,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"] @@ -1227,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each image item. 
merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return image_embeds.split(sizes.tolist()) + return image_embeds.split(sizes) def _process_video_input( self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"] @@ -1245,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return video_embeds.split(sizes.tolist()) + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: modalities = {} From 425b04b8f47ec2a7fe12a7e379e3b45e0153a504 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Mon, 8 Sep 2025 14:49:52 +0800 Subject: [PATCH 025/584] [gpt-oss][Responses API] Fix the function call id format (#24409) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- vllm/entrypoints/harmony_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index d1ff06425fcb3..a3693ce60e4e6 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -273,7 +273,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: call_id=f"call_{random_id}", type="function_call", name=function_name, - id=f"ft_{random_id}", + id=f"fc_{random_id}", ) output_items.append(response_item) elif recipient is not None and (recipient.startswith("python") From 2f0b833a05190a71e837f0b6d3e2a01c25c86e73 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Mon, 8 Sep 2025 15:19:40 +0800 Subject: [PATCH 026/584] [Docs] Fix a tip indentation and typo (#24419) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/contributing/profiling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index dffd62385e017..5b83d93274f0d 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -19,7 +19,7 @@ When using `vllm bench serve`, you can enable profiling by passing the `--profil Traces can be visualized using <https://ui.perfetto.dev/>. !!! tip -You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`. + You can directly call bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`. !!! tip Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. 
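The two CUDA-sync patches above (#24332, #24334) share one idea: derive the embedding split sizes from a host-side copy of `grid_thw` instead of reducing the device tensor. A small self-contained before/after with toy shapes is sketched below; in the real models `grid_thw` lives on CUDA, which is where the per-input `.tolist()` sync matters.

```python
import torch

# Toy stand-ins: three images with (t, h, w) patch grids, and their
# concatenated embeddings after 2x2 spatial merging.
grid_thw = torch.tensor([[1, 4, 4], [1, 2, 4], [1, 4, 2]])  # CUDA in vLLM
merge_size = 2
embeds = torch.randn(8, 16)  # 16/4 + 8/4 + 8/4 = 8 merged patches

# Before: reducing the (possibly CUDA) tensor and calling .tolist() on the
# result forces a device-to-host sync for every image/video input.
sizes_old = (grid_thw.prod(-1) // merge_size // merge_size).tolist()

# After: snapshot grid_thw to Python once, do the arithmetic on a CPU
# tensor, and hand .split() plain ints, with no extra sync per input.
grid_thw_list = grid_thw.tolist()
sizes_new = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
             (merge_size * merge_size)).tolist()

assert sizes_old == sizes_new == [4, 2, 2]
assert [c.shape[0] for c in embeds.split(sizes_new)] == [4, 2, 2]
```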
From f4962a6d55a340ebb569d377c842deff7611d8f7 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Mon, 8 Sep 2025 09:22:16 +0200 Subject: [PATCH 027/584] [Doc]: fix typos in Python comments (#24417) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- examples/offline_inference/chat_with_tools.py | 2 +- vllm/attention/backends/mla/common.py | 2 +- vllm/config/__init__.py | 2 +- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/multiprocessing/client.py | 2 +- vllm/entrypoints/openai/cli_args.py | 2 +- .../openai/tool_parsers/llama4_pythonic_tool_parser.py | 2 +- vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py | 2 +- vllm/model_executor/layers/fused_moe/modular_kernel.py | 2 +- vllm/model_executor/layers/quantization/utils/marlin_utils.py | 2 +- vllm/v1/worker/block_table.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 6e56e24f2092c..3a95b1fdfbabc 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools) print(outputs[0].outputs[0].text.strip()) # yields -# 'The weather in Dallas, TX is 85 degrees fahrenheit. ' +# 'The weather in Dallas, TX is 85 degrees Fahrenheit. ' # 'It is partly cloudly, with highs in the 90's.' diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 3b9037521168e..789393eb39a73 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1052,7 +1052,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): return layer.weight # we currently do not have quantized bmm's which are needed for - # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T assert kv_b_proj_weight.shape == ( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 063af69f41dad..f6f1838aedfc2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1169,7 +1169,7 @@ class ModelConfig: ] # Any custom overrides will be in quantization_methods so we place # them at the start of the list so custom overrides have preference - # over the built in ones. + # over the built-in ones. quantization_methods = quantization_methods + overrides # Detect which checkpoint is it diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index c2f73fa281555..20d1e31a7106b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -770,7 +770,7 @@ class NixlConnectorWorker: # with joint KV for each block. This minimizes the overhead in # registerMem allowing faster descs queries. In order to be able to # split on kv_heads dim as required by heterogeneous TP, one must - # be able to index K/V separately. Hence the we double the number + # be able to index K/V separately. Hence we double the number # of 'virtual' regions here and halve `block_len` below. 
self.num_regions *= 2 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fdd25a2f9ce2f..bee97f4cd04d8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1159,7 +1159,7 @@ class EngineArgs: # Note(hc): In the current implementation of decode context # parallel(DCP), tp_size needs to be divisible by dcp_size, # because the world size does not change by dcp, it simply - # reuse the GPUs of TP group, and split one TP group into + # reuses the GPUs of TP group, and split one TP group into # tp_size//dcp_size DCP groups. assert self.tensor_parallel_size % self.decode_context_parallel_size \ == 0, ( diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0beb9c8cc0b97..7d1f29a9824d7 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -235,7 +235,7 @@ class MQLLMEngineClient(EngineClient): # therefore we have to inform that the current # processed requests failed as well. Send back a dead # engine error give this feedback and also give a - # 'hint' to the server to shutdown next. + # 'hint' to the server to shut down next. exception = self.dead_error if request_id is None: diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index d0b5d013eb9e5..7e1df795fb056 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -204,7 +204,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" frontend_kwargs["lora_modules"]["type"] = optional_type(str) frontend_kwargs["lora_modules"]["action"] = LoRAParserAction - # Special case: Middleware needs append action + # Special case: Middleware needs to append action frontend_kwargs["middleware"]["action"] = "append" frontend_kwargs["middleware"]["type"] = str if "nargs" in frontend_kwargs["middleware"]: diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index 6bf44a4345a9d..9a9a19ce2188e 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser): index] += delta.function.arguments # HACK: serving_chat.py inspects the internal state of tool parsers - # when determining it's final streaming delta, automatically + # when determining its final streaming delta, automatically # adding autocompleted JSON. # These two lines avoid that nonsense while ensuring finish_reason # is set to tool_calls when at least one tool is called. diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index c0691f122904e..e6b300fd84e94 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -143,7 +143,7 @@ class MistralToolParser(ToolParser): except json.JSONDecodeError: # use a regex to find the part corresponding to the tool call. # NOTE: This use case should not happen if the model is trained - # correctly. It's a easy possible fix so it's included, but + # correctly. 
It's an easy possible fix so it's included, but # can be brittle for very complex / highly nested tool calls raw_tool_call = self.tool_call_regex.findall(tool_content)[0] function_call_arr = json.loads(raw_tool_call) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 7a8c6f8571deb..281563c3bfca2 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -302,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC): def max_num_tokens_per_rank(self) -> Optional[int]: """ Some PrepareFinalize All2All implementations are batched. Meaning, - they can processes only as set of tokens at a time. This + they can process only as set of tokens at a time. This function returns the batch size i.e the maximum number of tokens the implementation can process at a time. Return None if there are no such restrictions. diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 02057b476c6e2..317ad079b392d 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int, def marlin_make_workspace_new(device: torch.device, max_blocks_per_sm: int = 1) -> torch.Tensor: # In the new marlin kernel, we use the num of threadblocks as workspace - # size. The num of threadblocks is is sms_count * max_blocks_per_sm. + # size. The num of threadblocks is sms_count * max_blocks_per_sm. sms = torch.cuda.get_device_properties(device).multi_processor_count return torch.zeros(sms * max_blocks_per_sm, dtype=torch.int, diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index c5902595a496b..0e509b7453b9d 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -98,7 +98,7 @@ class BlockTable: # here because M (max_model_len) is not necessarily divisible by # block_size. if self.dcp_world_size > 1: - # Note(hc): The DCP implement store kvcache with a interleave + # Note(hc): The DCP implement store kvcache with an interleave # style, the kvcache for the token whose token_idx is i is # always stored on the GPU whose dcp_rank equals i % cp_world_size: From c2a8b08fcdc48579871cd568b0799037a1214ea1 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Mon, 8 Sep 2025 17:28:32 +0800 Subject: [PATCH 028/584] [Doc] Fix issues in integrations/llamastack.md (#24428) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/deployment/integrations/llamastack.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md index 28031f01f85e8..8eb7f8d81275d 100644 --- a/docs/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -1,6 +1,6 @@ # Llama Stack -vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . +vLLM is also available via [Llama Stack](https://github.com/llamastack/llama-stack). 
To install Llama Stack, run @@ -8,9 +8,9 @@ To install Llama Stack, run pip install llama-stack -q ``` -## Inference using OpenAI Compatible API +## Inference using OpenAI-Compatible API -Then start Llama Stack server pointing to your vLLM server with the following configuration: +Then start the Llama Stack server and configure it to point to your vLLM server with the following settings: ```yaml inference: @@ -20,15 +20,15 @@ inference: url: http://127.0.0.1:8000 ``` -Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. +Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/providers/inference/remote_vllm.html) for more details on this remote vLLM provider. -## Inference via Embedded vLLM +## Inference using Embedded vLLM -An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) +An [inline provider](https://github.com/llamastack/llama-stack/tree/main/llama_stack/providers/inline/inference) is also available. This is a sample of configuration using that method: ```yaml -inference +inference: - provider_type: vllm config: model: Llama3.1-8B-Instruct From 5e537f45b453096c489d06f0c47776f9436ef99b Mon Sep 17 00:00:00 2001 From: Li Wang <wangli858794774@gmail.com> Date: Mon, 8 Sep 2025 19:03:02 +0800 Subject: [PATCH 029/584] [Bugfix] Fix get_quant_config when using modelscope (#24421) Signed-off-by: wangli <wangli858794774@gmail.com> --- .../model_loader/default_loader.py | 38 ++------------- .../model_loader/weight_utils.py | 46 ++++++++++++++++++- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 1e5aa9e571edb..c8ad3a55d932a 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -7,20 +7,19 @@ import time from collections.abc import Generator, Iterable from typing import Optional, cast -import huggingface_hub import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm import envs from vllm.config import LoadConfig, ModelConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, - filter_files_not_needed_for_inference, get_lock, np_cache_weights_iterator, - pt_weights_iterator, safetensors_weights_iterator) + filter_files_not_needed_for_inference, maybe_download_from_modelscope, + np_cache_weights_iterator, pt_weights_iterator, + safetensors_weights_iterator) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -57,35 +56,6 @@ class DefaultModelLoader(BaseModelLoader): raise ValueError(f"Model loader extra config is not supported for " f"load format {load_config.load_format}") - def _maybe_download_from_modelscope( - self, model: str, revision: Optional[str]) -> Optional[str]: - """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True. - - Returns the path to the downloaded model, or None if the model is not - downloaded from ModelScope.""" - if envs.VLLM_USE_MODELSCOPE: - # download model from ModelScope hub, - # lazy import so that modelscope is not required for normal use. 
- # pylint: disable=C. - from modelscope.hub.snapshot_download import snapshot_download - - # Use file lock to prevent multiple processes from - # downloading the same model weights at the same time. - with get_lock(model, self.load_config.download_dir): - if not os.path.exists(model): - model_path = snapshot_download( - model_id=model, - cache_dir=self.load_config.download_dir, - local_files_only=huggingface_hub.constants. - HF_HUB_OFFLINE, - revision=revision, - ignore_file_pattern=self.load_config.ignore_patterns, - ) - else: - model_path = model - return model_path - return None - def _prepare_weights( self, model_name_or_path: str, @@ -96,7 +66,7 @@ class DefaultModelLoader(BaseModelLoader): """Prepare weights for the model. If the model is not local, it will be downloaded.""" - model_name_or_path = (self._maybe_download_from_modelscope( + model_name_or_path = (maybe_download_from_modelscope( model_name_or_path, revision) or model_name_or_path) is_local = os.path.isdir(model_name_or_path) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f87eeaa4563ff..50056038b6506 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -21,6 +21,7 @@ from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm +from vllm import envs from vllm.config import LoadConfig, ModelConfig from vllm.distributed import get_tensor_model_parallel_rank from vllm.logger import init_logger @@ -95,6 +96,41 @@ def get_lock(model_name_or_path: Union[str, Path], return lock +def maybe_download_from_modelscope( + model: str, + revision: Optional[str] = None, + download_dir: Optional[str] = None, + ignore_patterns: Optional[Union[str, list[str]]] = None, + allow_patterns: Optional[Union[list[str], + str]] = None) -> Optional[str]: + """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True. + + Returns the path to the downloaded model, or None if the model is not + downloaded from ModelScope.""" + if envs.VLLM_USE_MODELSCOPE: + # download model from ModelScope hub, + # lazy import so that modelscope is not required for normal use. + # pylint: disable=C. + from modelscope.hub.snapshot_download import snapshot_download + + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model, download_dir): + if not os.path.exists(model): + model_path = snapshot_download( + model_id=model, + cache_dir=download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + revision=revision, + ignore_file_pattern=ignore_patterns, + allow_patterns=allow_patterns, + ) + else: + model_path = model + return model_path + return None + + def _shared_pointers(tensors): ptrs = defaultdict(list) for k, v in tensors.items(): @@ -169,7 +205,13 @@ def get_quant_config(model_config: ModelConfig, # Inflight BNB quantization if model_config.quantization == "bitsandbytes": return quant_cls.from_config({}) - is_local = os.path.isdir(model_config.model) + model_name_or_path = maybe_download_from_modelscope( + model_config.model, + revision=model_config.revision, + download_dir=load_config.download_dir, + allow_patterns=["*.json"], + ) or model_config.model + is_local = os.path.isdir(model_name_or_path) if not is_local: # Download the config files. 
with get_lock(model_config.model, load_config.download_dir): @@ -182,7 +224,7 @@ def get_quant_config(model_config: ModelConfig, tqdm_class=DisabledTqdm, ) else: - hf_folder = model_config.model + hf_folder = model_name_or_path possible_config_filenames = quant_cls.get_config_filenames() From e041314184e3b1c4d797b2b025a5a9f54ade5a00 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:42:41 +0300 Subject: [PATCH 030/584] [Bugfix] Fix mamba2 prefill chunking (#23279) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/mamba/test_mamba_ssm_ssd.py | 233 +++++++++++++++++- .../layers/mamba/ops/ssd_chunk_scan.py | 3 + .../layers/mamba/ops/ssd_combined.py | 9 +- .../layers/mamba/ops/ssd_state_passing.py | 84 +++++-- vllm/v1/attention/backends/mamba2_attn.py | 55 ++++- 5 files changed, 349 insertions(+), 35 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 2c554baaff76c..1ce7f9d85e876 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -115,21 +115,27 @@ def generate_continuous_batched_examples(example_lens_by_batch, n_heads, d_head, itype, - device='cuda'): + device='cuda', + return_naive_ref=True): # this function generates a random examples of certain length # and then cut according to "example_lens_by_batch" and feed - # them in continuous batches to the kernels + # them in continuous batches to the kernels. + # If if return_naive_ref=True, the naive torch implementation + # ssd_minimal_discrete will be used to compute and return + # reference output. # generate the full-length example A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads, d_head, itype) - Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), - A * dt, - B, - C, - block_len=full_length // 4) + if return_naive_ref: + Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), + A * dt, + B, + C, + block_len=full_length // + 4) # internal function that outputs a cont batch of examples # given a tuple of lengths for each example in the batch @@ -179,7 +185,8 @@ def generate_continuous_batched_examples(example_lens_by_batch, IND_S = [x % full_length for x in IND_E] IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)] - yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)], + yield ([Y_min[s, IND_S[s]:IND_E[s]] + for s in range(num_examples)] if return_naive_ref else None, cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) @@ -324,3 +331,213 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, if clear: states[i].fill_(0.) exhausted[i] = False + + +@pytest.mark.parametrize("chunk_size", [8, 256]) +@pytest.mark.parametrize("seqlens", [ + (16, 2, 8, 13), + (270, 88, 212, 203), + (16, 20), +]) +def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens): + + # This test verifies the correctness of the chunked prefill implementation + # in the mamba2 ssd kernels, by comparing concatenation (in the sequence + # dimension) of chunked results with the full sequence result. + # It is different from test_mamba_chunk_scan_cont_batch by: + # 1. Not using the naive torch implementaion (ssd_minimal_discrete) to get + # reference outputs. 
Instead, it compares chunked kernel outputs to full + # sequence kernel outputs. This is the most straightforward way to + # assert chunked prefill correctness. + # 2. It focuses on cases where sequences change in the middle of mamba + # chunks, and not necessarily on chunk boundaries. + + max_seqlen = max(seqlens) + # This test can have larger error for longer sequences + if max_seqlen > 256: + atol, rtol = 1e-2, 5e-3 + else: + atol, rtol = 5e-3, 5e-3 + + num_sequences = len(seqlens) + n_heads = 16 + d_head = 64 + itype = torch.float32 + + # hold state during the cutting process so we know if an + # example has been exhausted and needs to cycle + last_taken: dict = {} # map: eg -> pointer to last taken sample + exhausted: dict = {} # map: eg -> boolean indicating example is exhausted + _, cu_seqlens, seq_idx, (A, dt, X, B, C) = next( + generate_continuous_batched_examples([seqlens], + num_sequences, + max_seqlen, + last_taken, + exhausted, + n_heads, + d_head, + itype, + return_naive_ref=False)) + seqlens = torch.tensor(seqlens, dtype=torch.int32, device=X.device) + device = X.device + + ## full seqlen computation + chunk_indices, chunk_offsets = \ + _query_start_loc_to_chunk_indices_offsets( + cu_seqlens, chunk_size, cu_seqlens[-1]) + Y_ref = torch.empty_like(X) + state_ref = mamba_chunk_scan_combined( + X, + dt, + A, + B, + C, + chunk_size, + D=None, + cu_seqlens=cu_seqlens, + seq_idx=seq_idx, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, + return_varlen_states=True, + initial_states=None, + out=Y_ref, + ) + + ## chunked seqlen computation + # first chunk + chunked_seqlens = seqlens // 2 + chunked_cu_seqlens = torch.cat([ + torch.tensor([0], device=device), + torch.cumsum(chunked_seqlens, dim=0) + ], + dim=0) + chunked_seq_idx = torch.repeat_interleave( + torch.arange(len(chunked_seqlens), device=device), + chunked_seqlens, + output_size=chunked_cu_seqlens[-1]).unsqueeze(0).to(torch.int32) + chunked_input_seq_len = chunked_cu_seqlens[-1] + X_chunked = torch.zeros_like(X)[:, :chunked_input_seq_len, ...] + dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...] + B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...] + C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...] + for i in range(num_sequences): + # fmt: off + chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...] # noqa: E501 + + X_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i) # noqa: E501 + dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i) # noqa: E501 + B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i) # noqa: E501 + C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] 
= chunk_f(C, i) # noqa: E501 + # fmt: on + + chunk_indices, chunk_offsets = \ + _query_start_loc_to_chunk_indices_offsets( + chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1]) + Y_partial = torch.empty_like(X_chunked) + partial_state = mamba_chunk_scan_combined( + X_chunked, + dt_chunked, + A, + B_chunked, + C_chunked, + chunk_size, + D=None, + cu_seqlens=chunked_cu_seqlens, + seq_idx=chunked_seq_idx, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, + return_varlen_states=True, + initial_states=None, + out=Y_partial, + ) + + # remaining chunk + remaining_chunked_seqlens = seqlens - chunked_seqlens + remaining_chunked_cu_seqlens = torch.cat([ + torch.tensor([0], device=device), + torch.cumsum(remaining_chunked_seqlens, dim=0) + ], + dim=0) + remaining_chunked_seq_idx = torch.repeat_interleave( + torch.arange(len(remaining_chunked_seqlens), device=device), + remaining_chunked_seqlens, + output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0).to( + torch.int32) + remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1] + # fmt: off + remaining_X_chunked = torch.zeros_like(X)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501 + remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501 + remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501 + remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501 + for i in range(num_sequences): + remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...] # noqa: E501 + + remaining_X_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i) # noqa: E501 + remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i) # noqa: E501 + remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i) # noqa: E501 + remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] 
= remaining_chunk_f(C, i) # noqa: E501 + + # assert input chunking is correct + concat_chunk_f = lambda pt1, pt2, i: torch.cat([ + pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...], + pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...], + ], + dim=1) + concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1) # noqa: E501 + # fmt: on + + assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X) + assert concat_batch_f(dt_chunked, remaining_dt_chunked).equal(dt) + assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B) + assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C) + + chunk_indices, chunk_offsets = \ + _query_start_loc_to_chunk_indices_offsets( + remaining_chunked_cu_seqlens, + chunk_size, + remaining_chunked_cu_seqlens[-1]) + + Y_chunked = torch.empty_like(remaining_X_chunked) + state_chunked = mamba_chunk_scan_combined( + remaining_X_chunked, + remaining_dt_chunked, + A, + remaining_B_chunked, + remaining_C_chunked, + chunk_size, + D=None, + cu_seqlens=remaining_chunked_cu_seqlens, + seq_idx=remaining_chunked_seq_idx, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, + return_varlen_states=True, + initial_states=partial_state, + out=Y_chunked, + ) + Y = concat_batch_f(Y_partial, Y_chunked) + + # kernel chunked is same as kernel overall + for i in range(num_sequences): + Y_seq = Y[:, cu_seqlens[i]:cu_seqlens[i + 1], ...] + Y_ref_seq = Y_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...] + torch.testing.assert_close( + Y_seq[:, :chunked_seqlens[i], ...], + Y_ref_seq[:, :chunked_seqlens[i], ...], + atol=atol, + rtol=rtol, + msg=lambda x: f"seq{i} output part1 " + x) # noqa: B023 + torch.testing.assert_close( + Y_seq[:, chunked_seqlens[i]:, ...], + Y_ref_seq[:, chunked_seqlens[i]:, ...], + atol=atol, + rtol=rtol, + msg=lambda x: f"seq{i} output part2 " + x) # noqa: B023 + + state_seq = state_chunked[i] + state_seq_ref = state_ref[i] + torch.testing.assert_close( + state_seq, + state_seq_ref, + atol=atol, + rtol=rtol, + msg=lambda x: f"seq{i} state " + x) # noqa: B023 diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index 365139e237c66..fb8350e191c94 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -289,6 +289,9 @@ def _chunk_scan_fwd_kernel( # get the cs at the offset boundary # - c_off == 0 is a passthrough + # - We need dA_cs at the boundary, defined by c_off - no need + # to increase pointer by pid_m (it is a constant offset, + # i.e. the same for all blocks) dA_cs_m_boundary = tl.load( dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize, mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)), diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index d0b3e9e5235bf..fcc5c905bf77f 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -106,21 +106,24 @@ def _mamba_chunk_scan_combined_fwd(x, # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries # (middle term of factorization of off-diag blocks; A terms) # - for handling chunked prefill, this requires i) initial_states - # ii) seq_idx and iii) is_cont_batched to be all specified. + # ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified. 
# - When a new seq_idx is detected, we will stop passing the prev_state # and switch accordingly to the init_state corresponding to the new seq_idx. + # - We will also make sure that the dA_cumsum is taken only from the start of the + # sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries) # - this will ensure that states will be updated with the rightmost flushed seq_idx # of the previous chunk. This implies that the first chunk of states is either 0 # or equal to init_states of the first example. states, final_states = _state_passing_fwd( rearrange(states, "... p n -> ... (p n)"), - dA_cumsum[:, :, :, -1], + dA_cumsum, initial_states=rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None, seq_idx=seq_idx, chunk_size=chunk_size, out_dtype=state_dtype if state_dtype is not None else C.dtype, - is_cont_batched=cu_seqlens is not None) + is_cont_batched=cu_seqlens is not None, + chunk_offsets=chunk_offsets) states, final_states = (rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py index a28fc9ffad71b..d61c3a8cdbe9c 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py @@ -31,6 +31,8 @@ def _state_passing_fwd_kernel( dA_cs_ptr, initstates_ptr, seq_idx_ptr, + chunk_offsets_ptr, + chunk_meta_num, # Matrix dimensions dim, nchunks, @@ -51,6 +53,7 @@ def _state_passing_fwd_kernel( stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, + stride_dA_cs_csize, stride_initstates_batch, stride_initstates_head, stride_initstates_dim, @@ -66,7 +69,8 @@ def _state_passing_fwd_kernel( pid_h = tl.program_id(axis=2) pid_m = tl.program_id(axis=0) states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head - dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + ( + chunk_size - 1) * stride_dA_cs_csize out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head final_states_ptr += pid_b * stride_final_states_batch + pid_h * stride_final_states_head if HAS_INITSTATES: @@ -95,35 +99,62 @@ def _state_passing_fwd_kernel( tl.store(out_ptrs, states, mask=offs_m < dim) out_ptrs += stride_out_chunk - seq_idx = 0 + prev_seq_idx_chunk_end = 0 + logical_chunk_idx = 0 for c in range(nchunks): new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) dA_cs = tl.load(dA_cs_ptr).to(tl.float32) - scale = tl.exp(dA_cs) + scale_mask = True if HAS_SEQ_IDX: # - the seq to pass forward is the one that is flushed to the right # boundary. - # - that is given by seq_idx_new below. - seq_idx_new = tl.load(seq_idx_ptr + - (min((c + 1) * chunk_size, seqlen) - 1) * - stride_seq_idx_seqlen) + # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk. + seq_idx_chunk_end = tl.load(seq_idx_ptr + (min( + (c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen) if HAS_INITSTATES: - if IS_CONT_BATCHED and seq_idx != seq_idx_new: + if IS_CONT_BATCHED and prev_seq_idx_chunk_end != seq_idx_chunk_end: # this means in the current chunk the rightmost flushed seq # has changed. 
# - so we do not propagate the state from previous chunk # - but rather we load that sequence's init state - initstates_ptrs = initstates_ptr + seq_idx_new * stride_initstates_batch + initstates_ptrs = initstates_ptr + seq_idx_chunk_end * stride_initstates_batch # - update state with seq_idx_new's init state states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - else: - scale = tl.where(seq_idx_new == seq_idx, scale, 0.0) - seq_idx = seq_idx_new + # - we need to consider the cumsum only of the last sequence in the chunk + # - find its starting position (given by c_off of the logical chunk index) + # - and subtract the cumsum just before that position from the total cumsum + # - first, update the logical chunk index (add the number of sequences in the current physical chunk): + # sequence index at the start of the current chunk + seq_idx_chunk_start = tl.load(seq_idx_ptr + + min(c * chunk_size, seqlen) * + stride_seq_idx_seqlen) + logical_chunk_idx += seq_idx_chunk_end - seq_idx_chunk_start + # - load the chunk offset: + c_off = tl.load(chunk_offsets_ptr + logical_chunk_idx, + mask=logical_chunk_idx < chunk_meta_num, + other=0) + # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything + if c_off > 0: + # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset + dA_cs_boundary = tl.load( + dA_cs_ptr - (chunk_size - 1) * stride_dA_cs_csize + + (c_off - 1) * stride_dA_cs_csize, + mask=(c_off - 1) > -1 and c_off < chunk_size, + other=0.0) + dA_cs -= dA_cs_boundary + + # - increment logical chunk index for every physical chunk + logical_chunk_idx += 1 + else: + scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end + prev_seq_idx_chunk_end = seq_idx_chunk_end + + scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0) states = scale * states + new_states if c < nchunks - 1: tl.store(out_ptrs, states, mask=offs_m < dim) @@ -136,28 +167,36 @@ def _state_passing_fwd_kernel( def _state_passing_fwd( states, - dA_chunk_cumsum, + dA_cumsum, initial_states=None, seq_idx=None, chunk_size=None, out_dtype=None, is_cont_batched=False, + chunk_offsets=None, ): batch, nchunks, nheads, dim = states.shape - assert dA_chunk_cumsum.shape == (batch, nheads, nchunks) + if chunk_size is None: + chunk_size = dA_cumsum.shape[-1] + else: + assert chunk_size == dA_cumsum.shape[-1] + assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size) if initial_states is not None: if is_cont_batched: # - if cu_seqlens is provided, then the initial states # are used for continuous batching. In which case we # require seq_idx to be provided - assert seq_idx is not None, "" + assert seq_idx is not None, "seq_idx must be provided for continuous batching" + # - we also need chunk_offsets to be provided, to account + # for computation of dA_cumsum from the start of the + # sequence + assert chunk_offsets is not None, "chunk_offsets must be provided for continuous batching" else: # - this is the regular batching case, where initial # states are used are for each example of the batch. 
assert initial_states.shape == (batch, nheads, dim) if seq_idx is not None: - assert chunk_size is not None seqlen = seq_idx.shape[-1] assert seq_idx.shape == (batch, seqlen) out_dtype = states.dtype if out_dtype is None else out_dtype @@ -173,13 +212,15 @@ def _state_passing_fwd( states, out, final_states, - dA_chunk_cumsum, + dA_cumsum, initial_states, seq_idx, + chunk_offsets, + len(chunk_offsets) if chunk_offsets is not None else 0, dim, nchunks, seqlen if seq_idx is not None else 0, - chunk_size if seq_idx is not None else 0, + chunk_size, states.stride(0), states.stride(1), states.stride(2), @@ -191,9 +232,10 @@ def _state_passing_fwd( final_states.stride(0), final_states.stride(1), final_states.stride(2), - dA_chunk_cumsum.stride(0), - dA_chunk_cumsum.stride(2), - dA_chunk_cumsum.stride(1), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(3), *((initial_states.stride(0), initial_states.stride(1), initial_states.stride(2)) if initial_states is not None else (0, 0, 0)), diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index f3e6cd7430e0b..359bad1ea9dee 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -16,9 +16,58 @@ from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, from vllm.v1.kv_cache_interface import AttentionSpec -def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, - chunk_size: int, - total_seqlens: int): +def _query_start_loc_to_chunk_indices_offsets( + query_start_loc: torch.Tensor, chunk_size: int, + total_seqlens: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Args: + query_start_loc (torch.Tensor): 1D tensor of cumulative sequence + lengths, shape (num_seqs + 1,). + The first element should be 0. Each entry represents the starting + index of a sequence in the flattened token array. + chunk_size (int): The size of each physical mamba chunk + (number of tokens per chunk). + total_seqlens (int): The total number of tokens in the batch. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing: + - chunk_indices (torch.Tensor): 1D tensor of indices + indicating the physical chunk for each logical chunk. + - chunk_offsets (torch.Tensor): 1D tensor of offsets + indicating the starting index of each logical chunk within + its physical chunk. + + This function computes the chunk indices and offsets for the given + query_start_loc and chunk_size. Both are tensors of integers with length N, + where N is the number of logical (pseudo) chunks. + A logical chunk is a sequence of tokens that are all part of the same + sequence and are all in the same physical mamba chunk. + In other words, a logical chunk changes every time we cross a sequence + boundary or a physical mamba chunk boundary. + Logical chunks are needed to handle batched requests with initial states + (see _state_passing_fwd and _chunk_scan_fwd). + The chunk_indices tensor contains the index of the physical chunk for each + logical chunk. + The chunk_offsets tensor contains the offset (AKA starting index) of the + logical chunk in the physical chunk. + + Example: + query_start_loc = [0, 5, 10] + chunk_size = 8 + total_seqlens = 10 + -> chunk_indices = [0, 0, 1] + -> chunk_offsets = [0, 5, 0] + + In this example, we have 2 sequences, each with 5 tokens. The physical + chunk size is 8 tokens. 
+ We have three logical chunks: + - the first logical chunk starts at token 0 in the first physical chunk + and contains all 5 tokens from the first sequence + - the second logical chunk starts at token 5 in the first physical chunk + and contains first 3 tokens from the second sequence + - the third logical chunk starts at token 0 in the second physical chunk + and contains the remaining 2 tokens from the second sequence + """ cu_seqlens = query_start_loc[1:] # remove prepended 0 From 9cd76b71abf15b31878f8d9675546f809a6ba150 Mon Sep 17 00:00:00 2001 From: Christian Pinto <christian.pinto@ibm.com> Date: Mon, 8 Sep 2025 15:40:26 +0200 Subject: [PATCH 031/584] [Misc] Terratorch related fixes (#24337) Signed-off-by: Christian Pinto <christian.pinto@ibm.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- .../prithvi_geospatial_mae_io_processor.py | 4 ++-- .../online_serving/prithvi_geospatial_mae.py | 4 ++-- requirements/test.in | 2 +- requirements/test.txt | 2 +- .../entrypoints/openai/test_skip_tokenizer.py | 2 +- tests/models/registry.py | 4 ++-- tests/models/test_terratorch.py | 2 +- .../prithvi_io_processor/__init__.py | 6 ++---- .../prithvi_io_processor/prithvi_processor.py | 20 ++----------------- .../prithvi_io_processor_plugin/setup.py | 3 +-- .../test_io_processor_plugins.py | 6 +++--- 11 files changed, 18 insertions(+), 37 deletions(-) diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index 5d629fabf0a27..418c40645f9f2 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -18,7 +18,7 @@ from vllm.pooling_params import PoolingParams def main(): torch.set_default_dtype(torch.float16) - image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501 + image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501 img_prompt = dict( data=image_url, @@ -36,7 +36,7 @@ def main(): # to avoid the model going OOM. 
# The maximum number depends on the available GPU memory max_num_seqs=32, - io_processor_plugin="prithvi_to_tiff_india", + io_processor_plugin="prithvi_to_tiff", model_impl="terratorch", ) diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py index c6eed64838ea4..611a7cbc89fa2 100644 --- a/examples/online_serving/prithvi_geospatial_mae.py +++ b/examples/online_serving/prithvi_geospatial_mae.py @@ -18,11 +18,11 @@ import requests # --model-impl terratorch # --task embed --trust-remote-code # --skip-tokenizer-init --enforce-eager -# --io-processor-plugin prithvi_to_tiff_india +# --io-processor-plugin prithvi_to_tiff def main(): - image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501 + image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501 server_endpoint = "http://localhost:8000/pooling" request_payload_url = { diff --git a/requirements/test.in b/requirements/test.in index 5db9cd797904f..1bbf0074a8881 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -54,4 +54,4 @@ runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 decord==0.6.0 -terratorch==1.1rc3 # required for PrithviMAE test +terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test diff --git a/requirements/test.txt b/requirements/test.txt index 332a9b9cfbf59..65ef7c3c64ba6 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1042,7 +1042,7 @@ tensorboardx==2.6.4 # via lightning tensorizer==2.10.1 # via -r requirements/test.in -terratorch==1.1rc3 +terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e # via -r requirements/test.in threadpoolctl==3.5.0 # via scikit-learn diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index af520ac61d8df..840e0dac81c97 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -11,7 +11,7 @@ import torch from ...utils import RemoteOpenAIServer -MODEL_NAME = "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11" +MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" DTYPE = "float16" diff --git a/tests/models/registry.py b/tests/models/registry.py index c6ff50b5426e1..e4c215b108446 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -383,7 +383,7 @@ _EMBEDDING_EXAMPLE_MODELS = { "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 - "PrithviGeoSpatialMAE": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 + "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 dtype=torch.float16, enforce_eager=True, skip_tokenizer_init=True, @@ -391,7 +391,7 @@ _EMBEDDING_EXAMPLE_MODELS = { # going OOM in CI max_num_seqs=32, ), - "Terratorch": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + "Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 dtype=torch.float16, enforce_eager=True, skip_tokenizer_init=True, diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index bfa54280dc02d..d6d43ca2f7e15 100644 
--- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -11,7 +11,7 @@ from vllm.utils import set_default_torch_num_threads @pytest.mark.parametrize( "model", [ - "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", "mgazz/Prithvi_v2_eo_300_tl_unet_agb" ], ) diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py index a750c756c11a2..4bbb79c98a82a 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def register_prithvi_india(): - return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia" # noqa: E501 -def register_prithvi_valencia(): - return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia" # noqa: E501 +def register_prithvi(): + return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor" # noqa: E501 diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py index 0ebaafda94dc5..42874f0398f0a 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py @@ -234,6 +234,8 @@ def load_image( class PrithviMultimodalDataProcessor(IOProcessor): + indices = [0, 1, 2, 3, 4, 5] + def __init__(self, vllm_config: VllmConfig): super().__init__(vllm_config) @@ -412,21 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor): format="tiff", data=out_data, request_id=request_id) - - -class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor): - - def __init__(self, vllm_config: VllmConfig): - - super().__init__(vllm_config) - - self.indices = [1, 2, 3, 8, 11, 12] - - -class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor): - - def __init__(self, vllm_config: VllmConfig): - - super().__init__(vllm_config) - - self.indices = [0, 1, 2, 3, 4, 5] diff --git a/tests/plugins/prithvi_io_processor_plugin/setup.py b/tests/plugins/prithvi_io_processor_plugin/setup.py index a03b1fbbd4a80..3ddda1a47bbe4 100644 --- a/tests/plugins/prithvi_io_processor_plugin/setup.py +++ b/tests/plugins/prithvi_io_processor_plugin/setup.py @@ -9,8 +9,7 @@ setup( packages=["prithvi_io_processor"], entry_points={ "vllm.io_processor_plugins": [ - "prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india", # noqa: E501 - "prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia", # noqa: E501 + "prithvi_to_tiff = prithvi_io_processor:register_prithvi", # noqa: E501 ] }, ) diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 825165e89b33c..3567a701a3afa 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import IOProcessorResponse from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams -MODEL_NAME = "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11" +MODEL_NAME = 
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501 @@ -35,7 +35,7 @@ def server(): "--max-num-seqs", "32", "--io-processor-plugin", - "prithvi_to_tiff_valencia", + "prithvi_to_tiff", "--model-impl", "terratorch", ] @@ -107,7 +107,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): # to avoid the model going OOM in CI. max_num_seqs=1, model_impl="terratorch", - io_processor_plugin="prithvi_to_tiff_valencia", + io_processor_plugin="prithvi_to_tiff", ) as llm_runner: pooler_output = llm_runner.get_llm().encode( img_prompt, From 03dd652c16f8aa53190277cb1e48c5938caf6d76 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:41:27 +0100 Subject: [PATCH 032/584] Move `KVEventsConfig` from `config/__init__.py` to `config/kv_events.py` (#24433) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/distributed/conftest.py | 2 +- vllm/config/__init__.py | 43 +----------------------------- vllm/config/kv_events.py | 50 +++++++++++++++++++++++++++++++++++ vllm/distributed/kv_events.py | 2 +- 4 files changed, 53 insertions(+), 44 deletions(-) create mode 100644 vllm/config/kv_events.py diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 666a715cc0da1..7dc4a0cc3d582 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -8,7 +8,7 @@ import msgspec.msgpack import pytest import zmq -from vllm.config import KVEventsConfig +from vllm.config.kv_events import KVEventsConfig from vllm.distributed.kv_events import EventPublisherFactory from .test_events import SampleBatch diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index f6f1838aedfc2..e3ce1987fe9ee 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -33,6 +33,7 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) +from vllm.config.kv_events import KVEventsConfig from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -3310,48 +3311,6 @@ class KVTransferConfig: return self.kv_connector_extra_config.get(key, default) -@config -@dataclass -class KVEventsConfig: - """Configuration for KV event publishing.""" - - enable_kv_cache_events: bool = False - """If True, enable KV cache events for tracking block storage and removal. - Events can be published externally by zmq using the event publisher config. - """ - - publisher: str = "null" - """The publisher to use for publishing kv events. Can be "null", "zmq". - """ - - endpoint: str = "tcp://*:5557" - """The zmq endpoint to use for publishing kv events. - """ - - replay_endpoint: Optional[str] = None - """The zmq endpoint to use for replaying kv events. - """ - - buffer_steps: int = 10_000 - """The number of steps to cache for replay endpoint. Will only save - events from the last N steps for the replay endpoint. - """ - - hwm: int = 100_000 - """The zmq high water mark for the event publisher. After queueing N events, - events will start dropping if the consumer is not keeping up. - """ - - max_queue_size: int = 100_000 - """The maximum number of events to queue while waiting for publishing. 
- """ - - topic: str = "" - """The topic to use for the event publisher. Consumers can subscribe to - this topic to receive events. - """ - - @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py new file mode 100644 index 0000000000000..1c6bdffa1281d --- /dev/null +++ b/vllm/config/kv_events.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + + +@config +@dataclass +class KVEventsConfig: + """Configuration for KV event publishing.""" + + enable_kv_cache_events: bool = False + """If True, enable KV cache events for tracking block storage and removal. + Events can be published externally by zmq using the event publisher config. + """ + + publisher: str = "null" + """The publisher to use for publishing kv events. Can be "null", "zmq". + """ + + endpoint: str = "tcp://*:5557" + """The zmq endpoint to use for publishing kv events. + """ + + replay_endpoint: Optional[str] = None + """The zmq endpoint to use for replaying kv events. + """ + + buffer_steps: int = 10_000 + """The number of steps to cache for replay endpoint. Will only save + events from the last N steps for the replay endpoint. + """ + + hwm: int = 100_000 + """The zmq high water mark for the event publisher. After queueing N events, + events will start dropping if the consumer is not keeping up. + """ + + max_queue_size: int = 100_000 + """The maximum number of events to queue while waiting for publishing. + """ + + topic: str = "" + """The topic to use for the event publisher. Consumers can subscribe to + this topic to receive events. + """ diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 37f8f72fa9056..09f42b550fe2b 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -14,7 +14,7 @@ from typing import Any, Callable, Optional, Union import msgspec import zmq -from vllm.config import KVEventsConfig +from vllm.config.kv_events import KVEventsConfig from vllm.logger import init_logger logger = init_logger(__name__) From 01dfb5e982f45fff33c715e8c248555b0361276e Mon Sep 17 00:00:00 2001 From: Chenheli Hua <huachenheli@outlook.com> Date: Mon, 8 Sep 2025 06:42:20 -0700 Subject: [PATCH 033/584] [Frontend] User-provided uuids for medias in chat. 
(RFC #22044) (#23449) Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- docs/features/multimodal_inputs.md | 42 +- tests/entrypoints/openai/test_vision.py | 129 ++++ tests/entrypoints/test_chat_utils.py | 752 +++++++++++++++++++++- vllm/entrypoints/chat_utils.py | 181 ++++-- vllm/entrypoints/llm.py | 5 +- vllm/entrypoints/openai/serving_engine.py | 6 +- vllm/inputs/preprocess.py | 36 +- vllm/model_executor/models/terratorch.py | 7 +- 8 files changed, 1079 insertions(+), 79 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 206ab7a468755..77baa27c7a958 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f ```python from vllm import LLM - + # Default white background (no configuration needed) llm = LLM(model="llava-hf/llava-1.5-7b-hf") - + # Custom black background for dark theme llm = LLM( model="llava-hf/llava-1.5-7b-hf", media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} ) - + # Custom brand color background (e.g., blue) llm = LLM( - model="llava-hf/llava-1.5-7b-hf", + model="llava-hf/llava-1.5-7b-hf", media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} ) ``` @@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd ## Online Serving -Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). +Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests. !!! important A chat template is **required** to use Chat Completions API. @@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows: # NOTE: The prompt formatting with the image token `<image>` is not needed # since the prompt will be processed automatically by the API server. 
{"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "image_url", + "image_url": { + url": image_url + }, + "uuid": image_url # Optional + }, ], }], ) @@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows: "role": "user", "content": [ {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, + { + "type": "image_url", + "image_url": { + "url": image_url_duck + }, + "uuid": image_url_duck # Optional + }, + { + "type": "image_url", + "image_url": { + "url": image_url_lion + }, + "uuid": image_url_lion # Optional + }, ], }], ) @@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows: "video_url": { "url": video_url }, + "uuid": video_url # Optional }, ], }], @@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows: "data": audio_base64, "format": "wav" }, + "uuid": audio_url # Optional }, ], }], @@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag "audio_url": { "url": audio_url }, + "uuid": audio_url # Optional }, ], }], @@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se model = "llava-hf/llava-1.5-7b-hf" embeds = { "type": "image_embeds", - "image_embeds": f"{base64_image_embedding}" + "image_embeds": f"{base64_image_embedding}", + "uuid": image_url # Optional } # Pass additional parameters (available to Qwen2-VL and MiniCPM-V) @@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se "image_embeds": f"{base64_image_embedding}" , # Required "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct }, + "uuid": image_url # Optional } model = "openbmb/MiniCPM-V-2_6" embeds = { @@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se "image_embeds": f"{base64_image_embedding}" , # Required "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 }, + "uuid": image_url # Optional } chat_completion = client.chat.completions.create( messages=[ diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 9d61754059e2f..29a3b40d2d865 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + } + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_uuid( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_incorrect_uuid_format( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + "incorrect_uuid_key": image_url, + }, + "also_incorrect_uuid_key": image_url, + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 18db1027c004d..5149ca346050e 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, resolve_chat_template_content_format, resolve_hf_chat_template) from vllm.entrypoints.llm import apply_hf_chat_template -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input( assert isinstance(image_data, list) and len(image_data) == image_count +def _assert_mm_uuids( + mm_uuids: Optional[MultiModalUUIDDict], + media_count: int, + expected_uuids: list[Optional[str]], + modality: str = "image", +) -> None: + if len(expected_uuids) > 0: + assert mm_uuids is not None + assert modality in mm_uuids + + image_uuids = mm_uuids.get(modality) + assert image_uuids is not None + + assert isinstance(image_uuids, + list) and len(image_uuids) == media_count + + assert image_uuids == expected_uuids + else: + assert mm_uuids is None + + ModalityType = Literal["image", "video", "audio"] MultiModalDataCounts = Mapping[ModalityType, int] @@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_single_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +def test_parse_chat_messages_single_image_with_bad_uuid_format( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + "uuid": image_uuid, + }, + "bad_uuid_key": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" 
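                # (Of the two images above, only the PIL one carries a UUID;
                # the URL image is expected to map to None in mm_uuids.)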
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2]) def test_parse_chat_messages_empty_system( @@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system( mistral_tokenizer, ): # Test string format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system( ] # Test openai format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_images( @@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_already_in_prompt( @@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( "What's in <|image_1|> and how does it compare to <|image_2|>?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_one_already_in_prompt( @@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( "other one?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_across_messages( @@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + 
conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -569,13 +849,84 @@ def test_parse_chat_messages_multiple_images_across_messages( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +def test_parse_chat_messages_multiple_images_with_uuids_across_messages( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What about this one?" + }, + ], + }, + ], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\nWhat's in this image?" + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "<|image_2|>\nWhat about this one?" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_context_text_format( phi3v_model_config, phi3v_tokenizer, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", @@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format( }], }, ] + assert mm_data is None + assert mm_uuids is None def test_parse_chat_messages_rejects_too_many_images_in_one_message( @@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_interleave( @@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave( "Do they have differences?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages_futures( + conversation, mm_data, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async( "Do they have differences?", }] _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, 
mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "I need you to compare this image", + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "and this one" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Do they have differences?" + }, + ], + }], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + }] + _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_images_multiple_messages_interleave( @@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501 + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Be accurate." + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + ], + }, + ], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "What's on this image?\n<|image_1|>\nBe accurate.", + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": "user", + "content": "What's on this image?\n<|image_2|>" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( @@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( video_url, audio_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=[None, None]) + _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + +def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" 
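                # (In this partial-UUID case, the image part below supplies
                # "uuid": "image_123" while the audio part deliberately omits it.)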
+ }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + } + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", None]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_images_interleave_with_placeholders( @@ -1081,7 +1791,7 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1100,6 +1810,7 @@ def test_mllama_single_image( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) assert conversation == [{ "role": "user", @@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) assert conversation == [{ "role": "user", @@ -1227,7 +1939,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): # Now parse with vLLMs chat utils & apply the template vllm_conversation = get_conversation(is_hf=False) - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( vllm_conversation, model_config, tokenizer_group, @@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config, }], }] - conversation_with_thinking, _ = parse_chat_messages( + conversation_with_thinking, _, _ = parse_chat_messages( messages, mistral_model_config, mistral_tokenizer, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 80e2c44a02513..b53dbfb3a26a2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -41,7 +41,8 @@ from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig from vllm.logger import init_logger from 
vllm.model_executor.models import SupportsMultiModal -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalUUIDDict) from vllm.multimodal.utils import MediaConnector # yapf: disable from vllm.transformers_utils.chat_templates import ( @@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): @@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): """ type: Required[Literal["image_embeds"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class VideoURL(TypedDict, total=False): @@ -97,6 +108,11 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class PILImage(BaseModel): @@ -118,6 +134,11 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): """ image_pil: Required[PILImage] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): @@ -131,6 +152,11 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """ image_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): @@ -155,6 +181,11 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): """ video_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomThinkCompletionContentParam(TypedDict, total=False): @@ -567,6 +598,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T]](list) + self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property def model_config(self) -> ModelConfig: @@ -591,10 +623,15 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): def mm_processor(self): return self.mm_registry.create_processor(self.model_config) - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + def add( + self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. + + An optional uuid can be added which serves as a unique identifier of the + media. 
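        A minimal illustrative call (the uuid value is an arbitrary example,
        not a required format):

            placeholder = tracker.add("image", pil_image, uuid="sku-12345")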
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -602,9 +639,35 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) + self._uuids_by_modality[modality].append(uuid) return self.model_cls.get_placeholder_str(modality, num_items) + def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]: + if not self._items_by_modality: + return None + mm_uuids = {} + uuids_by_modality = dict(self._uuids_by_modality) + if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: + raise ValueError( + "Mixing raw image and embedding inputs is not allowed" + ) + + if "image_embeds" in uuids_by_modality: + image_embeds_uuids = uuids_by_modality["image_embeds"] + if len(image_embeds_uuids) > 1: + raise ValueError( + "Only one message can have {'type': 'image_embeds'}" + ) + mm_uuids["image"] = uuids_by_modality["image_embeds"] + if "image" in uuids_by_modality: + mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images + if "audio" in uuids_by_modality: + mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios + if "video" in uuids_by_modality: + mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos + return mm_uuids + @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError @@ -697,29 +760,35 @@ class BaseMultiModalContentParser(ABC): return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @@ -734,49 +803,55 @@ class MultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image = self._connector.fetch_image(image_url) - placeholder = self._tracker.add("image", image) + placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) for k, v in image_embeds.items() } - placeholder = self._tracker.add("image_embeds", embeds) + placeholder = self._tracker.add("image_embeds", embeds, uuid) if isinstance(image_embeds, 
str): embedding = self._connector.fetch_image_embedding(image_embeds) - placeholder = self._tracker.add("image_embeds", embedding) + placeholder = self._tracker.add("image_embeds", embedding, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: - placeholder = self._tracker.add("image", image_pil) + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: + placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio = self._connector.fetch_audio(audio_url) - placeholder = self._tracker.add("audio", audio) + placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -790,14 +865,16 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image_coro = self._connector.fetch_image_async(image_url) - placeholder = self._tracker.add("image", image_coro) + placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() @@ -812,33 +889,37 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) - placeholder = self._tracker.add("image_embeds", future) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) - placeholder = self._tracker.add("image", future) + placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) - placeholder = self._tracker.add("audio", audio_coro) + placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = 
None + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video_async(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1177,30 +1258,36 @@ def _parse_chat_message_content_part( else: return str_content + # For media items, if a user has provided one, use it. Otherwise, insert + # a placeholder empty uuid. + uuid = part.get("uuid", None) + if uuid is not None: + uuid = str(uuid) + modality = None if part_type == "image_pil": image_content = cast(Image.Image, content) - mm_parser.parse_image_pil(image_content) + mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): str_content = cast(str, content) - mm_parser.parse_image(str_content) + mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) - mm_parser.parse_image_embeds(content) + mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": str_content = cast(str, content) - mm_parser.parse_audio(str_content) + mm_parser.parse_audio(str_content, uuid) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content) + mm_parser.parse_input_audio(dict_content, uuid) modality = "audio" elif part_type == "video_url": str_content = cast(str, content) - mm_parser.parse_video(str_content) + mm_parser.parse_video(str_content, uuid) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -1288,7 +1375,11 @@ def parse_chat_messages( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: +) -> tuple[ + list[ConversationMessage], + Optional[MultiModalDataDict], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -1308,7 +1399,7 @@ def parse_chat_messages( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def parse_chat_messages_futures( @@ -1316,7 +1407,11 @@ def parse_chat_messages_futures( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: +) -> tuple[ + list[ConversationMessage], + Awaitable[Optional[MultiModalDataDict]], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -1336,7 +1431,7 @@ def parse_chat_messages_futures( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def apply_hf_chat_template( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9b2ad808eb03e..d33fd0ec0b494 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -796,7 
+796,7 @@ class LLM: # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( msgs, model_config, tokenizer, @@ -826,6 +826,9 @@ class LLM: if mm_data is not None: prompt["multi_modal_data"] = mm_data + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = mm_processor_kwargs diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1a2236de4fa42..d6e8d93a57e14 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -929,7 +929,7 @@ class OpenAIServing: tokenizer, model_config=model_config, ) - conversation, mm_data_future = parse_chat_messages_futures( + conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, model_config, tokenizer, @@ -1006,6 +1006,10 @@ class OpenAIServing: prompt_token_ids=prompt_inputs["prompt_token_ids"]) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data + + if mm_uuids is not None: + engine_prompt["multi_modal_uuids"] = mm_uuids + if request.mm_processor_kwargs is not None: engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 094fcf021b619..ec82be831e0de 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -276,13 +276,23 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input async def _process_multimodal_async( self, @@ -310,13 +320,23 @@ class InputPreprocessor: if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input def _process_embeds( self, @@ -953,3 +973,15 @@ class InputPreprocessor: def clear_cache(self) -> None: if self.mm_processor_cache is not None: self.mm_processor_cache.clear_cache() + + +# Helper function to validate that a nested dictionary contains +# only strings or list of strings as the leaf values. 
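# For instance (illustrative): {"image": ["h1", "h2"]} passes the check below,
# while {"image": [b"h1"]} or {"image": 123} would fail it.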
+def contains_only_strings(obj: object): + if isinstance(obj, str): + return True + if isinstance(obj, list): + return all(isinstance(x, str) for x in obj) + if isinstance(obj, dict): + return all(contains_only_strings(v) for v in obj.values()) + return False diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 739396a4932cb..453da1a51d987 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -174,9 +174,10 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): mm_items = self._to_mm_items(mm_data) tokenization_kwargs = tokenization_kwargs or {} - mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else - self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs)) + mm_hashes = self._hash_mm_items(mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data) From 717fc00e98c18d183f9141393b4185ddfb6606b3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:45:14 +0100 Subject: [PATCH 034/584] [Docs] Move feature compatibility tables to README (#24431) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 5 +---- docs/features/{compatibility_matrix.md => README.md} | 8 +++++--- 2 files changed, 6 insertions(+), 7 deletions(-) rename docs/features/{compatibility_matrix.md => README.md} (98%) diff --git a/docs/.nav.yml b/docs/.nav.yml index dbac0e12f1bf2..8a21dc9f1d703 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -32,10 +32,7 @@ nav: - models/pooling_models.md - models/extensions - Hardware Supported Models: models/hardware_supported_models - - Features: - - features/compatibility_matrix.md - - features/* - - features/quantization + - Features: features - Developer Guide: - contributing/README.md - General: diff --git a/docs/features/compatibility_matrix.md b/docs/features/README.md similarity index 98% rename from docs/features/compatibility_matrix.md rename to docs/features/README.md index 5b08b3810776c..de23cd0a90eb5 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/README.md @@ -1,4 +1,6 @@ -# Compatibility Matrix +# Features + +## Compatibility Matrix The tables below show mutually exclusive features and the support on some hardware. @@ -12,7 +14,7 @@ The symbols used have the following meanings: !!! note Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. 
-## Feature x Feature +### Feature x Feature <style> td:not(:first-child) { @@ -56,7 +58,7 @@ th:not(:first-child) { [](){ #feature-x-hardware } -## Feature x Hardware +### Feature x Hardware | Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | |-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| From 55be93baf52476a5ac24b4ec6f2feca9c22ba34d Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Mon, 8 Sep 2025 18:36:54 +0200 Subject: [PATCH 035/584] [Doc]: fix 2 hyperlinks leading to Ray site after they changed Ray's doc structure (#24438) Signed-off-by: Didier Durand <durand.didier@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/serving/parallelism_scaling.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/serving/parallelism_scaling.md b/docs/serving/parallelism_scaling.md index fa7fc1b290d50..cef1127fc5c15 100644 --- a/docs/serving/parallelism_scaling.md +++ b/docs/serving/parallelism_scaling.md @@ -66,7 +66,7 @@ Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens. -Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm/serving-llms.html) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads. +Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads. For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html). @@ -104,7 +104,7 @@ Note that `VLLM_HOST_IP` is unique for each worker. Keep the shells running thes From any node, enter a container and run `ray status` and `ray list nodes` to verify that Ray finds the expected number of nodes and GPUs. !!! tip - Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/vllm-rayservice.html). + Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/rayserve-llm-example.html). 
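As a quick sanity check, the verification step described above might look like this (a sketch; exact output varies across Ray versions):

```bash
# Run inside any node's container once the head and workers are up.
ray status       # the summary should count every node and GPU you expect
ray list nodes   # expect one entry per node, each in the ALIVE state
```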
### Running vLLM on a Ray cluster From c44797a4d6d91033dee20cb179f6ccb239ae9656 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" <rongfu.leng@daocloud.io> Date: Tue, 9 Sep 2025 00:36:57 +0800 Subject: [PATCH 036/584] [Docs]add eplb_config param use docs (#24213) Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> --- docs/serving/expert_parallel_deployment.md | 35 +++++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 280b3322b11c3..7bf87b151e6af 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -123,12 +123,33 @@ When enabled, vLLM collects load statistics with every forward pass and periodic ### EPLB Parameters +Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. The available keys and their descriptions are: + | Parameter | Description | Default | |-----------|-------------|---------| -| `--eplb-window-size` | Number of engine steps to track for rebalancing decisions | - | -| `--eplb-step-interval` | Frequency of rebalancing (every N engine steps) | - | -| `--eplb-log-balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` | -| `--num-redundant-experts` | Additional global experts per EP rank beyond equal distribution | `0` | +| `window_size` | Number of engine steps to track for rebalancing decisions | 1000 | +| `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 | +| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` | +| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` | + +For example: + +```bash +vllm serve Qwen/Qwen3-30B-A3B \ + --enable-eplb \ + --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}' +``` + +??? tip "Prefer individual arguments instead of JSON?" + + ```bash + vllm serve Qwen/Qwen3-30B-A3B \ + --enable-eplb \ + --eplb-config.window_size 1000 \ + --eplb-config.step_interval 3000 \ + --eplb-config.num_redundant_experts 2 \ + --eplb-config.log_balancedness true + ``` ### Expert Distribution Formula @@ -146,12 +167,10 @@ VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V --data-parallel-size 8 \ # Data parallelism --enable-expert-parallel \ # Enable EP --enable-eplb \ # Enable load balancer - --eplb-log-balancedness \ # Log balancing metrics - --eplb-window-size 1000 \ # Track last 1000 engine steps - --eplb-step-interval 3000 # Rebalance every 3000 steps + --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}' ``` -For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--num-redundant-experts` to 32 in large scale use cases so the most popular experts are always available. +For multi-node deployment, add these EPLB flags to each node's command. For large-scale use cases, we recommend setting `num_redundant_experts` to 32 (i.e. `--eplb-config '{"num_redundant_experts":32}'`) so the most popular experts are always available; a worked example of the resulting distribution follows below.
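As a rough worked example of that sizing (assuming DeepSeek-V3's 256 routed experts and, purely for illustration, an EP size of 32):

```text
total physical experts = 256 logical + 32 redundant = 288
experts per EP rank    = 288 / 32 = 9
```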
## Disaggregated Serving (Prefill/Decode Split) From 6f4a82f8b5a13a62dac0b84c7c817a33c602af52 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 9 Sep 2025 00:37:08 +0800 Subject: [PATCH 037/584] [Model] Enable BNB support for qwen2_5_omni_thinker (#24420) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- .../models/qwen2_5_omni_thinker.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d05eb76cdf6fd..e79428d17a70a 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -41,6 +41,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -66,7 +67,8 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -726,7 +728,7 @@ class Qwen2_5OmniConditionalGenerationMixin: dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder, ) class Qwen2_5OmniThinkerForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, + nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, Qwen2_5OmniConditionalGenerationMixin): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -734,6 +736,22 @@ class Qwen2_5OmniThinkerForConditionalGeneration( "thinker.model.": "language_model.model.", "thinker.": "", }) + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "attn.qkv": [ + "attn.q", + "attn.k", + "attn.v", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: @@ -956,3 +974,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration( mapper=self.hf_to_vllm_mapper) return loaded_weights + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="merger.", + tower_model=["visual.", "audio_tower."]) From 3feeeb9fea6c96a1006d37241c0c1cd9283fe6ef Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:32:42 -0400 Subject: [PATCH 038/584] [Spec Decode][Benchmark] Add Spec Bench Dataset for benchmarking (#23563) Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- vllm/benchmarks/datasets.py | 80 ++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 882b68ac9e2fd..2e0c02d31767d 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1020,7 +1020,7 @@ def add_dataset_parser(parser: 
FlexibleArgumentParser): default="random", choices=[ "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", - "custom", "prefix_repetition" + "custom", "prefix_repetition", "spec_bench" ], help="Name of the dataset to benchmark on.", ) @@ -1053,6 +1053,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "Skip applying chat template to prompt, used only for custom dataset.", ) + spec_bench_group = parser.add_argument_group("spec bench dataset options") + spec_bench_group.add_argument( + "--spec-bench-output-len", + type=int, + default=256, + help= + "Num of output tokens per request, used only for spec bench dataset.", + ) + spec_bench_group.add_argument( + "--spec-bench-category", + type=str, + default=None, + help= + "Category for spec bench dataset. If None, use all categories.", + ) + sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group.add_argument( "--sonnet-input-len", @@ -1404,6 +1420,14 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: else: # For datasets that follow a similar structure, use a mapping. dataset_mapping = { + "spec_bench": + lambda: SpecBench(dataset_path=args.dataset_path, + category=args.spec_bench_category).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.spec_bench_output_len, + request_id_prefix=args.request_id_prefix, + ), "sharegpt": lambda: ShareGPTDataset( random_seed=args.seed, dataset_path=args.dataset_path ).sample( @@ -1541,6 +1565,14 @@ class CustomDataset(BenchmarkDataset): request_id_prefix: str = "", **kwargs, ) -> list: + # load all data if needed + self.num_available_samples = len(self.data) + if num_requests <= 0: + num_requests = self.num_available_samples + logger.info("num_requests is set to 0 or negative, " + "so using all available samples: %d", + num_requests) + sampled_requests = [] for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: @@ -1572,6 +1604,52 @@ class CustomDataset(BenchmarkDataset): return sampled_requests +# ----------------------------------------------------------------------------- +# Spec Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SpecBench(CustomDataset): + """ + Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench + Download the dataset using: + wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl + """ # noqa: E501 + + def __init__(self, **kwargs) -> None: + self.category = kwargs.pop("category", None) + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + self.data = [] + + # Load the JSONL file + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, + lines=True) + + # check if the JSONL file has a 'turns' column + if "turns" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'turns' column.") + + for _, row in jsonl_data.iterrows(): + # sample only from a specific category if specified + if (not self.category) or (self.category == row['category']): + prompt = row["turns"][0] + self.data.append({"prompt": prompt}) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample(self, **kwargs) -> list: + # leverage CustomDataset sample + kwargs["skip_chat_template"] = False + return super().sample(**kwargs) + + # ----------------------------------------------------------------------------- # 
Sonnet Dataset Implementation
# -----------------------------------------------------------------------------

From cd086369260ccf2289b9b3d96319792705df4e5b Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Mon, 8 Sep 2025 13:32:52 -0400
Subject: [PATCH 039/584] [Spec Decode][Benchmark] Add Blazedit dataset (#23605)

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 vllm/benchmarks/datasets.py | 113 ++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 2e0c02d31767d..1c358a835cac5 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1101,6 +1101,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "from the ShareGPT dataset.",
     )
 
+    blazedit_group = parser.add_argument_group("blazedit dataset options")
+    blazedit_group.add_argument(
+        "--blazedit-min-distance",
+        type=float,
+        default=0.0,
+        help=
+        "Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+    blazedit_group.add_argument(
+        "--blazedit-max-distance",
+        type=float,
+        default=1.0,
+        help=
+        "Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+
     random_group = parser.add_argument_group("random dataset options")
     random_group.add_argument(
         "--random-input-len",
@@ -1333,6 +1349,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     elif args.dataset_name == "hf":
         # all following datasets are implemented from the
         # HuggingFaceDataset base class
+        hf_kwargs = {}
         if (
             args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
             or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
@@ -1376,6 +1393,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         ):
             dataset_class = ASRDataset
             args.hf_split = "train"
+        elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = BlazeditDataset
+            args.hf_split = "train"
+            hf_kwargs = {
+                "min_distance": args.blazedit_min_distance,
+                "max_distance": args.blazedit_max_distance,
+            }
         elif (
             args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
             or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
@@ -1415,6 +1439,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             tokenizer=tokenizer,
             output_len=args.hf_output_len,
             request_id_prefix=args.request_id_prefix,
+            **hf_kwargs
         )
 
     else:
@@ -2090,6 +2115,94 @@ class MTBenchDataset(HuggingFaceDataset):
         return sampled_requests
 
 
+# -----------------------------------------------------------------------------
+# Blazedit Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class BlazeditDataset(HuggingFaceDataset):
+    """
+    Blazedit Dataset.
+ https://github.com/ise-uiuc/blazedit + + 5k char version: vdaita/edit_5k_char + 10k char version: vdaita/edit_10k_char + """ # noqa: E501 + + # 5k char version will have output as ~5k chars + # 10k char version will have output as ~10k chars + # Assuming 3 char per token, 10k chars will be 3333 tokens + # We set default to 4000 to be safe + DEFAULT_OUTPUT_LEN = 4000 + SUPPORTED_DATASET_PATHS = { + "vdaita/edit_5k_char", + "vdaita/edit_10k_char", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + request_id_prefix: str = "", + min_distance: float = 0.0, + max_distance: float = 1.0, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + code = item["code"] + change_request = item["change_request"] + norm_distance = item["norm_distance"] + + # compare the levenshtein distance normalized by code length + if norm_distance < min_distance or norm_distance > max_distance: + continue + + # template copied from + # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501 + instruction = f"""Given a code file, please apply the change requests and generate the new file. + +Original file: +```python +{code} +``` + +Change request: +{change_request} + +Please generate the new code file in the "New file" section below.""" # noqa: E501 + + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": instruction + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + )) + self.maybe_oversample_requests(sampled_requests, num_requests, + request_id_prefix) + + return sampled_requests + + # ----------------------------------------------------------------------------- # AIMO Dataset Implementation # ----------------------------------------------------------------------------- From 8d7f39b48c9df750678171e37586dd84def5737e Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 9 Sep 2025 02:02:14 +0800 Subject: [PATCH 040/584] [Model] Remove quantized mixtral (#24437) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/models/registry.py | 1 - vllm/model_executor/model_loader/utils.py | 16 - vllm/model_executor/models/mixtral_quant.py | 454 -------------------- vllm/model_executor/models/registry.py | 1 - 4 files changed, 472 deletions(-) delete mode 100644 vllm/model_executor/models/mixtral_quant.py diff --git a/tests/models/registry.py b/tests/models/registry.py index e4c215b108446..8bcdeb087c1f9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -285,7 +285,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501 {"tiny": "TitanML/tiny-mixtral"}), # noqa: E501 - "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501 "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), "NemotronForCausalLM": 
_HfExamplesInfo("nvidia/Minitron-8B-Base"), diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index f57ebdb1abcbc..c82fa5a40aa53 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -169,22 +169,6 @@ def get_model_architecture( model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) - # Special handling for quantized Mixtral. - # FIXME(woosuk): This is a temporary hack. - mixtral_supported = [ - "fp8", - "compressed-tensors", - "gptq_marlin", - "awq_marlin", - "quark", - "bitsandbytes", - ] - - if (model_config.quantization is not None - and model_config.quantization not in mixtral_supported - and "MixtralForCausalLM" in architectures): - architectures = ["QuantMixtralForCausalLM"] - model_cls, arch = model_config.registry.resolve_model_cls( architectures, model_config=model_config, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py deleted file mode 100644 index 692267b4d7271..0000000000000 --- a/vllm/model_executor/models/mixtral_quant.py +++ /dev/null @@ -1,454 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only Mixtral model.""" -from collections.abc import Iterable -from itertools import islice -from typing import Optional, Union - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn -from transformers import MixtralConfig - -from vllm.attention import Attention -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsPP -from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) - - -class MixtralMLP(nn.Module): - - def __init__( - self, - num_experts: int, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.num_experts = num_experts - self.ffn_dim = intermediate_size - self.hidden_dim = hidden_size - - self.w1 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) - self.w2 = ReplicatedLinear(self.ffn_dim, - self.hidden_dim, - bias=False, - quant_config=quant_config) - self.w3 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - quant_config=quant_config) - - # TODO: Use vllm's SiluAndMul - self.act_fn = nn.SiLU() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - w1_out, _ = self.w1(hidden_states) - w1_out = self.act_fn(w1_out) - w3_out, _ = self.w3(hidden_states) - current_hidden_states = w1_out * w3_out - current_hidden_states, _ = self.w2(current_hidden_states) - return current_hidden_states - - -class MixtralMoE(nn.Module): - - def __init__( - self, - config: MixtralConfig, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.config = config - self.rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.num_total_experts = config.num_local_experts - self.top_k = config.num_experts_per_tok - if self.tp_size > self.num_total_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_total_experts}.") - # Split experts equally between ranks - self.expert_indices = np.array_split(range(self.num_total_experts), - self.tp_size)[self.rank].tolist() - if not self.expert_indices: - raise ValueError( - f"Rank {self.rank} has no experts assigned to it.") - - self.experts = nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config) - if idx in self.expert_indices else None - for idx in range(self.num_total_experts) - ]) - self.gate = ReplicatedLinear(config.hidden_size, - self.num_total_experts, - bias=False, - 
quant_config=None) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = None - for expert_idx in self.expert_indices: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum(dim=-1, - keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states).view( - num_tokens, hidden_dim) - - -class MixtralAttention(nn.Module): - - def __init__( - self, - config: MixtralConfig, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - # MixtralConfig has an optional head_dim argument - self.head_dim = getattr(config, "head_dim", None) - if self.head_dim is None: - self.head_dim = self.hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - quant_config=quant_config, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=int(self.rope_theta), - is_neox_style=True, - ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MixtralDecoderLayer(nn.Module): - - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MixtralAttention( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - self.block_sparse_moe = MixtralMoE(config=config, - quant_config=quant_config) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.block_sparse_moe(hidden_states) - return hidden_states, residual - - -class MixtralModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.start_layer, self.end_layer, self.layers = make_layers( - 
config.num_hidden_layers, - lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix - ), - prefix=f"{prefix}.layers") - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for layer in islice(self.layers, self.start_layer, self.end_layer): - hidden_states, residual = layer(positions, hidden_states, residual) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if name.endswith("scale"): - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip experts that are not assigned to this worker. - if ("block_sparse_moe.experts." 
in name - and name not in params_dict): - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MixtralForCausalLM(nn.Module, SupportsPP): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = MixtralModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c522fcab7f333..43075956b450b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -110,7 +110,6 @@ _TEXT_GENERATION_MODELS = { "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), From 7be141b2c52366f0cc4e731c36819aed178d2258 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Tue, 9 Sep 2025 02:48:06 +0800 Subject: [PATCH 041/584] [CI] Enable encoder model compilation test (#24442) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- tests/compile/test_basic_correctness.py | 31 ++++++++++++++----------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 422cb94b036ca..f6783704342f6 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -62,8 +62,12 @@ class TestSetting: TestSetting( model="BAAI/bge-multilingual-gemma2", model_args=[ - "--runner", "pooling", "--dtype", "bfloat16", - "--max-model-len", "2048" + "--runner", + "pooling", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", ], pp_size=1, tp_size=1, @@ -71,17 +75,15 @@ class TestSetting: method="encode", 
fullgraph=True, ), - # TODO: bert models are not supported in V1 yet - # # encoder-based embedding model (BERT) - # TestSetting( - # model="BAAI/bge-base-en-v1.5", - # model_args=["--runner", "pooling"], - # pp_size=1, - # tp_size=1, - # attn_backend="XFORMERS", - # method="encode", - # fullgraph=True, - # ), + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--runner", "pooling"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="encode", + fullgraph=True, + ), # vision language model TestSetting( model="microsoft/Phi-3.5-vision-instruct", @@ -92,7 +94,8 @@ class TestSetting: method="generate_with_image", fullgraph=False, ), - ]) + ], +) def test_compile_correctness( monkeypatch: pytest.MonkeyPatch, test_setting: TestSetting, From 43d9ad03ba8584547537da71aee3e1810a0abbfd Mon Sep 17 00:00:00 2001 From: Yang Kaiyong <kaiyongyang@outlook.com> Date: Tue, 9 Sep 2025 02:49:39 +0800 Subject: [PATCH 042/584] [Model loader]: support multi-thread model weight loading (#23928) Signed-off-by: Yang Kaiyong <yangkaiyong.yky@antgroup.com> Signed-off-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Simon Mo <simon.mo@hey.com> --- .../model_loader/default_loader.py | 53 +++++++++++---- .../model_loader/weight_utils.py | 64 +++++++++++++++++++ 2 files changed, 105 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index c8ad3a55d932a..4badc31753445 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -18,8 +18,9 @@ from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, maybe_download_from_modelscope, - np_cache_weights_iterator, pt_weights_iterator, - safetensors_weights_iterator) + multi_thread_pt_weights_iterator, + multi_thread_safetensors_weights_iterator, np_cache_weights_iterator, + pt_weights_iterator, safetensors_weights_iterator) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -28,6 +29,9 @@ logger = init_logger(__name__) class DefaultModelLoader(BaseModelLoader): """Model loader that can load different file types from disk.""" + # default number of thread when enable multithread weight loading + DEFAULT_NUM_THREADS = 8 + @dataclasses.dataclass class Source: """A source for weights.""" @@ -52,9 +56,15 @@ class DefaultModelLoader(BaseModelLoader): def __init__(self, load_config: LoadConfig): super().__init__(load_config) - if load_config.model_loader_extra_config: - raise ValueError(f"Model loader extra config is not supported for " - f"load format {load_config.load_format}") + + extra_config = load_config.model_loader_extra_config + allowed_keys = {"enable_multithread_load", "num_threads"} + unexpected_keys = set(extra_config.keys()) - allowed_keys + + if unexpected_keys: + raise ValueError(f"Unexpected extra config keys for load format " + f"{load_config.load_format}: " + f"{unexpected_keys}") def _prepare_weights( self, @@ -145,6 +155,7 @@ class DefaultModelLoader(BaseModelLoader): self, source: "Source" ) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" + extra_config = self.load_config.model_loader_extra_config hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( source.model_or_path, source.revision, 
source.fall_back_to_pt, source.allow_patterns_overrides) @@ -165,16 +176,34 @@ class DefaultModelLoader(BaseModelLoader): self.load_config.use_tqdm_on_load, ) else: - weights_iterator = safetensors_weights_iterator( + if extra_config.get("enable_multithread_load"): + weights_iterator = ( + multi_thread_safetensors_weights_iterator( + hf_weights_files, + self.load_config.use_tqdm_on_load, + max_workers=extra_config.get( + "num_threads", self.DEFAULT_NUM_THREADS), + )) + else: + weights_iterator = safetensors_weights_iterator( + hf_weights_files, + self.load_config.use_tqdm_on_load, + ) + else: + if extra_config.get("enable_multithread_load"): + weights_iterator = multi_thread_pt_weights_iterator( hf_weights_files, self.load_config.use_tqdm_on_load, + self.load_config.pt_load_map_location, + max_workers=extra_config.get("num_threads", + self.DEFAULT_NUM_THREADS), + ) + else: + weights_iterator = pt_weights_iterator( + hf_weights_files, + self.load_config.use_tqdm_on_load, + self.load_config.pt_load_map_location, ) - else: - weights_iterator = pt_weights_iterator( - hf_weights_files, - self.load_config.use_tqdm_on_load, - self.load_config.pt_load_map_location, - ) if current_platform.is_tpu(): from vllm.platforms.tpu import USE_TPU_COMMONS diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 50056038b6506..a4eda36148d7a 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for downloading and initializing model weights.""" +import concurrent.futures import fnmatch import glob import hashlib @@ -531,6 +532,36 @@ def safetensors_weights_iterator( yield name, param +def multi_thread_safetensors_weights_iterator( + hf_weights_files: list[str], + use_tqdm_on_load: bool, + max_workers: int = 4, +) -> Generator[tuple[str, torch.Tensor], None, None]: + """Multi-Thread iterate over the weights in the model safetensor files.""" + + def _load_file(st_file: str): + result = load_file(st_file, device="cpu") + return result + + with concurrent.futures.ThreadPoolExecutor( + max_workers=max_workers) as executor: + futures = [ + executor.submit(_load_file, st_file) + for st_file in hf_weights_files + ] + futures_iter = tqdm( + concurrent.futures.as_completed(futures), + total=len(hf_weights_files), + desc="Multi-thread loading shards", + disable=not enable_tqdm(use_tqdm_on_load), + bar_format=_BAR_FORMAT, + ) + + for future in futures_iter: + state_dict = future.result() + yield from state_dict.items() + + def runai_safetensors_weights_iterator( hf_weights_files: list[str], use_tqdm_on_load: bool, @@ -611,6 +642,39 @@ def pt_weights_iterator( del state +def multi_thread_pt_weights_iterator( + hf_weights_files: list[str], + use_tqdm_on_load: bool, + pt_load_map_location: Union[str, dict[str, str]] = "cpu", + max_workers: int = 4, +) -> Generator[tuple[str, torch.Tensor], None, None]: + """Multi-Thread iterate over the weights in the model bin/pt files.""" + + def _load_file(bin_file: str): + return torch.load(bin_file, + map_location=pt_load_map_location, + weights_only=True) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=max_workers) as executor: + futures = [ + executor.submit(_load_file, bin_file) + for bin_file in hf_weights_files + ] + futures_iter = tqdm( + concurrent.futures.as_completed(futures), + total=len(hf_weights_files), 
+ desc="Multi-thread loading pt checkpoint shards", + disable=not enable_tqdm(use_tqdm_on_load), + bar_format=_BAR_FORMAT, + ) + + for future in futures_iter: + state = future.result() + yield from state.items() + del state + + def get_gguf_extra_tensor_names( gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]: reader = gguf.GGUFReader(gguf_file) From 41183c1fe09c60cd77d683e64895c08f0d84b693 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 8 Sep 2025 16:44:13 -0400 Subject: [PATCH 043/584] [Spec Decode] Fix offline spec_decode.py (#24257) Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/benchmarks/datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1c358a835cac5..784536054a19f 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1310,6 +1310,10 @@ def add_dataset_parser(parser: FlexibleArgumentParser): def get_samples(args, tokenizer) -> list[SampleRequest]: + + if not hasattr(args, "request_id_prefix"): + args.request_id_prefix = "" + if args.dataset_name == "custom": dataset = CustomDataset(dataset_path=args.dataset_path) input_requests = dataset.sample( From 620db1fc5879329dae62367238f97d446904a3ee Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni001@gmail.com> Date: Mon, 8 Sep 2025 15:05:26 -0700 Subject: [PATCH 044/584] [Attention] FlashAttention MLA cudagraph support (#23958) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- .../compile/piecewise/test_full_cudagraph.py | 12 ++- tests/v1/attention/test_mla_backends.py | 5 -- tests/v1/cudagraph/test_cudagraph_mode.py | 10 +++ vllm/v1/attention/backends/mla/common.py | 22 ++++-- .../attention/backends/mla/flashattn_mla.py | 77 +++++++++++++++++-- vllm/v1/attention/backends/mla/flashmla.py | 11 +-- .../attention/backends/mla/rocm_aiter_mla.py | 10 ++- 7 files changed, 118 insertions(+), 29 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 97140a9db7af6..2454f85342eba 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -61,6 +61,16 @@ backend_configs = { "cudagraph_mode": "FULL_AND_PIECEWISE", }, specific_gpu_arch=(9, 0)), + # FlashAttention MLA on Hopper + "FlashAttentionMLA": + BackendConfig(name="FlashAttentionMLA", + env_vars={ + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + }, + comp_config={ + "cudagraph_mode": "FULL_DECODE_ONLY", + }, + specific_gpu_arch=(9, 0)), # Cutlass MLA on Blackwell "CutlassMLA": BackendConfig( @@ -102,7 +112,7 @@ backend_configs = { test_params_full_cudagraph = [] # deepseek-ai/DeepSeek-V2-Lite with MLA -MLA_backends = ["FlashMLA", "CutlassMLA"] +MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] for mla_backend in MLA_backends: test_params_full_cudagraph.append( pytest.param( diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index e7cd116fdc834..a62993950affe 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -73,7 +73,6 @@ def create_and_prepopulate_kv_cache( kv_c_contexts: list[torch.Tensor], k_pe_contexts: list[torch.Tensor], block_size: int, - num_kv_heads: int, head_size: int, dtype: torch.dtype, device: 
torch.device, @@ -87,7 +86,6 @@ def create_and_prepopulate_kv_cache( k_pe_contexts: List of key positional embedding context tensors for each sequence block_size: Size of each block - num_kv_heads: Number of KV heads (should be 1 for MLA) head_size: Size of each head (latent dimension) dtype: Data type for the cache device: Device to create the cache on @@ -285,8 +283,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): query_lens = batch_spec.query_lens num_q_heads = vllm_config.model_config.get_num_attention_heads( vllm_config.parallel_config) - num_kv_heads = vllm_config.model_config.get_num_kv_heads( - vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) block_size = vllm_config.cache_config.block_size @@ -476,7 +472,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): kv_c_contexts=kv_c_contexts, k_pe_contexts=k_pe_contexts, block_size=block_size, - num_kv_heads=num_kv_heads, head_size=head_size, dtype=dtype, device=device, diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 81655e4175006..25e01806f4956 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -62,6 +62,16 @@ backend_configs = { "cudagraph_mode": "FULL_AND_PIECEWISE", }, specific_gpu_arch=(9, 0)), + # FlashAttention MLA on Hopper + "FlashAttentionMLA": + BackendConfig(name="FlashAttentionMLA", + env_vars={ + "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + }, + comp_config={ + "cudagraph_mode": "FULL_DECODE_ONLY", + }, + specific_gpu_arch=(9, 0)), # FA2 "FA2": BackendConfig(name="FA2", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index ec1216a16bc46..226bc436058d8 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -443,11 +443,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.metadata_cls = metadata_cls \ if metadata_cls is not None else MLACommonMetadata self.kv_cache_spec = kv_cache_spec - self.device = device scheduler_config = vllm_config.scheduler_config self.model_config = vllm_config.model_config - cache_config = vllm_config.cache_config parallel_config = vllm_config.parallel_config + cache_config = vllm_config.cache_config + self.compilation_config = vllm_config.compilation_config + self.device = device + self.num_heads = self.model_config.get_num_attention_heads( parallel_config) self.mla_dims = get_mla_dims(self.model_config) @@ -608,10 +610,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill.prefill_main = self._fi_prefill_main prefill.prefill_chunks = self._fi_prefill_chunks - def _build_decode( - self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, - seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, - query_start_loc_device: torch.Tensor) -> MLACommonDecodeMetadata: + def _build_decode(self, block_table_tensor: torch.Tensor, + seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, + query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor, + num_decode_tokens: int) -> MLACommonDecodeMetadata: return MLACommonDecodeMetadata( block_table=block_table_tensor, seq_lens=seq_lens_device, @@ -624,11 +628,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): Currently, only decode is supported for full cudagraphs with MLA. 
""" m = common_attn_metadata - assert m.num_reqs == m.num_actual_tokens, \ + assert m.num_reqs <= (m.num_actual_tokens * + self.reorder_batch_threshold), \ "MLA only supports decode-only full CUDAGraph capture. " \ "Make sure all cudagraph capture sizes <= max_num_seq." - assert m.max_query_len == 1 # decode-only + assert m.max_query_len <= self.reorder_batch_threshold # decode only return self.build(0, m) @@ -819,6 +824,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): seq_lens_device=seq_lens[:num_decodes], query_start_loc_cpu=query_start_loc_cpu[:num_decodes + 1], query_start_loc_device=query_start_loc[:num_decodes + 1], + num_decode_tokens=num_decode_tokens, ) attn_metadata = self.metadata_cls( diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index e2a63c2f577e0..12f206637d7c6 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -17,11 +17,16 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata logger = init_logger(__name__) +# NOTE(matt): This is an arbitrary number, copied from +# woosuk's implementation in standard FlashAttention backend +_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16 + class FlashAttnMLABackend(MLACommonBackend): @@ -48,6 +53,7 @@ class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata): max_query_len: int max_seq_len: int scheduler_metadata: Optional[torch.Tensor] = None + max_num_splits: int = 0 @dataclass @@ -57,14 +63,41 @@ class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]): class FlashAttnMLAMetadataBuilder( MLACommonMetadataBuilder[FlashAttnMLAMetadata]): + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_BATCH + reorder_batch_threshold: ClassVar[int] = 512 def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): super().__init__(kv_cache_spec, layer_names, vllm_config, device, FlashAttnMLAMetadata) + self.max_num_splits = 0 # No upper bound on the number of splits. self.fa_aot_schedule = (get_flash_attn_version() == 3) + self.use_full_cuda_graph = \ + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + + if self.use_full_cuda_graph and self.fa_aot_schedule: + self.max_cudagraph_size = self.compilation_config.max_capture_size + + if self.max_cudagraph_size > 992: + # This condition derives from FA3's internal heuristic. + # TODO(woosuk): Support larger cudagraph sizes. + raise ValueError( + "Capture size larger than 992 is not supported for " + "full cuda graph.") + + self.scheduler_metadata = torch.zeros( + vllm_config.scheduler_config.max_num_seqs + 1, + dtype=torch.int32, + device=self.device, + ) + # When using cuda graph, we need to set the upper bound of the + # number of splits so that large enough intermediate buffers are + # pre-allocated during capture. 
+ self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): if self.fa_aot_schedule: @@ -81,14 +114,16 @@ class FlashAttnMLAMetadataBuilder( page_size=self.page_size, cu_seqlens_q=cu_query_lens, causal=causal, + num_splits=self.max_num_splits, ) return None - def _build_decode( - self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, - seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, - query_start_loc_device: torch.Tensor - ) -> FlashAttnMLADecodeMetadata: + def _build_decode(self, block_table_tensor: torch.Tensor, + seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, + query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor, + num_decode_tokens: int) -> FlashAttnMLADecodeMetadata: query_lens_cpu = (query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]) max_query_len = query_lens_cpu.max().item() max_seq_len = seq_lens_cpu.max().item() @@ -102,6 +137,29 @@ class FlashAttnMLAMetadataBuilder( causal=True, ) + # For FA3 + full cudagraph + max_num_splits = 0 + if self.use_full_cuda_graph and scheduler_metadata is not None: + n = scheduler_metadata.shape[0] + # Ensure the persistent buffer is large enough + assert n <= self.scheduler_metadata.shape[0], \ + f"Scheduler metadata size {n} exceeds buffer size " + \ + f"{self.scheduler_metadata.shape[0]}" + self.scheduler_metadata[:n] = scheduler_metadata + # NOTE(woosuk): We should zero out the rest of the scheduler + # metadata to guarantee the correctness. Otherwise, some thread + # blocks may use the invalid scheduler metadata and overwrite the + # output buffer. + self.scheduler_metadata[n:] = 0 + scheduler_metadata = self.scheduler_metadata[:n] + + if num_decode_tokens <= self.max_cudagraph_size: + # NOTE(woosuk): Setting num_splits > 1 may increase the memory + # usage, because the intermediate buffers of size [num_splits, + # num_heads, num_tokens, head_size] are allocated. Therefore, + # we only set num_splits when using cuda graphs. + max_num_splits = self.max_num_splits + return FlashAttnMLADecodeMetadata( block_table=block_table_tensor, seq_lens=seq_lens_device, @@ -109,6 +167,7 @@ class FlashAttnMLAMetadataBuilder( max_query_len=max_query_len, max_seq_len=max_seq_len, scheduler_metadata=scheduler_metadata, + max_num_splits=max_num_splits, ) @@ -175,12 +234,17 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank] k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank:] + # NOTE(matt): During CUDA graph capture, max_query_len can be 0, but the + # kernel uses this to calculate grid dimensions. Ensure it's at least 1 + # to prevent invalid grid configuration during graph capture. 
+ max_seqlen_q = max(attn_metadata.decode.max_query_len, 1) + o = flash_attn_varlen_func( q=q_pe, k=k_pe_cache.unsqueeze(-2), # Add head dim of 1 v=kv_c_cache.unsqueeze(-2), # Add head dim of 1 q_v=q_nope, - max_seqlen_q=attn_metadata.decode.max_query_len, + max_seqlen_q=max_seqlen_q, cu_seqlens_q=attn_metadata.decode.query_start_loc, max_seqlen_k=attn_metadata.decode.max_seq_len, seqused_k=attn_metadata.decode.seq_lens, @@ -189,6 +253,7 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): causal=True, fa_version=3, # only version 3 is supported scheduler_metadata=attn_metadata.decode.scheduler_metadata, + num_splits=attn_metadata.decode.max_num_splits, ) return self._v_up_proj(o) diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 1824bbadb6a1a..2f13f19218d9c 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -62,7 +62,6 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): super().__init__(kv_cache_spec, layer_names, vllm_config, device, FlashMLAMetadata) - self.compilation_config = vllm_config.compilation_config self.num_q_heads = vllm_config.model_config.get_num_attention_heads( vllm_config.parallel_config) @@ -85,10 +84,12 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): device=self.device, dtype=torch.int32) - def _build_decode( - self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, - seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, - query_start_loc_device: torch.Tensor) -> FlashMLADecodeMetadata: + def _build_decode(self, block_table_tensor: torch.Tensor, + seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, + query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor, + num_decode_tokens: int) -> FlashMLADecodeMetadata: tile_scheduler_metadata, num_splits = \ get_mla_metadata( seq_lens_device, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index fc6b1998e8eb0..db27a34d8959a 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -104,10 +104,12 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): dtype=torch.int32, device=device) - def _build_decode( - self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, - seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, - query_start_loc_device: torch.Tensor) -> AiterMLADecodeMetadata: + def _build_decode(self, block_table_tensor: torch.Tensor, + seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, + query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor, + num_decode_tokens: int) -> AiterMLADecodeMetadata: page_size = self.kv_cache_spec.block_size block_table_bounds = (seq_lens_device + page_size - 1) // page_size device = self.device From e680723eba82c8f486dfb8ba00f72cccb6be5727 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Tue, 9 Sep 2025 06:28:03 +0800 Subject: [PATCH 045/584] [Bugfix] Disable the statslogger if the api_server_count is greater than 1 (#22227) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com> --- vllm/v1/engine/async_llm.py | 1 + vllm/v1/metrics/loggers.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 
d23602eaaffa9..f57075c6fa82d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -143,6 +143,7 @@ class AsyncLLM(EngineClient): engine_idxs=self.engine_core.engine_ranks_managed, custom_stat_loggers=stat_loggers, enable_default_loggers=log_stats, + client_count=client_count, ) self.logger_manager.log_engine_initialized() diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f480344c854f7..347185d8341ee 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -652,6 +652,7 @@ class StatLoggerManager: engine_idxs: Optional[list[int]] = None, custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, enable_default_loggers: bool = True, + client_count: int = 1, ): self.engine_idxs = engine_idxs if engine_idxs else [0] @@ -660,7 +661,12 @@ class StatLoggerManager: factories.extend(custom_stat_loggers) if enable_default_loggers and logger.isEnabledFor(logging.INFO): - factories.append(LoggingStatLogger) + if client_count > 1: + logger.warning( + "AsyncLLM created with api_server_count more than 1; " + "disabling stats logging to avoid incomplete stats.") + else: + factories.append(LoggingStatLogger) # engine_idx: StatLogger self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} From e10fef08838612b4560e9c72e5cb1414a5edfa13 Mon Sep 17 00:00:00 2001 From: R3hankhan <Rehan.Khan7@ibm.com> Date: Tue, 9 Sep 2025 05:20:34 +0530 Subject: [PATCH 046/584] [Hardware][IBM Z] Fix Outlines Core issue for s390x (#24034) Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com> --- docker/Dockerfile.s390x | 46 +++++++++++++++++++++++++++++++++++++++-- requirements/common.txt | 3 +-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 9270b48c54d4b..9942b7626f81e 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -16,7 +16,8 @@ ENV LANG=C.UTF-8 \ RUN microdnf install -y \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ - openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \ + clang llvm-devel llvm-static clang-devel && \ microdnf clean all # Python Installation @@ -191,7 +192,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ -DCOMPILER_RT_BUILD_ORC=OFF \ -DCOMPILER_RT_INCLUDE_TESTS=OFF \ ${CMAKE_ARGS} -GNinja ../llvm \ - && ninja install . 
&& \ # build llvmlite cd ../../llvmlite && python setup.py bdist_wheel && \ @@ -200,6 +200,45 @@ RUN --mount=type=cache,target=/root/.cache/uv \ sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \ fi && python setup.py bdist_wheel +# Edit aws-lc-sys to support s390x +FROM python-install AS aws-lc-sys-editor +WORKDIR /tmp +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ARG AWS_LC_VERSION=v0.30.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone --recursive https://github.com/aws/aws-lc-rs.git && \ + cd aws-lc-rs && \ + git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \ + git submodule sync && \ + git submodule update --init --recursive && \ + cd aws-lc-sys && \ + sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \ + sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \ + sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c + +# Build Outlines Core +FROM python-install AS outlines-core-builder +WORKDIR /tmp +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ARG OUTLINES_CORE_VERSION=0.2.10 +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \ + git clone https://github.com/dottxt-ai/outlines-core.git && \ + cd outlines-core && \ + git checkout tags/${OUTLINES_CORE_VERSION} && \ + sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \ + echo '[patch.crates-io]' >> Cargo.toml && \ + echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \ + uv pip install maturin && \ + python -m maturin build --release --out dist # Final build stage FROM python-install AS vllm-cpu @@ -230,6 +269,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \ + --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \ @@ -237,6 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \ LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \ NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \ + OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ @@ -244,6 +285,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ $TORCH_WHL_FILE \ $LLVM_WHL_FILE \ $NUMBA_WHL_FILE \ + $OUTLINES_CORE_WHL_FILE \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ -r requirements/cpu.txt diff --git a/requirements/common.txt b/requirements/common.txt index 
ce0795488cc1e..8f5bc9176d90a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -20,8 +20,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.11.3 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" -outlines_core == 0.2.10 ; platform_machine != "s390x" -outlines == 0.1.11 ; platform_machine == "s390x" +outlines_core == 0.2.10 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 From 6910b56da2226f88dd5b825ae57af8dea4e1ac47 Mon Sep 17 00:00:00 2001 From: Sahithi Chigurupati <58884509+csahithi@users.noreply.github.com> Date: Mon, 8 Sep 2025 18:18:09 -0700 Subject: [PATCH 047/584] [CI] Add nightly multiarch manifests to dockerhub (#24102) Signed-off-by: Sahithi Chigurupati <chigurupati.sahithi@gmail.com> Signed-off-by: Simon Mo <simon.mo@hey.com> Signed-off-by: simon-mo <simon.mo@hey.com> Co-authored-by: Simon Mo <simon.mo@hey.com> --- .buildkite/release-pipeline.yaml | 22 +++++ .buildkite/scripts/cleanup-nightly-builds.sh | 97 ++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100755 .buildkite/scripts/cleanup-nightly-builds.sh diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 597dfbf99022d..a1de41652c9a6 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -149,3 +149,25 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" + + - label: "Build and publish nightly multi-arch image to DockerHub" + depends_on: + - create-multi-arch-manifest + if: build.env("NIGHTLY") == "1" + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + - "docker push vllm/vllm-openai:nightly" + - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + # Clean up old nightly builds (keep only last 14) + - "bash .buildkite/scripts/cleanup-nightly-builds.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh new file mode 100755 index 0000000000000..1a82f7d085233 --- /dev/null +++ b/.buildkite/scripts/cleanup-nightly-builds.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +set -ex + +# Clean up old nightly builds from DockerHub, keeping only the last 14 builds +# This script uses DockerHub API to list and delete old tags with "nightly-" prefix + +# DockerHub API endpoint for vllm/vllm-openai repository +REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" + +# Get DockerHub token from environment +if [ -z "$DOCKERHUB_TOKEN" ]; then + echo "Error: DOCKERHUB_TOKEN environment variable is not set" + exit 1 +fi + +# Function to get all tags from DockerHub +get_all_tags() { + local page=1 + local all_tags="" + + while true; do + local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \ + "$REPO_API_URL?page=$page&page_size=100") + + # Get both 
last_updated timestamp and tag name, separated by | + local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') + + if [ -z "$tags" ]; then + break + fi + + all_tags="$all_tags$tags"$'\n' + page=$((page + 1)) + done + + # Sort by timestamp (newest first) and extract just the tag names + echo "$all_tags" | sort -r | cut -d'|' -f2 +} + +delete_tag() { + local tag_name="$1" + echo "Deleting tag: $tag_name" + + local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" + local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url") + + if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then + echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" + else + echo "Successfully deleted tag: $tag_name" + fi +} + +# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first) +echo "Fetching all tags from DockerHub..." +all_tags=$(get_all_tags) + +if [ -z "$all_tags" ]; then + echo "No tags found to clean up" + exit 0 +fi + +# Count total tags +total_tags=$(echo "$all_tags" | wc -l) +echo "Found $total_tags tags" + +# Keep only the last 14 builds (including the current one) +tags_to_keep=14 +tags_to_delete=$((total_tags - tags_to_keep)) + +if [ $tags_to_delete -le 0 ]; then + echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)" + exit 0 +fi + +echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep" + +# Get tags to delete (skip the first $tags_to_keep tags) +tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1))) + +if [ -z "$tags_to_delete_list" ]; then + echo "No tags to delete" + exit 0 +fi + +# Delete old tags +echo "Deleting old tags..." 
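+# NOTE: the here-string below keeps the loop in the current shell; piping
+# `echo "$tags_to_delete_list" | while ...` would run the body in a subshell.
+# `IFS= read -r` preserves leading whitespace and backslashes in tag names.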
+while IFS= read -r tag; do + if [ -n "$tag" ]; then + delete_tag "$tag" + # Add a small delay to avoid rate limiting + sleep 1 + fi +done <<< "$tags_to_delete_list" + +echo "Cleanup completed successfully" From 4f87abdcc6efc9946f4d7a205f4bbedf5d53b014 Mon Sep 17 00:00:00 2001 From: Zhiyu <zhiyuc@nvidia.com> Date: Mon, 8 Sep 2025 18:53:13 -0700 Subject: [PATCH 048/584] Update reviewers for modelopt related files (#24468) --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 495d207d44260..befad23da8664 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -273,6 +273,20 @@ pull_request_rules: users: - "sangstar" +- name: assign reviewer for modelopt changes + conditions: + - or: + - files~=^vllm/model_executor/layers/quantization/modelopt\.py$ + - files~=^vllm/model_executor/layers/quantization/__init__\.py$ + - files~=^tests/models/quantization/test_modelopt\.py$ + - files~=^tests/quantization/test_modelopt\.py$ + - files~=^tests/models/quantization/test_nvfp4\.py$ + - files~=^docs/features/quantization/modelopt\.md$ + actions: + assign: + users: + - "Edwardf0t1" + - name: remove 'needs-rebase' label when conflict is resolved conditions: - -conflict From 955c624915d66e42525f9b6e8e26a51d3892be6f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith <tyler@neuralmagic.com> Date: Mon, 8 Sep 2025 22:01:51 -0400 Subject: [PATCH 049/584] [Bugfix][Wide EP] Fix redundant work when using DeepEP, TP Attn, and EP MoE (#24134) Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> --- vllm/model_executor/layers/fused_moe/layer.py | 21 ++- vllm/model_executor/models/deepseek_eagle.py | 7 +- vllm/model_executor/models/deepseek_mtp.py | 27 ++-- vllm/model_executor/models/deepseek_v2.py | 136 ++++++++++++++---- 4 files changed, 132 insertions(+), 59 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 272ad39565375..2f88a63665c52 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -35,7 +35,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, +from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx, round_up) if current_platform.is_cuda_alike(): @@ -786,6 +786,7 @@ class FusedMoE(CustomOp): enable_eplb: bool = False, num_redundant_experts: int = 0, has_bias: bool = False, + is_sequence_parallel=False, ): super().__init__() if params_dtype is None: @@ -797,6 +798,10 @@ class FusedMoE(CustomOp): dp_size_ = (dp_size if dp_size is not None else get_dp_group().world_size) + self.is_sequence_parallel = is_sequence_parallel + if self.is_sequence_parallel: + self.sp_size = tp_size_ + vllm_config = get_current_vllm_config() self.moe_parallel_config: FusedMoEParallelConfig = ( FusedMoEParallelConfig.make( @@ -1699,14 +1704,22 @@ class FusedMoE(CustomOp): ctx = get_forward_context() # flashinfer_cutlass_kernels can handle: optional DP + TP/EP - max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu + max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens + + # If the input to the MoE is sequence parallel then divide by sp_size + # to find the 
maximum number of tokens for any individual dispatcher. + if self.is_sequence_parallel: + max_tokens_across_dispatchers = cdiv(max_tokens_across_dispatchers, + self.sp_size) + num_tokens = full_hidden_states.size(0) for chunk_idx, chunk_start_ in enumerate( - range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank)): + range(0, max_tokens_across_dispatchers, + moe_dp_chunk_size_per_rank)): chunk_start = chunk_start_ chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank, - max_tokens_across_dp) + max_tokens_across_dispatchers) # clamp start and end chunk_start = min(chunk_start, num_tokens - 1) chunk_end = min(chunk_end, num_tokens) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 0c9c83cf61000..5e8447a7f48f9 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -37,8 +37,6 @@ class DeepseekV2Model(nn.Module): super().__init__() self.config = vllm_config. \ speculative_config.draft_model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.vocab_size = self.config.vocab_size @@ -51,11 +49,8 @@ class DeepseekV2Model(nn.Module): self.layers = nn.ModuleList([ DeepseekV2DecoderLayer( - self.config, + vllm_config, prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, ) for i in range(self.config.num_hidden_layers) ]) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 0ad001be71c19..8fbf16d206a86 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -43,23 +43,19 @@ class SharedHead(nn.Module): class DeepSeekMultiTokenPredictorLayer(nn.Module): - def __init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str) -> None: super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False) self.shared_head = SharedHead(config=config, quant_config=quant_config) - self.mtp_block = DeepseekV2DecoderLayer(config, prefix, model_config, - cache_config, quant_config) + self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix) def forward( self, @@ -95,13 +91,8 @@ class DeepSeekMultiTokenPredictor(nn.Module): # to map the exact layer index from weights self.layers = torch.nn.ModuleDict({ str(idx): - DeepSeekMultiTokenPredictorLayer( - config, - f"{prefix}.layers.{idx}", - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - quant_config=vllm_config.quant_config, - ) + DeepSeekMultiTokenPredictorLayer(vllm_config, + f"{prefix}.layers.{idx}") for 
idx in range(self.mtp_start_layer_idx, self.mtp_start_layer_idx + self.num_mtp_layers) }) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d65dcfebaeff8..e4a21febc5bde 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -32,12 +32,14 @@ import torch from torch import nn from transformers import DeepseekV2Config, DeepseekV3Config +import vllm.envs as envs from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CacheConfig, ModelConfig, VllmConfig, - get_current_vllm_config) +from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.distributed import (get_ep_group, get_pp_group, - get_tensor_model_parallel_world_size) + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -55,7 +57,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from vllm.utils import cdiv, direct_register_custom_op from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -72,19 +76,27 @@ class DeepseekV2MLP(nn.Module): hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + is_sequence_parallel=False, prefix: str = "", ) -> None: super().__init__() + + # If is_sequence_parallel, the input and output tensors are sharded + # across the ranks within the tp_group. In this case the weights are + # replicated and no collective ops are needed. + # Otherwise we use standard TP with an allreduce at the end. self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config, + disable_tp=is_sequence_parallel, prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, quant_config=quant_config, reduce_results=reduce_results, + disable_tp=is_sequence_parallel, prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " @@ -98,17 +110,58 @@ class DeepseekV2MLP(nn.Module): return x +# Chunk x along the num_tokens axis for sequence parallelism +# NOTE: This is wrapped in a torch custom op to work around the following issue: +# The output tensor can have a sequence length 0 at small input sequence lengths +# even though we explicitly pad to avoid this. 
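+# Example (assuming tp_size=4): a 10-token input is padded to 12 rows, and
+# rank r keeps rows [3*r, 3*r + 3), i.e. 3 tokens per rank.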
+def sequence_parallel_chunk(x: torch.Tensor) -> torch.Tensor: + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + # all_gather needs the sequence length to be divisible by tp_size + seq_len = x.size(0) + remainder = seq_len % tp_size + if remainder != 0: + pad_len = tp_size - remainder + x = nn.functional.pad(x, (0, 0, 0, pad_len)) + + chunk = x.shape[0] // tp_size + start = tp_rank * chunk + return torch.narrow(x, 0, start, chunk) + + +def sequence_parallel_chunk_fake(x: torch.Tensor) -> torch.Tensor: + tp_size = get_tensor_model_parallel_world_size() + seq_len = cdiv(x.size(0), tp_size) + shape = list(x.shape) + shape[0] = seq_len + out = torch.empty(shape, dtype=x.dtype, device=x.device) + return out + + +direct_register_custom_op( + op_name="sequence_parallel_chunk", + op_func=sequence_parallel_chunk, + mutates_args=[], + fake_impl=sequence_parallel_chunk_fake, + dispatch_key=current_platform.dispatch_key, + tags=(torch.Tag.needs_fixed_stride_order, ), +) + + class DeepseekV2MoE(nn.Module): def __init__( self, config: Union[DeepseekV2Config, DeepseekV3Config], + parallel_config: ParallelConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - enable_eplb: bool = False, ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.routed_scaling_factor = config.routed_scaling_factor self.ep_group = get_ep_group().device_group @@ -117,6 +170,21 @@ class DeepseekV2MoE(nn.Module): self.n_routed_experts: int = config.n_routed_experts self.n_shared_experts: int = config.n_shared_experts + # The all_reduce at the end of attention (during o_proj) means that + # inputs are replicated across each rank of the tensor parallel group. + # If using expert-parallelism with DeepEP All2All ops, replicated + # tokens results in useless duplicate computation and communication. + # + # In this case, ensure the input to the experts is sequence parallel + # to avoid the excess work. + # + # Not needed for pplx-kernels as it can handle duplicate input tokens. + self.is_sequence_parallel = (envs.VLLM_ALL2ALL_BACKEND + in ("deepep_high_throughput", + "deepep_low_latency") + and parallel_config.enable_expert_parallel + and self.tp_size > 1) + if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") @@ -133,9 +201,8 @@ class DeepseekV2MoE(nn.Module): self.gate.e_score_correction_bias = None # Load balancing settings. 
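        # EPLB maintains num_redundant_experts extra physical experts on top
        # of the logical (routed) experts so load can be rebalanced across
        # expert-parallel ranks.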
- vllm_config = get_current_vllm_config() - eplb_config = vllm_config.parallel_config.eplb_config - self.enable_eplb = enable_eplb + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts @@ -166,7 +233,9 @@ class DeepseekV2MoE(nn.Module): routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts) + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + ) self.shared_experts = None else: intermediate_size = (config.moe_intermediate_size * @@ -177,6 +246,7 @@ class DeepseekV2MoE(nn.Module): intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + is_sequence_parallel=self.is_sequence_parallel, reduce_results=False, prefix=f"{prefix}.shared_experts", ) @@ -199,11 +269,22 @@ class DeepseekV2MoE(nn.Module): routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts) + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) + + # Chunk the hidden states so they aren't replicated across TP ranks. + # This avoids duplicate computation in self.experts. + # TODO: We can replace the all_reduce at the end of attn with a + # reduce_scatter instead of chunking here. + if self.is_sequence_parallel: + hidden_states = torch.ops.vllm.sequence_parallel_chunk( + hidden_states) + # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) @@ -228,7 +309,11 @@ class DeepseekV2MoE(nn.Module): assert shared_output is not None final_hidden_states += shared_output - if self.tp_size > 1: + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: final_hidden_states = ( self.experts.maybe_all_reduce_tensor_model_parallel( final_hidden_states)) @@ -532,16 +617,15 @@ class DeepseekV2MLAAttention(nn.Module): class DeepseekV2DecoderLayer(nn.Module): - def __init__( - self, - config: Union[DeepseekV2Config, DeepseekV3Config], - prefix: str, - model_config: ModelConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - enable_eplb: bool = False, - ) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str) -> None: super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) @@ -578,9 +662,9 @@ class DeepseekV2DecoderLayer(nn.Module): and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekV2MoE( config=config, + parallel_config=parallel_config, quant_config=quant_config, prefix=f"{prefix}.mlp", - enable_eplb=enable_eplb, ) else: self.mlp = DeepseekV2MLP( @@ -650,10 +734,7 @@ class DeepseekV2Model(nn.Module): super().__init__() config = 
vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - enable_eplb = vllm_config.parallel_config.enable_eplb self.config = config self.vocab_size = config.vocab_size @@ -669,14 +750,7 @@ class DeepseekV2Model(nn.Module): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: DeepseekV2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - enable_eplb=enable_eplb, - ), + lambda prefix: DeepseekV2DecoderLayer(vllm_config, prefix), prefix=f"{prefix}.layers") if get_pp_group().is_last_rank: From 170129eb283aa225f20666a434c02822f643242e Mon Sep 17 00:00:00 2001 From: zhiweiz <morgendave@gmail.com> Date: Mon, 8 Sep 2025 19:03:50 -0700 Subject: [PATCH 050/584] [gpt-oss] Harmony changes with container tool support (#23386) Signed-off-by: zhiweiz <zhiweiz@fb.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: zhiweiz <zhiweiz@fb.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> --- vllm/entrypoints/context.py | 87 ++++++++++++++++++-- vllm/entrypoints/harmony_utils.py | 50 +++++++++-- vllm/entrypoints/openai/serving_responses.py | 33 ++++++-- vllm/entrypoints/tool_server.py | 16 ++-- vllm/envs.py | 11 +++ 5 files changed, 170 insertions(+), 27 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 7723c5d5cbcfc..9012639457cad 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import contextlib import json import logging from abc import ABC, abstractmethod @@ -57,9 +59,14 @@ class ConversationContext(ABC): @abstractmethod async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack) -> None: + exit_stack: AsyncExitStack, + request_id: str) -> None: pass + @abstractmethod + async def cleanup_session(self) -> None: + raise NotImplementedError("Should not be called.") + class SimpleContext(ConversationContext): @@ -89,9 +96,13 @@ class SimpleContext(ConversationContext): raise NotImplementedError("Should not be called.") async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack) -> None: + exit_stack: AsyncExitStack, + request_id: str) -> None: pass + async def cleanup_session(self) -> None: + raise NotImplementedError("Should not be called.") + class HarmonyContext(ConversationContext): @@ -103,6 +114,7 @@ class HarmonyContext(ConversationContext): self._messages = messages self.available_tools = available_tools self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} + self.called_tools: set[str] = set() self.parser = get_streamable_parser_for_assistant() self.num_init_messages = len(messages) @@ -234,7 +246,8 @@ class HarmonyContext(ConversationContext): last_msg = self.messages[-1] recipient = last_msg.recipient return recipient is not None and (recipient.startswith("browser.") - or recipient.startswith("python")) + or recipient.startswith("python") or + recipient.startswith("container.")) async def call_tool(self) -> list[Message]: if not self.messages: @@ -248,6 +261,9 @@ class 
HarmonyContext(ConversationContext): elif recipient.startswith("python"): return await self.call_python_tool( self._tool_sessions["python"], last_msg) + elif recipient.startswith("container."): + return await self.call_container_tool( + self._tool_sessions["container"], last_msg) raise ValueError("No tool call found") def render_for_completion(self) -> list[int]: @@ -256,6 +272,7 @@ class HarmonyContext(ConversationContext): async def call_search_tool(self, tool_session: Union["ClientSession", Tool], last_msg: Message) -> list[Message]: + self.called_tools.add("browser") if isinstance(tool_session, Tool): return await tool_session.get_result(self) tool_name = last_msg.recipient.split(".")[1] @@ -265,12 +282,16 @@ class HarmonyContext(ConversationContext): content = TextContent(text=result_str) author = Author(role=Role.TOOL, name=last_msg.recipient) return [ - Message(author=author, content=[content], recipient=Role.ASSISTANT) + Message(author=author, + content=[content], + recipient=Role.ASSISTANT, + channel=last_msg.channel) ] async def call_python_tool(self, tool_session: Union["ClientSession", Tool], last_msg: Message) -> list[Message]: + self.called_tools.add("python") if isinstance(tool_session, Tool): return await tool_session.get_result(self) param = { @@ -290,13 +311,63 @@ class HarmonyContext(ConversationContext): ] async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack) -> None: + exit_stack: AsyncExitStack, + request_id: str) -> None: if tool_server: for tool_name in self.available_tools: if tool_name not in self._tool_sessions: - self._tool_sessions[ - tool_name] = await exit_stack.enter_async_context( - tool_server.new_session(tool_name)) + tool_session = await exit_stack.enter_async_context( + tool_server.new_session(tool_name, request_id)) + self._tool_sessions[tool_name] = tool_session + exit_stack.push_async_exit(self.cleanup_session) + + async def call_container_tool(self, tool_session: Union["ClientSession", + Tool], + last_msg: Message) -> list[Message]: + """ + Call container tool. Expect this to be run in a stateful docker + with command line terminal. 
+ The official container tool would at least + expect the following format: + - for tool name: exec + - args: + { + "cmd":List[str] "command to execute", + "workdir":optional[str] "current working directory", + "env":optional[object/dict] "environment variables", + "session_name":optional[str] "session name", + "timeout":optional[int] "timeout in seconds", + "user":optional[str] "user name", + } + """ + self.called_tools.add("container") + if isinstance(tool_session, Tool): + return await tool_session.get_result(self) + tool_name = last_msg.recipient.split(".")[1].split(" ")[0] + args = json.loads(last_msg.content[0].text) + result = await tool_session.call_tool(tool_name, args) + result_str = result.content[0].text + content = TextContent(text=result_str) + author = Author(role=Role.TOOL, name=last_msg.recipient) + return [ + Message(author=author, + content=[content], + recipient=Role.ASSISTANT, + channel=last_msg.channel) + ] + + async def cleanup_session(self, *args, **kwargs) -> None: + """Can be used as coro to used in __aexit__""" + + async def cleanup_tool_session(tool_session): + if not isinstance(tool_session, Tool): + logger.info("Cleaning up tool session for %s", + tool_session._client_info) + with contextlib.suppress(Exception): + await tool_session.call_tool("cleanup_session", {}) + + await asyncio.gather(*(cleanup_tool_session(self._tool_sessions[tool]) + for tool in self.called_tools)) class StreamingHarmonyContext(HarmonyContext): diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index a3693ce60e4e6..f7528ba81dce5 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -16,11 +16,13 @@ from openai.types.responses.response_function_web_search import ( from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent) from openai.types.responses.tool import Tool -from openai_harmony import (Author, Conversation, DeveloperContent, - HarmonyEncodingName, Message, ReasoningEffort, - Role, StreamableParser, SystemContent, TextContent, - ToolDescription, load_harmony_encoding) +from openai_harmony import (Author, ChannelConfig, Conversation, + DeveloperContent, HarmonyEncodingName, Message, + ReasoningEffort, Role, StreamableParser, + SystemContent, TextContent, ToolDescription, + load_harmony_encoding) +from vllm import envs from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam, ResponseInputOutputItem) from vllm.utils import random_uuid @@ -33,6 +35,20 @@ REASONING_EFFORT = { _harmony_encoding = None +# Builtin tools that should be included in the system message when +# they are available and requested by the user. +# Tool args are provided by MCP tool descriptions. Output +# of the tools are stringified. 
+BUILTIN_TOOLS = { + "web_search_preview", + "code_interpreter", + "container", +} + + +def has_custom_tools(tool_types: list[str]) -> bool: + return not set(tool_types).issubset(BUILTIN_TOOLS) + def get_encoding(): global _harmony_encoding @@ -48,10 +64,19 @@ def get_system_message( start_date: Optional[str] = None, browser_description: Optional[str] = None, python_description: Optional[str] = None, + container_description: Optional[str] = None, + instructions: Optional[str] = None, + with_custom_tools: bool = False, ) -> Message: sys_msg_content = SystemContent.new() if model_identity is not None: sys_msg_content = sys_msg_content.with_model_identity(model_identity) + if (instructions is not None + and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS): + current_identity = sys_msg_content.model_identity + new_identity = (f'{current_identity}\n{instructions}' + if current_identity else instructions) + sys_msg_content = sys_msg_content.with_model_identity(new_identity) if reasoning_effort is not None: sys_msg_content = sys_msg_content.with_reasoning_effort( REASONING_EFFORT[reasoning_effort]) @@ -63,6 +88,14 @@ def get_system_message( sys_msg_content = sys_msg_content.with_tools(browser_description) if python_description is not None: sys_msg_content = sys_msg_content.with_tools(python_description) + if container_description is not None: + sys_msg_content = sys_msg_content.with_tools(container_description) + if not with_custom_tools: + channel_config = sys_msg_content.channel_config + invalid_channel = "commentary" + new_config = ChannelConfig.require_channels( + [c for c in channel_config.valid_channels if c != invalid_channel]) + sys_msg_content = sys_msg_content.with_channel_config(new_config) sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content) return sys_msg @@ -86,14 +119,17 @@ def get_developer_message( tools: Optional[list[Union[Tool, ChatCompletionToolsParam]]] = None, ) -> Message: dev_msg_content = DeveloperContent.new() - if instructions is not None: + if (instructions is not None + and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS): dev_msg_content = dev_msg_content.with_instructions(instructions) if tools is not None: function_tools: list[Union[Tool, ChatCompletionToolsParam]] = [] for tool in tools: - if tool.type in ("web_search_preview", "code_interpreter"): + if tool.type in ("web_search_preview", "code_interpreter", + "container"): # These are built-in tools that are added to the system message. 
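                # (their argument schemas come from the MCP tool descriptions)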
pass + elif tool.type == "function": function_tools.append(tool) else: @@ -136,6 +172,8 @@ def parse_response_input( TextContent(text=text_prefix + c["text"]) for c in content ] msg = Message.from_role_and_contents(role, contents) + if role == "assistant": + msg = msg.with_channel("final") elif response_msg["type"] == "function_call_output": call_id = response_msg["call_id"] call_response: Optional[ResponseFunctionToolCall] = None diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index a102d4a4a5e68..c5177bdf5375d 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -44,8 +44,9 @@ from vllm.entrypoints.context import (ConversationContext, HarmonyContext, SimpleContext, StreamingHarmonyContext) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, - get_system_message, get_user_message, parse_output_message, - parse_remaining_state, parse_response_input, render_for_completion) + get_system_message, get_user_message, has_custom_tools, + parse_output_message, parse_remaining_state, parse_response_input, + render_for_completion) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -266,6 +267,8 @@ class OpenAIServingResponses(OpenAIServing): builtin_tool_list.append("browser") if self.tool_server.has_tool("python"): builtin_tool_list.append("python") + if self.tool_server.has_tool("container"): + builtin_tool_list.append("container") if self.tool_server is not None: available_tools = builtin_tool_list @@ -448,7 +451,8 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: try: - await context.init_tool_sessions(self.tool_server, exit_stack) + await context.init_tool_sessions(self.tool_server, exit_stack, + request.request_id) async for _ in result_generator: pass except asyncio.CancelledError: @@ -710,13 +714,21 @@ class OpenAIServingResponses(OpenAIServing): # New conversation. 
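            # Builtin tools (browser, python, container) are advertised via
            # tool descriptions in the system message; custom function tools
            # are listed in a separate developer message.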
reasoning_effort = (request.reasoning.effort if request.reasoning else None) + # Temporary: OpenAI types doesn't have container tool + # so we used MCP to cover that, up for change tool_types = [tool.type for tool in request.tools] + if envs.VLLM_GPT_OSS_USE_CONTAINER_TOOL: + tool_types.append("container") enable_browser = ("web_search_preview" in tool_types and self.tool_server is not None and self.tool_server.has_tool("browser")) enable_code_interpreter = ("code_interpreter" in tool_types and self.tool_server is not None and self.tool_server.has_tool("python")) + enable_container = ("container" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("container")) + with_custom_tools = has_custom_tools(tool_types) sys_msg = get_system_message( reasoning_effort=reasoning_effort, browser_description=self.tool_server.get_tool_description( @@ -725,11 +737,17 @@ class OpenAIServingResponses(OpenAIServing): python_description=self.tool_server.get_tool_description( "python") if enable_code_interpreter and self.tool_server is not None else None, + container_description=self.tool_server.get_tool_description( + "container") + if enable_container and self.tool_server is not None else None, + instructions=request.instructions, + with_custom_tools=with_custom_tools, ) messages.append(sys_msg) - dev_msg = get_developer_message(request.instructions, - request.tools) - messages.append(dev_msg) + if with_custom_tools: + dev_msg = get_developer_message( + instructions=request.instructions, tools=request.tools) + messages.append(dev_msg) else: # Continue the previous conversation. # FIXME(woosuk): Currently, request params like reasoning and @@ -1613,7 +1631,8 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: processer = None if self.use_harmony: - await context.init_tool_sessions(self.tool_server, exit_stack) + await context.init_tool_sessions(self.tool_server, exit_stack, + request.request_id) processer = self._process_harmony_streaming_events else: processer = self._process_simple_streaming_events diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 2f28595f27c6a..3f413df7108af 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -86,7 +86,8 @@ class ToolServer(ABC): pass @abstractmethod - def new_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]: + def new_session(self, tool_name: str, + session_id: str) -> AbstractAsyncContextManager[Any]: """ Create a session for the tool. 
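        The session_id (typically the request id) is forwarded to MCP servers
        as an "x-session-id" header, scoping tool state to a single request.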
""" @@ -124,7 +125,8 @@ class MCPToolServer(ToolServer): description=tool.description, parameters=tool.inputSchema) for tool in list_tools_response.tools - ]) + ], + ) self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp if tool_from_mcp.name not in self.urls: self.urls[tool_from_mcp.name] = url @@ -142,14 +144,16 @@ class MCPToolServer(ToolServer): return self.harmony_tool_descriptions.get(tool_name) @asynccontextmanager - async def new_session(self, tool_name: str): + async def new_session(self, tool_name: str, session_id: str): from mcp import ClientSession from mcp.client.sse import sse_client url = self.urls.get(tool_name) + headers = {"x-session-id": session_id} if not url: raise KeyError(f"Tool '{tool_name}' is not supported") - async with sse_client(url=url) as streams, ClientSession( - *streams) as session: + async with sse_client(url=url, + headers=headers) as streams, ClientSession( + *streams) as session: await session.initialize() yield session @@ -182,7 +186,7 @@ class DemoToolServer(ToolServer): raise ValueError(f"Unknown tool {tool_name}") @asynccontextmanager - async def new_session(self, tool_name: str): + async def new_session(self, tool_name: str, session_id: str): if tool_name not in self.tools: raise KeyError(f"Tool '{tool_name}' is not supported") yield self.tools[tool_name] diff --git a/vllm/envs.py b/vllm/envs.py index 50783eeb95a42..927bea3bf9538 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -168,6 +168,8 @@ if TYPE_CHECKING: VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False + VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False + VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False @@ -1201,6 +1203,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + # Allows vllm use container tool + "VLLM_GPT_OSS_USE_CONTAINER_TOOL": + lambda: bool(int(os.getenv("VLLM_GPT_OSS_USE_CONTAINER_TOOL", "0"))), + + # Allows harmony instructions to be injected on system messages + "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": + lambda: bool( + int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0"))), + # Add optional custom scopes for profiling, disable to avoid overheads "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))), From 22a007053019fbe137605d3d7aab189fce8ee7b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Sep 2025 02:54:58 +0000 Subject: [PATCH 051/584] Bump actions/setup-python from 5.4.0 to 6.0.0 (#24414) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/pre-commit.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index d5c6b8d43a6ef..c3e132a536a42 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: 
python-version: '3.12' diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 195579f206a2f..e21d13b8161f3 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" From 13b89bd823b652979a8b2b446e1860aeb5e6a538 Mon Sep 17 00:00:00 2001 From: cjackal <44624812+cjackal@users.noreply.github.com> Date: Tue, 9 Sep 2025 12:07:58 +0900 Subject: [PATCH 052/584] [doc] update `vllm serve` cli args documentation (#24329) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com> --- vllm/entrypoints/openai/cli_args.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 7e1df795fb056..a6db97e55d704 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -134,14 +134,13 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" """If specified, will run the OpenAI frontend server in the same process as the model serving engine.""" enable_request_id_headers: bool = False - """If specified, API server will add X-Request-Id header to responses. - Caution: this hurts performance at high QPS.""" + """If specified, API server will add X-Request-Id header to responses.""" enable_auto_tool_choice: bool = False - """If specified, exclude tool definitions in prompts when - tool_choice='none'.""" - exclude_tools_when_tool_choice_none: bool = False """Enable auto tool choice for supported models. Use `--tool-call-parser` to specify which parser to use.""" + exclude_tools_when_tool_choice_none: bool = False + """If specified, exclude tool definitions in prompts when + tool_choice='none'.""" tool_call_parser: Optional[str] = None """Select the tool call parser depending on the model that you're using. This is used to parse the model-generated tool call into OpenAI API format. 
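A usage sketch of how the corrected flags pair up; the model name and parser
choice below are illustrative, not taken from this patch:

    vllm serve Qwen/Qwen2.5-7B-Instruct \
        --enable-auto-tool-choice \
        --tool-call-parser hermes \
        --exclude-tools-when-tool-choice-none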
From ed1623a88a86a38332b2510d7a6ca13526597002 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Sep 2025 03:11:20 +0000 Subject: [PATCH 053/584] Bump actions/stale from 9.1.0 to 10.0.0 (#24412) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 656f3d3fa7bc4..82844810a633a 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 + - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months From 562663a044acbff0b8df11e07a4929b18d71172f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Sep 2025 03:12:44 +0000 Subject: [PATCH 054/584] Bump actions/github-script from 7.0.1 to 8.0.0 (#24413) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- .github/workflows/add_label_automerge.yml | 2 +- .github/workflows/issue_autolabel.yml | 2 +- .github/workflows/reminder_comment.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 315042fbf5cf4..d8bbedef3174b 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index e0ab3872d8fa3..c2b17abe811cd 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Label issues based on keywords - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | // Configuration: Add new labels and keywords here diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 1ee605dc7bb0d..8884359fa0ce4 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | try { From 3e0d4a34759333ad59069336a3691d8cecc8e0b9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 9 Sep 2025 04:30:32 +0100 Subject: [PATCH 055/584] Move `KVTransferConfig` from `config/__init__.py` to `config/kv_transfer.py` (#24434) Signed-off-by: 
Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 103 +--------------- vllm/config/kv_transfer.py | 111 ++++++++++++++++++ .../kv_transfer/kv_connector/factory.py | 3 +- .../kv_connector/v1/multi_connector.py | 3 +- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- .../kv_transfer/kv_pipe/mooncake_pipe.py | 2 +- .../kv_transfer/kv_pipe/pynccl_pipe.py | 2 +- vllm/entrypoints/llm.py | 2 +- 8 files changed, 120 insertions(+), 108 deletions(-) create mode 100644 vllm/config/kv_transfer.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e3ce1987fe9ee..e84b924ada0a7 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -9,7 +9,6 @@ import hashlib import inspect import json import textwrap -import uuid import warnings from collections.abc import Mapping from contextlib import contextmanager @@ -34,6 +33,7 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) from vllm.config.kv_events import KVEventsConfig +from vllm.config.kv_transfer import KVTransferConfig from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -3210,107 +3210,6 @@ class ObservabilityConfig: self.collect_detailed_traces[0].split(",")) -KVProducer = Literal["kv_producer", "kv_both"] -KVConsumer = Literal["kv_consumer", "kv_both"] -KVRole = Literal[KVProducer, KVConsumer] - - -@config -@dataclass -class KVTransferConfig: - """Configuration for distributed KV cache transfer.""" - - kv_connector: Optional[str] = None - """The KV connector for vLLM to transmit KV caches between vLLM instances. - """ - - engine_id: Optional[str] = None - """The engine id for KV transfers.""" - - kv_buffer_device: Optional[str] = "cuda" - """The device used by kv connector to buffer the KV cache. - Currently only support 'cuda'.""" - - kv_buffer_size: float = 1e9 - """The buffer size for TorchDistributedConnector. Measured in number of - bytes. Recommended value: 1e9 (about 1GB).""" - - kv_role: Optional[KVRole] = None - """Whether this vLLM instance produces, consumes KV cache, or both. Choices - are 'kv_producer', 'kv_consumer', and 'kv_both'.""" - - kv_rank: Optional[int] = None - """The rank of this vLLM instance in the KV cache transfer. Typical value: - 0 for prefill instance, 1 for decode instance. - Currently only 1P1D is supported.""" - - kv_parallel_size: int = 1 - """The number of parallel instances for KV cache transfer. For - P2pNcclConnector, this should be 2.""" - - kv_ip: str = "127.0.0.1" - """The KV connector ip, used to build distributed connection.""" - - kv_port: int = 14579 - """The KV connector port, used to build distributed connection.""" - - kv_connector_extra_config: dict[str, Any] = field(default_factory=dict) - """any extra config that the connector may need.""" - - kv_connector_module_path: Optional[str] = None - """The Python module path to dynamically load the KV connector from. - Only supported in V1.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. 
- - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self) -> None: - if self.engine_id is None: - self.engine_id = str(uuid.uuid4()) - - if self.kv_role is not None and self.kv_role not in get_args(KVRole): - raise ValueError(f"Unsupported kv_role: {self.kv_role}. " - f"Supported roles are {get_args(KVRole)}") - - if self.kv_connector is not None and self.kv_role is None: - raise ValueError("Please specify kv_disagg_role when kv_connector " - f"is set, supported roles are {get_args(KVRole)}") - - @property - def is_kv_transfer_instance(self) -> bool: - return self.kv_connector is not None and \ - self.kv_role in get_args(KVRole) - - @property - def is_kv_producer(self) -> bool: - return self.kv_connector is not None and \ - self.kv_role in get_args(KVProducer) - - @property - def is_kv_consumer(self) -> bool: - return self.kv_connector is not None and \ - self.kv_role in get_args(KVConsumer) - - def get_from_extra_config(self, key, default) -> Any: - return self.kv_connector_extra_config.get(key, default) - - @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py new file mode 100644 index 0000000000000..9abf4acacfe81 --- /dev/null +++ b/vllm/config/kv_transfer.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import uuid +from dataclasses import field +from typing import Any, Literal, Optional, get_args + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + +KVProducer = Literal["kv_producer", "kv_both"] +KVConsumer = Literal["kv_consumer", "kv_both"] +KVRole = Literal[KVProducer, KVConsumer] + + +@config +@dataclass +class KVTransferConfig: + """Configuration for distributed KV cache transfer.""" + + kv_connector: Optional[str] = None + """The KV connector for vLLM to transmit KV caches between vLLM instances. + """ + + engine_id: Optional[str] = None + """The engine id for KV transfers.""" + + kv_buffer_device: Optional[str] = "cuda" + """The device used by kv connector to buffer the KV cache. + Currently only support 'cuda'.""" + + kv_buffer_size: float = 1e9 + """The buffer size for TorchDistributedConnector. Measured in number of + bytes. Recommended value: 1e9 (about 1GB).""" + + kv_role: Optional[KVRole] = None + """Whether this vLLM instance produces, consumes KV cache, or both. Choices + are 'kv_producer', 'kv_consumer', and 'kv_both'.""" + + kv_rank: Optional[int] = None + """The rank of this vLLM instance in the KV cache transfer. Typical value: + 0 for prefill instance, 1 for decode instance. + Currently only 1P1D is supported.""" + + kv_parallel_size: int = 1 + """The number of parallel instances for KV cache transfer. 
For + P2pNcclConnector, this should be 2.""" + + kv_ip: str = "127.0.0.1" + """The KV connector ip, used to build distributed connection.""" + + kv_port: int = 14579 + """The KV connector port, used to build distributed connection.""" + + kv_connector_extra_config: dict[str, Any] = field(default_factory=dict) + """any extra config that the connector may need.""" + + kv_connector_module_path: Optional[str] = None + """The Python module path to dynamically load the KV connector from. + Only supported in V1.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.engine_id is None: + self.engine_id = str(uuid.uuid4()) + + if self.kv_role is not None and self.kv_role not in get_args(KVRole): + raise ValueError(f"Unsupported kv_role: {self.kv_role}. " + f"Supported roles are {get_args(KVRole)}") + + if self.kv_connector is not None and self.kv_role is None: + raise ValueError("Please specify kv_disagg_role when kv_connector " + f"is set, supported roles are {get_args(KVRole)}") + + @property + def is_kv_transfer_instance(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in get_args(KVRole) + + @property + def is_kv_producer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in get_args(KVProducer) + + @property + def is_kv_consumer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in get_args(KVConsumer) + + def get_from_extra_config(self, key, default) -> Any: + return self.kv_connector_extra_config.get(key, default) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 584fc1d655951..670f9c26b2104 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -14,7 +14,8 @@ from vllm.logger import init_logger # yapf: enable if TYPE_CHECKING: - from vllm.config import KVTransferConfig, VllmConfig + from vllm.config import VllmConfig + from vllm.config.kv_transfer import KVTransferConfig logger = init_logger(__name__) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 65bcb4d93b1e1..edbff4e4340f4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -7,7 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional import torch -from vllm.config import KVTransferConfig, VllmConfig +from vllm.config import VllmConfig +from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 
dfd95548c4632..fa7cc66ab654d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -15,7 +15,7 @@ import msgpack import torch import zmq -from vllm.config import KVTransferConfig +from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index 0b560d1b3b3ce..2a434e280179e 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -13,7 +13,7 @@ import zmq from safetensors.torch import load as safetensors_load from safetensors.torch import save as safetensors_save -from vllm.config import KVTransferConfig +from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger from vllm.utils import join_host_port, make_zmq_path, split_host_port diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 09de0b682efca..66120e9a0a1a0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -20,7 +20,7 @@ from typing import Callable, Optional import torch -from vllm.config import KVTransferConfig +from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.distributed.utils import StatelessProcessGroup diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d33fd0ec0b494..e6fd61ae1aad9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -204,7 +204,7 @@ class LLM: if "kv_transfer_config" in kwargs and isinstance( kwargs["kv_transfer_config"], dict): - from vllm.config import KVTransferConfig + from vllm.config.kv_transfer import KVTransferConfig raw_config_dict = kwargs["kv_transfer_config"] try: kwargs["kv_transfer_config"] = KVTransferConfig( From b6fbc15634731285b2907d2eab87c51f69f1a102 Mon Sep 17 00:00:00 2001 From: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:37:16 +0800 Subject: [PATCH 056/584] [BugFix][Model] Fix Ernie4.5-VL hanging on long inputs (#24074) Signed-off-by: wangyafeng <wangyafeng@baidu.com> --- vllm/model_executor/models/ernie45_vl.py | 14 ++++++++++---- vllm/model_executor/models/ernie45_vl_moe.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index d880fc434e20f..97aace5a20c3a 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -66,8 +66,6 @@ from .vision import get_vit_attn_backend logger = init_logger(__name__) -_MAX_FRAMES_PER_VIDEO = 16 - # === Vision Transformer === # @@ -839,6 +837,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + 
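+        # Budget the worst case for one item of each modality: images have a
+        # static per-item maximum, while the per-video maximum depends on how
+        # much of seq_len remains once images are accounted for.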
max_image_tokens = self.get_max_image_tokens() + max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts) + return {"image": max_image_tokens, "video": max_video_tokens} + def _get_vision_info( self, *, @@ -964,8 +971,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - max_frames_per_video = min(max_total_frames // max(max_videos, 1), - _MAX_FRAMES_PER_VIDEO) + max_frames_per_video = max_total_frames // max(max_videos, 1) return max(max_frames_per_video, 2) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 780974c3b758e..6034505fa7d68 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -287,8 +287,13 @@ class Ernie4_5_VLMoeMoE(nn.Module): if self.has_shared_experts: shared_output = self.shared_experts(hidden_states) - if visual_token_mask is not None and visual_token_mask.any(): - # assert visual_token_mask.shape[0] != hidden_states.shape[0] + if visual_token_mask is not None and visual_token_mask.all(): + # only vision modal input + router_logits, _ = self.vision_experts_gate(hidden_states) + final_hidden_states = self.vision_experts( + hidden_states=hidden_states, router_logits=router_logits) + elif visual_token_mask is not None and visual_token_mask.any(): + # text and vision modals input visual_token_mask = visual_token_mask.repeat( 1, self.hidden_size).bool() text_token_mask = ~visual_token_mask @@ -310,7 +315,7 @@ class Ernie4_5_VLMoeMoE(nn.Module): hidden_states=vision_hidden_states, router_logits=vision_router_logits).flatten() else: - # text modal input processing directly + # only text modal input text_router_logits, _ = self.text_experts_gate(hidden_states) final_hidden_states = self.text_experts( From bba1042c6fcb01a3ec12803ab979852040d3e4d6 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:53:07 +0800 Subject: [PATCH 057/584] [Flashinfer] Support Flashinfer TRTLLM FP8-qkv BF16/FP16-out Attention Kernel (#23647) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- .../kernels/benchmark_trtllm_decode_attention.py | 1 + .../kernels/benchmark_trtllm_prefill_attention.py | 1 + .../attention/test_flashinfer_trtllm_attention.py | 12 ++++++++++++ vllm/compilation/fusion_attn.py | 6 ++++-- vllm/v1/attention/backends/flashinfer.py | 13 ++++--------- 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 603ce5ecf0d2c..6ddab46214577 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -259,6 +259,7 @@ if __name__ == "__main__": # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) (None, None, None), (None, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, None), (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), ] diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 40903c6c3444f..131df74c7de1b 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -274,6 +274,7 @@ if __name__ == "__main__": quant_dtypes = [ # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) 
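Each row of these matrices names the (query, KV-cache, output) dtypes; the new (FP8_DTYPE, FP8_DTYPE, None) row is the FP8-qkv, BF16/FP16-out path this patch enables. The dequant scales such a combo feeds to the TRTLLM kernels are composed exactly as in the kernel calls further below; a small sketch with illustrative scale values:

    import math

    head_size = 128
    sm_scale = 1.0 / math.sqrt(head_size)
    q_scale, k_scale, v_scale = 0.5, 0.25, 0.125  # illustrative dequant scales
    o_scale = 1.0  # unquantized BF16/FP16 output needs no output quant scale

    bmm1_scale = q_scale * k_scale * sm_scale  # folded into Q.K^T + softmax
    bmm2_scale = v_scale / o_scale             # folded into P.V (+ out quant)
    print(f"bmm1_scale={bmm1_scale:.6f}, bmm2_scale={bmm2_scale}")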
(None, None, None), + (FP8_DTYPE, FP8_DTYPE, None), (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), ] diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 8d0a11d8eb8ab..bd3ba554b32e2 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -35,6 +35,7 @@ QUANT_DTYPES = [ # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) (None, None, None), (None, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, None), (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), ] @@ -44,6 +45,7 @@ NUM_HEADS = [(64, 8), (40, 8)] HEAD_SIZE = [128] KV_LAYOUT = ["HND"] # currently only HND is supported BLOCK_SIZE = [16] +WINDOW_LEFT = [-1, 127] SOFT_CAP = [None, 50.0] NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. @@ -57,6 +59,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. @pytest.mark.parametrize("head_size", HEAD_SIZE) @pytest.mark.parametrize("kv_layout", KV_LAYOUT) @pytest.mark.parametrize("block_size", BLOCK_SIZE) +@pytest.mark.parametrize("window_left", WINDOW_LEFT) @pytest.mark.parametrize("soft_cap", SOFT_CAP) @torch.inference_mode def test_flashinfer_trtllm_decode_with_baseline( @@ -69,6 +72,7 @@ def test_flashinfer_trtllm_decode_with_baseline( head_size: int, kv_layout: str, block_size: int, + window_left: int, soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") @@ -155,6 +159,7 @@ def test_flashinfer_trtllm_decode_with_baseline( sm_scale=sm_scale, q_data_type=dtype, kv_data_type=dtype, + window_left=window_left, logits_soft_cap=soft_cap) output = torch.empty(ref_query.shape, dtype=dtype) @@ -188,6 +193,7 @@ def test_flashinfer_trtllm_decode_with_baseline( max_seq_len=max_seq_len, bmm1_scale=q_scale * k_scale * sm_scale, bmm2_scale=v_scale / o_scale, + window_left=window_left, o_sf_scale=o_sf_scale, out=output_trtllm, ) @@ -222,6 +228,7 @@ def test_flashinfer_trtllm_decode_with_baseline( @pytest.mark.parametrize("head_size", HEAD_SIZE) @pytest.mark.parametrize("kv_layout", KV_LAYOUT) @pytest.mark.parametrize("block_size", BLOCK_SIZE) +@pytest.mark.parametrize("window_left", WINDOW_LEFT) @pytest.mark.parametrize("soft_cap", [None]) @torch.inference_mode def test_flashinfer_trtllm_prefill_with_baseline( @@ -234,6 +241,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( head_size: int, kv_layout: str, block_size: int, + window_left: int, soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") @@ -334,6 +342,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( sm_scale=sm_scale, q_data_type=dtype, kv_data_type=dtype, + window_left=window_left, logits_soft_cap=soft_cap) output = torch.empty(ref_query.shape, dtype=dtype) @@ -371,6 +380,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( batch_size=batch_size, cum_seq_lens_q=q_indptr, cum_seq_lens_kv=kv_indptr, + window_left=window_left, o_sf_scale=o_sf_scale, out=output_trtllm, ) @@ -390,6 +400,8 @@ def test_flashinfer_trtllm_prefill_with_baseline( rtol, atol = 4e-1, 1e0 elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE: rtol, atol = 5e-2, 7e-2 + elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype: + rtol, atol = 4e-2, 6e-2 else: rtol, atol = 1e-2, 1e-2 diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 3095f17110fde..43c345695ef4e 100644 --- a/vllm/compilation/fusion_attn.py +++ 
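The tests are now also parametrized over window_left, exercising sliding-window attention alongside full attention. A sketch of the convention, assuming the usual FlashInfer meaning in which -1 disables the window and a value k lets each query attend to at most k earlier tokens:

    def visible_kv_positions(query_pos: int, window_left: int) -> range:
        # Positions (inclusive) a query at query_pos may attend to.
        if window_left < 0:  # -1: full causal attention
            return range(0, query_pos + 1)
        return range(max(0, query_pos - window_left), query_pos + 1)

    assert len(visible_kv_positions(500, -1)) == 501   # everything before it
    assert len(visible_kv_positions(500, 127)) == 128  # clamped to the window
    assert len(visible_kv_positions(50, 127)) == 51    # short prompts unaffected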
b/vllm/compilation/fusion_attn.py @@ -258,8 +258,10 @@ class AttnFusionPass(VllmInductorPass): pattern_fp8 = AttentionFp8StaticQuantPattern(layer) pattern_fp8.register_if_supported(self.patterns) - pattern_nvfp4 = AttentionNvfp4QuantPattern(layer) - pattern_nvfp4.register_if_supported(self.patterns) + if current_platform.is_cuda() and hasattr(torch.ops._C, + "scaled_fp4_quant"): + pattern_nvfp4 = AttentionNvfp4QuantPattern(layer) + pattern_nvfp4.register_if_supported(self.patterns) if len(attn_layers) == 0: logger.warning( diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 06a853007a578..c7a565810b45b 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -194,19 +194,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): FlashInferBackend.validate_head_size(self.head_dim) self.page_size = self.kv_cache_spec.block_size - self.enable_fusion = ( - self.compilation_config.pass_config.enable_attn_fusion) - self.q_data_type = self.model_config.dtype self.cache_dtype = self.cache_config.cache_dtype if self.cache_dtype.startswith("fp8"): self.kv_cache_dtype = ( FlashInferBackend.get_fp8_dtype_for_flashinfer( self.cache_dtype)) - # Insert FP8 quant for query if FP8 kv cache and attn fusion enabled - if self.enable_fusion: - self.q_data_type = self.kv_cache_dtype else: + assert self.kv_cache_spec.dtype == self.model_config.dtype self.kv_cache_dtype = self.kv_cache_spec.dtype + self.q_data_type = self.kv_cache_dtype self._cascade_wrapper = None # Wrapper for cascade attention @@ -668,8 +664,6 @@ class FlashInferImpl(AttentionImpl): # The attn+quant fusion happens when output_scale is provided. if output_scale is None: - assert attn_metadata.q_data_type != FP8_DTYPE, \ - "Query can only be FP8 if output fusion happened." 
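The metadata-builder change above collapses the fusion-dependent branch: the query dtype now simply tracks the KV-cache dtype, and the non-FP8 path asserts the cache spec matches the model dtype. A condensed sketch of the resulting selection (names illustrative):

    def select_q_and_kv_dtype(cache_dtype: str,
                              model_dtype: str) -> tuple[str, str]:
        if cache_dtype.startswith("fp8"):
            kv_dtype = "fp8"        # FlashInfer's fp8 dtype in the real code
        else:
            kv_dtype = model_dtype  # asserted equal to the cache spec dtype
        return kv_dtype, kv_dtype   # q_data_type == kv_cache_dtype either way

    assert select_q_and_kv_dtype("fp8_e4m3", "bfloat16") == ("fp8", "fp8")
    assert select_q_and_kv_dtype("auto", "bfloat16") == ("bfloat16", "bfloat16")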
assert output_block_scale is None, "output_block_scale "\ "is not supported when fusion has not happened" else: @@ -697,7 +691,8 @@ class FlashInferImpl(AttentionImpl): elif output.dtype == FP4_DTYPE: self.o_sf_scale = layer._o_scale_float - # Insert FP8 quant for query + # Insert FP8 quant for query + if attn_metadata.q_data_type == FP8_DTYPE: num_tokens, num_heads, head_size = query.shape query, _ = ops.scaled_fp8_quant( query.reshape( From 82dfb12e52ded608da2a458b2da5f48ba1acecdf Mon Sep 17 00:00:00 2001 From: Zebing Lin <linzebing1995@gmail.com> Date: Tue, 9 Sep 2025 00:34:37 -0400 Subject: [PATCH 058/584] [Core] Use sha256 bytes instead of BlockHash to reduce GC overhead (#23673) Signed-off-by: linzebing <linzebing1995@gmail.com> --- .../online_serving/kv_events_subscriber.py | 8 +- tests/utils_/test_utils.py | 20 +- tests/v1/core/test_kv_cache_utils.py | 65 +++-- tests/v1/core/test_prefix_caching.py | 225 ++++++++++-------- .../core/test_single_type_kv_cache_manager.py | 16 +- tests/v1/core/utils.py | 5 +- tests/v1/engine/test_engine_args.py | 13 +- tests/v1/kv_connector/unit/utils.py | 5 +- vllm/config/cache.py | 17 +- vllm/distributed/kv_events.py | 7 +- vllm/engine/arg_utils.py | 20 +- vllm/envs.py | 6 + vllm/utils/__init__.py | 27 +-- vllm/v1/core/block_pool.py | 27 ++- vllm/v1/core/kv_cache_utils.py | 120 +++++----- 15 files changed, 298 insertions(+), 283 deletions(-) diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index f238c66234dcc..9fd55fc9ddc94 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -6,6 +6,8 @@ import msgspec import zmq from msgspec.msgpack import Decoder +from vllm.v1.core.kv_cache_utils import BlockHash + # # Types copied from vllm.distributed.kv_events @@ -22,8 +24,8 @@ class KVCacheEvent( class BlockStored(KVCacheEvent): - block_hashes: list[int] - parent_block_hash: Optional[int] + block_hashes: list[BlockHash] + parent_block_hash: Optional[BlockHash] token_ids: list[int] block_size: int lora_id: Optional[int] @@ -31,7 +33,7 @@ class BlockStored(KVCacheEvent): class BlockRemoved(KVCacheEvent): - block_hashes: list[int] + block_hashes: list[BlockHash] medium: Optional[str] diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 66124dd854ee0..6dbba18b4dcfa 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -835,22 +835,20 @@ def test_model_specification(parser_with_config, cli_config_file, @pytest.mark.parametrize("input", [(), ("abc", ), (None, ), (None, bool, [1, 2, 3])]) -@pytest.mark.parametrize("output", [0, 1, 2]) -def test_sha256(input: tuple, output: int): - hash = sha256(input) - assert hash is not None - assert isinstance(hash, int) - assert hash != 0 +def test_sha256(input: tuple): + digest = sha256(input) + assert digest is not None + assert isinstance(digest, bytes) + assert digest != b"" - bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) - assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), - byteorder="big") + input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) + assert digest == hashlib.sha256(input_bytes).digest() # hashing again, returns the same value - assert hash == sha256(input) + assert digest == sha256(input) # hashing different input, returns different value - assert hash != sha256(input + (1, )) + assert digest != sha256(input + (1, )) @pytest.mark.parametrize( diff --git a/tests/v1/core/test_kv_cache_utils.py 
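The updated test above pins down the new contract: the helper returns the raw 32-byte SHA-256 digest of the pickled input rather than a big integer. An equivalent standalone sketch:

    import hashlib
    import pickle

    def sha256(obj) -> bytes:
        serialized = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
        return hashlib.sha256(serialized).digest()

    digest = sha256(("abc", None, [1, 2, 3]))
    assert isinstance(digest, bytes) and len(digest) == 32
    assert digest == sha256(("abc", None, [1, 2, 3]))  # deterministic
    assert digest != sha256(("abc", None, [1, 2, 4]))  # input-sensitive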
b/tests/v1/core/test_kv_cache_utils.py index 4d0a26f76e98e..44e479098ad5d 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -6,20 +6,22 @@ from typing import Callable, Optional import pytest import torch +import vllm.v1.core.kv_cache_utils as kv_cache_utils from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams -from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit +from vllm.utils import GiB_bytes, sha256, sha256_cbor from vllm.v1.core.kv_cache_manager import KVCacheManager # disable yapf here as it formats differently than isort such that both fail # yapf: disable from vllm.v1.core.kv_cache_utils import ( - FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, + BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, get_kv_cache_config, get_max_concurrency_for_kv_cache_config, get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, unify_kv_cache_configs) + is_kv_cache_type_uniform, make_block_hash_with_group_id, + unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, SlidingWindowSpec) @@ -88,7 +90,7 @@ def new_sliding_window_spec(block_size=16, sliding_window=sliding_window) -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_none_hash(monkeypatch, hash_fn): import vllm.v1.core.kv_cache_utils @@ -98,8 +100,8 @@ def test_none_hash(monkeypatch, hash_fn): reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) reloaded_kv_cache_utils.init_none_hash(hash_fn) assert reloaded_kv_cache_utils.NONE_HASH is not None - assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) - assert reloaded_kv_cache_utils.NONE_HASH != 0 + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, bytes) + assert reloaded_kv_cache_utils.NONE_HASH != b"" # case 2: PYTHONHASHSEED is set, use the seed and hash_fn with monkeypatch.context() as m: @@ -107,12 +109,11 @@ def test_none_hash(monkeypatch, hash_fn): reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) reloaded_kv_cache_utils.init_none_hash(hash_fn) assert reloaded_kv_cache_utils.NONE_HASH is not None - assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, bytes) assert hash_fn('python hash seed') == reloaded_kv_cache_utils.NONE_HASH def test_kv_cache_block(): - import vllm.v1.core.kv_cache_utils # Test KVCacheBlock initialization block = KVCacheBlock(block_id=0) @@ -127,8 +128,7 @@ def test_kv_cache_block(): assert block.ref_cnt == 0 # Test block hash setting and resetting - block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123, - token_ids=(1, 2, 3)) + block_hash = make_block_hash_with_group_id(BlockHash(b"abc"), 0) block.block_hash = block_hash assert block.block_hash == block_hash @@ -407,27 +407,23 @@ def test_generate_block_hash_extra_keys_cache_salt(): assert next_mm_idx == 1 -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_block_tokens(hash_fn): - import vllm.v1.core.kv_cache_utils init_none_hash(hash_fn) - parent_block_hash = 123 + parent_block_hash = BlockHash(b"123") 
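These assertions make the chaining explicit: a block hash is just hash_fn over (parent hash, block token ids, extra keys), so the digest alone carries all the identity the cache needs and the old token_ids/extra_keys fields on BlockHash can go away. A minimal standalone sketch of the chain:

    import hashlib
    import pickle

    def hash_fn(obj) -> bytes:  # stands in for sha256 / sha256_cbor
        return hashlib.sha256(pickle.dumps(obj)).digest()

    NONE_HASH = hash_fn("illustrative-seed")  # root of every chain

    block0 = hash_fn((NONE_HASH, (0, 1, 2), ("mm-hash-1",)))
    block1 = hash_fn((block0, (3, 4, 5), None))

    # Any upstream change (tokens, extra keys, parent) alters all later hashes.
    block0_alt = hash_fn((NONE_HASH, (0, 1, 2), None))
    assert block1 != hash_fn((block0_alt, (3, 4, 5), None))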
curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") block_hash = hash_block_tokens(hash_fn, parent_block_hash, curr_block_token_ids, extra_keys) - assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash) - assert block_hash.hash_value == hash_fn( - (parent_block_hash, curr_block_token_ids, extra_keys)) - assert block_hash.token_ids == curr_block_token_ids - assert block_hash.extra_keys == extra_keys + expected = hash_fn((parent_block_hash, curr_block_token_ids, extra_keys)) + assert block_hash == expected -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_request_block_hasher(hash_fn): - import vllm.v1.core.kv_cache_utils - init_none_hash(hash_fn) + kv_cache_utils.init_none_hash(hash_fn) + request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -442,19 +438,13 @@ def test_request_block_hasher(hash_fn): block_hashes = request.block_hashes assert len(block_hashes) == 2 - assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash) - assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash) - - # Check the first block - assert block_hashes[0].token_ids == (0, 1, 2) - assert block_hashes[0].extra_keys == ("hash1", ) - - # Check the second block - assert block_hashes[1].token_ids == (3, 4, 5) - assert block_hashes[1].extra_keys == ("hash2", ) + assert block_hashes[0] == hash_fn( + (kv_cache_utils.NONE_HASH, (0, 1, 2), ("hash1", ))) + assert block_hashes[1] == hash_fn( + (block_hashes[0], (3, 4, 5), ("hash2", ))) -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_tokens_different_mm_input(hash_fn): init_none_hash(hash_fn) @@ -484,9 +474,9 @@ def test_hash_tokens_different_mm_input(hash_fn): assert block_hashes1[1] != block_hashes2[1] -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_request_tokens_no_mm_inputs(hash_fn): - init_none_hash(hash_fn) + kv_cache_utils.init_none_hash(hash_fn) request = make_request( request_id="0", @@ -500,10 +490,9 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn): block_hashes = request.block_hashes assert len(block_hashes) == 2 - assert block_hashes[0].token_ids == (0, 1, 2) - assert block_hashes[0].extra_keys is None - assert block_hashes[1].token_ids == (3, 4, 5) - assert block_hashes[1].extra_keys is None + assert block_hashes[0] == hash_fn( + (kv_cache_utils.NONE_HASH, (0, 1, 2), None)) + assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None)) def test_metrics(): diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e7a8f63702b30..659d768bcf2e9 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -8,17 +8,19 @@ from typing import Callable, Optional import pytest import torch +import vllm.v1.core.kv_cache_utils as kv_cache_utils from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams -from vllm.utils import sha256, sha256_cbor_64bit +from vllm.utils import sha256, sha256_cbor from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request -from vllm.v1.core.kv_cache_utils import (BlockHash, 
BlockHashWithGroupId, - KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, + get_block_hash, get_group_id, get_request_block_hasher, - hash_block_tokens, init_none_hash) + hash_block_tokens, init_none_hash, + make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) @@ -101,8 +103,10 @@ def make_kv_cache_config_hybrid_model(block_size: int, ) -@pytest.mark.parametrize("hash_algo", ["sha256", "sha256_cbor_64bit", "hash"]) -def test_prefill(hash_algo): +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) +def test_prefill(hash_fn): + init_none_hash(hash_fn) + block_size = 16 manager = KVCacheManager( make_kv_cache_config(block_size, 11), @@ -110,10 +114,6 @@ def test_prefill(hash_algo): enable_caching=True, ) - # choose the hash function according to the parameter - hash_fn = (sha256_cbor_64bit if hash_algo == "sha256_cbor_64bit" else - sha256 if hash_algo == "sha256" else hash) - # Complete 3 blocks (48 tokens) common_token_ids = [i for i in range(3) for _ in range(16)] @@ -137,10 +137,12 @@ def test_prefill(hash_algo): block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16]) block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens) - assert manager.block_pool.blocks[ - block_id].block_hash.block_hash == block_hash + blk_hash = manager.block_pool.blocks[block_id].block_hash + assert blk_hash is not None + assert get_block_hash(blk_hash) == block_hash + assert get_group_id(blk_hash) == 0 assert manager.block_pool.blocks[block_id].ref_cnt == 1 - parent_block_hash = block_hash.hash_value + parent_block_hash = block_hash # Check partial block metadata for block_id in (4, ): @@ -233,7 +235,7 @@ def test_prefill_hybrid_model(): enable_caching=True, ) - hash_fn = hash + hash_fn = sha256 # Complete 3 blocks (48 tokens) common_token_ids = [i for i in range(3) for _ in range(block_size)] @@ -260,11 +262,13 @@ def test_prefill_hybrid_model(): block_tokens = tuple(all_token_ids[(length - 1) * 16:length * 16]) block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens) - for block_id in block_ids: - assert manager.block_pool.blocks[ - block_id].block_hash.block_hash == block_hash + for group_id, block_id in enumerate(block_ids): + blk_hash = manager.block_pool.blocks[block_id].block_hash + assert blk_hash is not None + assert get_block_hash(blk_hash) == block_hash + assert get_group_id(blk_hash) == group_id assert manager.block_pool.blocks[block_id].ref_cnt == 1 - parent_block_hash = block_hash.hash_value + parent_block_hash = block_hash # Check partial block metadata for block_id in (4, 8, 12): @@ -298,11 +302,10 @@ def test_prefill_hybrid_model(): cached_block_hash_to_block_bak = copy.copy( manager.block_pool.cached_block_hash_to_block) - def test_partial_request_hit(request_id: str, - hash_to_evict: list[BlockHashWithGroupId], + def test_partial_request_hit(request_id: str, hash_to_evict: list[bytes], expect_hit_length: int): req = make_request(request_id, common_token_ids + unique_token_ids, - block_size, hash) + block_size, sha256) for hash_with_group_id in hash_to_evict: manager.block_pool.cached_block_hash_to_block.pop( hash_with_group_id) @@ -319,33 +322,32 @@ def test_prefill_hybrid_model(): # Evict the blocks outside sliding window, does not affect the hit length. 
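make_block_hash_with_group_id, used throughout these eviction cases, replaces the old BlockHashWithGroupId tuple with a flat key combining digest and group id. One plausible packing, shown purely as an assumption since only distinctness and the get_* accessors matter to these tests:

    def make_block_hash_with_group_id(block_hash: bytes,
                                      group_id: int) -> bytes:
        return block_hash + group_id.to_bytes(4, "big")  # assumed encoding

    def get_block_hash(key: bytes) -> bytes:
        return key[:-4]

    def get_group_id(key: bytes) -> int:
        return int.from_bytes(key[-4:], "big")

    key = make_block_hash_with_group_id(b"\x01" * 32, 7)
    assert get_block_hash(key) == b"\x01" * 32
    assert get_group_id(key) == 7
    assert key != make_block_hash_with_group_id(b"\x01" * 32, 8)  # distinct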
test_partial_request_hit("2", [ - BlockHashWithGroupId(block_hashes[0], 1), - BlockHashWithGroupId(block_hashes[0], 2) + make_block_hash_with_group_id(block_hashes[0], 1), + make_block_hash_with_group_id(block_hashes[0], 2) ], 3) # Evict the first block of full attention, makes total cache miss. - test_partial_request_hit("3", [ - BlockHashWithGroupId(block_hashes[0], 0), - ], 0) + test_partial_request_hit( + "3", [make_block_hash_with_group_id(block_hashes[0], 0)], 0) # Evict the last block of all layers, reduces the hit length to 2. test_partial_request_hit("4", [ - BlockHashWithGroupId(block_hashes[2], 0), - BlockHashWithGroupId(block_hashes[2], 1), - BlockHashWithGroupId(block_hashes[2], 2), + make_block_hash_with_group_id(block_hashes[2], 0), + make_block_hash_with_group_id(block_hashes[2], 1), + make_block_hash_with_group_id(block_hashes[2], 2), ], 2) # Evict the last block of full attention, reduces the hit length to 2. - test_partial_request_hit("5", [BlockHashWithGroupId(block_hashes[2], 0)], - 2) + test_partial_request_hit( + "5", [make_block_hash_with_group_id(block_hashes[2], 0)], 2) # Evict the last block of sliding window, reduces the hit length to 2. - test_partial_request_hit("6", [BlockHashWithGroupId(block_hashes[2], 1)], - 2) + test_partial_request_hit( + "6", [make_block_hash_with_group_id(block_hashes[2], 1)], 2) # Evict the last block of sliding window, reduces the hit length to 2. - test_partial_request_hit("7", [BlockHashWithGroupId(block_hashes[2], 2)], - 2) + test_partial_request_hit( + "7", [make_block_hash_with_group_id(block_hashes[2], 2)], 2) # Evict different set of blocks for full attention and sliding window makes # total cache miss. @@ -353,9 +355,9 @@ def test_prefill_hybrid_model(): # The cache hit length of sliding window is 2 * block_size. # Then it is cache miss as the two type of layers have different hit length. 
test_partial_request_hit("8", [ - BlockHashWithGroupId(block_hashes[2], 0), - BlockHashWithGroupId(block_hashes[0], 1), - BlockHashWithGroupId(block_hashes[0], 2), + make_block_hash_with_group_id(block_hashes[2], 0), + make_block_hash_with_group_id(block_hashes[0], 1), + make_block_hash_with_group_id(block_hashes[0], 2), ], 0) @@ -372,8 +374,8 @@ def test_prefill_plp(): max_model_len=8192, enable_caching=True, ) - # the default hash function is hash - hash_fn = hash + # the default hash function is sha256 + hash_fn = sha256 # Complete 3 blocks (48 tokens) common_token_ids = [i for i in range(3) for _ in range(16)] @@ -404,10 +406,12 @@ def test_prefill_plp(): block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16]) block_hash = hash_block_tokens(hash_fn, parent_block_hash, block_tokens) - assert manager.block_pool.blocks[ - block_id].block_hash.block_hash == block_hash + blk_hash = (manager.block_pool.blocks[block_id].block_hash) + assert blk_hash is not None + assert get_block_hash(blk_hash) == block_hash + assert get_group_id(blk_hash) == 0 assert manager.block_pool.blocks[block_id].ref_cnt == 1 - parent_block_hash = block_hash.hash_value + parent_block_hash = block_hash # Check partial block metadata for block_id in (4, ): @@ -493,7 +497,7 @@ def test_decode(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 req0 = make_request("0", common_token_ids + unique_token_ids, block_size, - hash) + sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -538,7 +542,7 @@ def test_evict(): ) last_token_id = 5 * 16 + 7 - req0 = make_request("0", list(range(last_token_id)), block_size, hash) + req0 = make_request("0", list(range(last_token_id)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -550,7 +554,7 @@ def test_evict(): # 3 blocks. req1 = make_request("1", list(range(last_token_id, last_token_id + 3 * 16)), block_size, - hash) + sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -572,7 +576,7 @@ def test_evict(): ] == [10, 6, 5, 4, 3, 2, 1, 9, 8, 7] # Touch the first 2 blocks. - req2 = make_request("2", list(range(2 * 16 + 3)), block_size, hash) + req2 = make_request("2", list(range(2 * 16 + 3)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert computed_blocks.get_block_ids() == ([1, 2], ) assert num_computed_tokens == 2 * 16 @@ -597,7 +601,7 @@ def test_hash_block_correct_reuse(): # Allocate 1 block and cache it. num_tokens = block_size * 1 - req = make_request("0", list(range(num_tokens)), block_size, hash) + req = make_request("0", list(range(num_tokens)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -611,7 +615,7 @@ def test_hash_block_correct_reuse(): # Allocate a new block that's not full, make sure hash info on the # block is cleared. 
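The hybrid-model cases a few tests above all exercise one rule: every KV-cache group must report the same cached-prefix length, and any disagreement turns the lookup into a full miss. A simplified sketch of that reduction, with the group-agreement check as the only behavior claimed:

    def common_hit_blocks(per_group_hit_blocks: list[int]) -> int:
        # Simplified from the tests: groups must agree, else the hit is dropped.
        if len(set(per_group_hit_blocks)) == 1:
            return per_group_hit_blocks[0]
        return 0

    assert common_hit_blocks([3, 3, 3]) == 3  # all layers agree: 3-block hit
    assert common_hit_blocks([1, 2, 2]) == 0  # full vs. sliding window disagree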
- req = make_request("1", list(range(num_tokens - 1)), block_size, hash) + req = make_request("1", list(range(num_tokens - 1)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -638,7 +642,7 @@ def test_computed_blocks_not_evicted(): # Allocate a block and cache it. num_tokens = block_size * 1 - req0 = make_request("0", list(range(num_tokens)), block_size, hash) + req0 = make_request("0", list(range(num_tokens)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -650,7 +654,7 @@ def test_computed_blocks_not_evicted(): # Allocate another block. req1 = make_request("1", list(range(num_tokens, num_tokens * 2)), - block_size, hash) + block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -666,7 +670,7 @@ def test_computed_blocks_not_evicted(): # Now if we have a cache hit on the first block, we should evict the second # cached block rather than the first one. - req2 = make_request("2", list(range(num_tokens * 2)), block_size, hash) + req2 = make_request("2", list(range(num_tokens * 2)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 1 assert computed_blocks.blocks[0][0].block_id == 1 @@ -691,7 +695,7 @@ def test_basic_prefix_caching_disabled(): ) req1 = make_request("1", list(range(10)), block_size, - hash) # 2 blocks and some more + sha256) # 2 blocks and some more computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] @@ -706,7 +710,7 @@ def test_basic_prefix_caching_disabled(): # No caching. req2 = make_request("2", list(range(16)), block_size, - hash) # shared prefix + sha256) # shared prefix computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -716,7 +720,7 @@ def test_basic_prefix_caching_disabled(): assert len(blocks.blocks[0]) == 4 # New requests should not have any blocks. - req3 = make_request("3", list(range(4)), block_size, hash) + req3 = make_request("3", list(range(4)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -726,7 +730,7 @@ def test_basic_prefix_caching_disabled(): assert not blocks -@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) +@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_cache_blocks(hash_fn): """ This is a unit test that tests the correctness of the _cache_full_blocks @@ -787,7 +791,7 @@ def test_cache_blocks_multi_group(): # Block 1/5: [4, 5, 6, 7] # Block 2/6: [8, 9, 10, 11] # Block 3/7: [12, 13] - req = make_request("0", list(range(14)), block_size, hash) + req = make_request("0", list(range(14)), block_size, sha256) # Cache the blocks for group 0. blocks = [KVCacheBlock(block_id=i) for i in range(2)] @@ -845,6 +849,8 @@ def test_mm_prefix_caching(): """ This tests that the multi-modal prefix caching is correct. 
""" + kv_cache_utils.init_none_hash(sha256) + block_size = 16 manager = KVCacheManager( make_kv_cache_config(block_size, 11), @@ -874,23 +880,30 @@ def test_mm_prefix_caching(): req0 = make_request("0", all_token_ids, block_size, - hash, + sha256, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - # Completed block should have hashes with extra keys. + # Completed block should have hashes assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 block_hashes = req0.block_hashes assert len(block_hashes) == 3 - assert block_hashes[0].extra_keys == ("aaa", ) - assert block_hashes[1].extra_keys == ("aaa", "bbb") - assert block_hashes[2].extra_keys == ("bbb", ) + assert block_hashes[0] == sha256( + (kv_cache_utils.NONE_HASH, tuple(all_token_ids[:block_size]), + ("aaa", ))) + assert block_hashes[1] == sha256( + (block_hashes[0], tuple(all_token_ids[block_size:block_size * 2]), + ("aaa", "bbb"))) + assert block_hashes[2] == sha256( + (block_hashes[1], tuple(all_token_ids[block_size * 2:block_size * 3]), + ("bbb", ))) blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks) + assert blocks is not None assert blocks.get_block_ids() == ([1, 2, 3, 4], ) req0.num_computed_tokens = 59 @@ -901,10 +914,10 @@ def test_mm_prefix_caching(): len(computed_blocks.blocks[0]) * 16, computed_blocks) assert new_blocks is not None and len(new_blocks.blocks[0]) == 0 - - # The just completed block should have hashes with extra keys. assert len(block_hashes) == 4 - assert block_hashes[3].extra_keys == ("ccc", ) + assert block_hashes[3] == sha256( + (block_hashes[2], tuple(all_token_ids[3 * block_size:] + [8] * 5), + ("ccc", ))) # Cache hit. unique_token_ids = [-1] * 7 + [200] * 5 @@ -916,7 +929,7 @@ def test_mm_prefix_caching(): req1 = make_request("1", all_token_ids, block_size, - hash, + sha256, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) @@ -929,6 +942,8 @@ def test_cache_key_salting(): This tests that cache salts are applied during hashing and the cache is separated cache as expected. """ + kv_cache_utils.init_none_hash(sha256) + block_size = 16 manager = KVCacheManager( make_kv_cache_config(block_size, 11), @@ -939,21 +954,26 @@ def test_cache_key_salting(): # 3 complete blocks and an incomplete block with 11 tokens. common_token_ids = [i for i in range(3) for _ in range(block_size)] token_ids = common_token_ids + [3] * 11 - req0 = make_request("0", token_ids, block_size, hash, cache_salt="salt1") + req0 = make_request("0", token_ids, block_size, sha256, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - # Completed block should have hashes with extra keys. 
+ # Completed block should have hashes assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 block_hashes = req0.block_hashes assert len(block_hashes) == 3 - assert block_hashes[0].extra_keys == ("salt1", ) - assert block_hashes[1].extra_keys is None - assert block_hashes[2].extra_keys is None + assert block_hashes[0] == sha256( + (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt1", ))) + assert block_hashes[1] == sha256( + (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None)) + assert block_hashes[2] == sha256( + (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]), + None)) blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks) + assert blocks is not None assert blocks.get_block_ids() == ([1, 2, 3, 4], ) req0.num_computed_tokens = 59 @@ -964,14 +984,13 @@ def test_cache_key_salting(): len(computed_blocks.blocks[0]) * 16, computed_blocks) assert new_blocks is not None and len(new_blocks.blocks[0]) == 0 - - # Now one more block that should not have extra keys. assert len(block_hashes) == 4 - assert block_hashes[3].extra_keys is None + assert block_hashes[3] == sha256( + (block_hashes[2], tuple(token_ids[3 * block_size:] + [8] * 5), None)) # Test cache hit with a new request that has the same salt. token_ids = common_token_ids + [4] * 11 - req1 = make_request("1", token_ids, block_size, hash, cache_salt="salt1") + req1 = make_request("1", token_ids, block_size, sha256, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) # Should match only a prefix of 3 blocks. assert len(computed_blocks.blocks[0]) == 3 @@ -979,13 +998,19 @@ def test_cache_key_salting(): # Test cache miss with same content but different salt. token_ids = common_token_ids + [4] * 11 - req2 = make_request("2", token_ids, block_size, hash, cache_salt="salt2") + req2 = make_request("2", token_ids, block_size, sha256, cache_salt="salt2") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 0 assert num_computed_tokens == 0 block_hashes = req2.block_hashes assert len(block_hashes) == 3 - assert block_hashes[0].extra_keys == ("salt2", ) + assert block_hashes[0] == sha256( + (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt2", ))) + assert block_hashes[1] == sha256( + (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None)) + assert block_hashes[2] == sha256( + (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]), + None)) def test_prefill_not_enough_free_blocks_with_computed_blocks(): @@ -1004,7 +1029,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # Complete 3 blocks (48 tokens) # | Common-0 | Common-1 | Common-2 | ... | common_token_ids = [i for i in range(3) for _ in range(16)] - req0 = make_request("0", common_token_ids, block_size, hash) + req0 = make_request("0", common_token_ids, block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1015,7 +1040,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): req0.request_id] # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... 
| - req1 = make_request("1", common_token_ids * 2, block_size, hash) + req1 = make_request("1", common_token_ids * 2, block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert computed_blocks.blocks[0] == block_part0 assert num_computed_tokens == 3 * 16 @@ -1032,7 +1057,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | # | Req1-5(F)| Req2-0 | Req2-1 | ... | - req2 = make_request("2", [7] * block_size * 2, block_size, hash) + req2 = make_request("2", [7] * block_size * 2, block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1044,7 +1069,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # but it cannot be allocated due to insufficient free blocks (2). # In this case, the ref_cnt of the computed blocks should not be changed. assert manager.block_pool.free_block_queue.num_free_blocks == 5 - req3 = make_request("3", common_token_ids * 3, block_size, hash) + req3 = make_request("3", common_token_ids * 3, block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert computed_blocks.blocks[0] == block_part1 assert num_computed_tokens == 6 * 16 @@ -1069,13 +1094,13 @@ def test_reset_prefix_cache(): full_block_token_ids = [i for i in range(3) for _ in range(16)] unique_token_ids = [3] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids, block_size, hash) + req0 = make_request("0", all_token_ids, block_size, sha256) blocks = manager.allocate_slots(req0, 55) assert blocks.get_block_ids() == ([1, 2, 3, 4], ) unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req1 = make_request("1", all_token_ids, block_size, hash) + req1 = make_request("1", all_token_ids, block_size, sha256) computed_blocks, _ = manager.get_computed_blocks(req1) assert len(req1.block_hashes) == 3 assert len(computed_blocks.blocks[0]) == 3 @@ -1109,7 +1134,7 @@ def test_prefix_cache_stats_disabled(): assert manager.prefix_cache_stats is None # Call all functions that check whether log_stats is disabled. 
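test_cache_key_salting above relies on the salt entering only the first block's extra keys; chaining then propagates it to every later hash, so two salts can never share cached blocks. A standalone sketch:

    import hashlib
    import pickle

    def h(obj) -> bytes:
        return hashlib.sha256(pickle.dumps(obj)).digest()

    NONE_HASH = h("illustrative-seed")

    first_salt1 = h((NONE_HASH, (0, 1, 2), ("salt1",)))
    first_salt2 = h((NONE_HASH, (0, 1, 2), ("salt2",)))
    assert first_salt1 != first_salt2  # salted roots never collide

    # Later blocks carry no salt themselves, yet diverge via the parent hash.
    assert h((first_salt1, (3, 4, 5), None)) != h((first_salt2, (3, 4, 5), None))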
- req = make_request("0", list(range(16)), block_size, hash) + req = make_request("0", list(range(16)), block_size, sha256) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1124,15 +1149,9 @@ def test_prefix_cache_stats_disabled(): def test_maybe_evict_cached_block(): pool = BlockPool(num_gpu_blocks=4, enable_caching=True) - block_hash0 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=10, - token_ids=(100, )), - group_id=1000) - block_hash1 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=20, - token_ids=(200, )), - group_id=2000) - block_hash2 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=30, - token_ids=(300, )), - group_id=3000) + block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000) + block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000) + block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000) block_hashes = [ block_hash0, block_hash1, @@ -1206,7 +1225,7 @@ def test_kv_cache_events(blocks_to_cache: int): ) num_tokens = block_size * blocks_to_cache - req0 = make_request("0", list(range(num_tokens)), block_size, hash) + req0 = make_request("0", list(range(num_tokens)), block_size, sha256) _ = manager.allocate_slots(req0, num_tokens) events = manager.take_events() @@ -1222,7 +1241,7 @@ def test_kv_cache_events(blocks_to_cache: int): # Should see block_to_cache number of removed block events and a new block # stored event manager.free(req0) - req1 = make_request("1", list(range(num_tokens)), block_size, hash) + req1 = make_request("1", list(range(num_tokens)), block_size, sha256) _ = manager.allocate_slots(req1, num_tokens) events = manager.take_events() @@ -1256,7 +1275,7 @@ def test_eagle_enabled_removes_last_block(): # Request with 3 full blocks (48 tokens) token_ids = [0] * (3 * block_size) - req = make_request("divisible_request", token_ids, block_size, hash) + req = make_request("divisible_request", token_ids, block_size, sha256) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1266,7 +1285,7 @@ def test_eagle_enabled_removes_last_block(): manager.free(req) # New request with same tokens + Eagle enabled - req_eagle = make_request("eagle_divisible", token_ids, block_size, hash) + req_eagle = make_request("eagle_divisible", token_ids, block_size, sha256) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Should retain 1 block: @@ -1287,7 +1306,7 @@ def test_eagle_with_partial_blocks(): ) # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids, block_size, hash) + req = make_request("partial_block_test", token_ids, block_size, sha256) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1297,7 +1316,7 @@ def test_eagle_with_partial_blocks(): manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids, block_size, hash) + req_eagle = make_request("partial_eagle", token_ids, block_size, sha256) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1328,7 +1347,7 @@ def test_eagle_with_sliding_window(): # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids, block_size, hash) + req = make_request("partial_block_test", token_ids, 
block_size, sha256) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1341,7 +1360,7 @@ def test_eagle_with_sliding_window(): manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids, block_size, hash) + req_eagle = make_request("partial_eagle", token_ids, block_size, sha256) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1351,11 +1370,11 @@ def test_eagle_with_sliding_window(): assert manager.block_pool.get_cached_block( block_hash_first_block, kv_cache_group_ids=[0]) is not None manager.block_pool.cached_block_hash_to_block.pop( - BlockHashWithGroupId(block_hash_first_block, 0)) + make_block_hash_with_group_id(block_hash_first_block, 0)) # New request req_after_evict = make_request("partial_eagle_after_evict", token_ids, - block_size, hash) + block_size, sha256) computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict) # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is # not considered. But after dropping the last matched block due to eagle, diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py index 7dcebba491fab..b70850a9bcff9 100644 --- a/tests/v1/core/test_single_type_kv_cache_manager.py +++ b/tests/v1/core/test_single_type_kv_cache_manager.py @@ -6,8 +6,8 @@ import random import torch from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - KVCacheBlock) +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, + make_block_hash_with_group_id) from vllm.v1.core.single_type_kv_cache_manager import ( ChunkedLocalAttentionManager, SlidingWindowManager) from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, @@ -44,7 +44,7 @@ def test_chunked_local_attention_possible_cached_prefix(): def run_one_case(block_is_cached, tail_token, expect_length): block_hash_list = [ - BlockHash(i, ()) for i in range(len(block_is_cached)) + BlockHash(str(i).encode()) for i in range(len(block_is_cached)) ] block_pool.cached_block_hash_to_block.clear() @@ -53,8 +53,8 @@ def test_chunked_local_attention_possible_cached_prefix(): for i, (block_hash, is_cached) in enumerate(zip(block_hash_list, block_is_cached)): if is_cached: - block_pool.cached_block_hash_to_block[BlockHashWithGroupId( - block_hash, 0)] = { + block_pool.cached_block_hash_to_block[ + make_block_hash_with_group_id(block_hash, 0)] = { i: block_pool.blocks[i + 10], } @@ -109,7 +109,7 @@ def test_sliding_window_possible_cached_prefix(): def run_one_case(block_is_cached, expect_length): block_hash_list = [ - BlockHash(i, ()) for i in range(len(block_is_cached)) + BlockHash(str(i).encode()) for i in range(len(block_is_cached)) ] block_pool.cached_block_hash_to_block.clear() @@ -118,8 +118,8 @@ def test_sliding_window_possible_cached_prefix(): for i, (block_hash, is_cached) in enumerate(zip(block_hash_list, block_is_cached)): if is_cached: - block_pool.cached_block_hash_to_block[BlockHashWithGroupId( - block_hash, 0)] = { + block_pool.cached_block_hash_to_block[ + make_block_hash_with_group_id(block_hash, 0)] = { i: block_pool.blocks[i + 10], } diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index e392c2c336e9b..d343141cdf4cb 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, 
KVTransferConfig, ModelConfig, from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams +from vllm.utils import sha256 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.async_scheduler import AsyncScheduler @@ -130,10 +131,10 @@ def create_requests( ) -> list[Request]: global _none_hash_initialized if not _none_hash_initialized: - init_none_hash(hash) + init_none_hash(sha256) _none_hash_initialized = True - block_hasher = get_request_block_hasher(block_size, hash) + block_hasher = get_request_block_hasher(block_size, sha256) sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens, stop_token_ids=stop_token_ids, diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index f70a3ce147ff2..23ec3673b10b4 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -36,18 +36,19 @@ def test_prefix_caching_from_cli(): assert vllm_config.cache_config.enable_prefix_caching # default hash algorithm is "builtin" - assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin" + assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256" + + # set hash algorithm to sha256_cbor + args = parser.parse_args(["--prefix-caching-hash-algo", "sha256_cbor"]) + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert vllm_config.cache_config.prefix_caching_hash_algo == \ + "sha256_cbor" # set hash algorithm to sha256 args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"]) vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256" - # set hash algorithm to builtin - args = parser.parse_args(["--prefix-caching-hash-algo", "builtin"]) - vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() - assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin" - # an invalid hash algorithm raises an error parser.exit_on_error = False with pytest.raises(ArgumentError): diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 3f068d5e8c7eb..0cae1c7bc0518 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -13,6 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa SharedStorageConnector) +from vllm.utils import sha256 from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -127,11 +128,11 @@ def create_request(request_id: int, use_all_1s_for_prompt_tokens: bool = False, num_remote_blocks: int = 3, block_size: int = 16, - hash_fn: Callable = hash) -> Request: + hash_fn: Callable = sha256) -> Request: """Make dummy request for testing.""" global _none_hash_initialized if not _none_hash_initialized: - init_none_hash(hash) + init_none_hash(hash_fn) _none_hash_initialized = True kv_transfer_params: Optional[dict[str, Any]] = None diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 5cc630b72846d..bf85aad452d0f 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -24,7 +24,7 @@ logger = init_logger(__name__) BlockSize = Literal[1, 8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] MambaDType = 
Literal["auto", "float32"] -PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] +PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"] @config @@ -63,17 +63,12 @@ class CacheConfig: """Sliding window size for the KV cache. This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" enable_prefix_caching: Optional[bool] = None - """Whether to enable prefix caching. Disabled by default for V0. Enabled by - default for V1.""" - prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin" + """Whether to enable prefix caching. Enabled by default for V1.""" + prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" """Set the hash algorithm for prefix caching:\n - - "builtin" is Python's built-in hash.\n - - "sha256" is collision resistant but with certain overheads. - This option uses Pickle for object serialization before hashing.\n - - "sha256_cbor_64bit" provides a reproducible, cross-language compatible - hash. It serializes objects using canonical CBOR and hashes them with - SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256 - digest.""" + - "sha256" uses Pickle for object serialization before hashing.\n + - "sha256_cbor" provides a reproducible, cross-language compatible hash. It + serializes objects using canonical CBOR and hashes them with SHA-256.""" cpu_offload_gb: float = 0 """The space in GiB to offload to CPU, per GPU. Default is 0, which means no offloading. Intuitively, this argument can be seen as a virtual way to diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 09f42b550fe2b..46f0cd9289b23 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -16,6 +16,7 @@ import zmq from vllm.config.kv_events import KVEventsConfig from vllm.logger import init_logger +from vllm.v1.core.kv_cache_utils import ExternalBlockHash logger = init_logger(__name__) @@ -44,8 +45,8 @@ MEDIUM_GPU = "GPU" class BlockStored(KVCacheEvent): - block_hashes: list[int] - parent_block_hash: Optional[int] + block_hashes: list[ExternalBlockHash] + parent_block_hash: Optional[ExternalBlockHash] token_ids: list[int] block_size: int lora_id: Optional[int] @@ -53,7 +54,7 @@ class BlockStored(KVCacheEvent): class BlockRemoved(KVCacheEvent): - block_hashes: list[int] + block_hashes: list[ExternalBlockHash] medium: Optional[str] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bee97f4cd04d8..94c984116131d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1592,20 +1592,12 @@ class EngineArgs: "in low performance due to small KV cache size. Consider " "setting --max-model-len to a smaller value.", max_model_len) - # if using prefix caching, we must set a hash algo - if self.enable_prefix_caching: - # Disable prefix caching for multimodal models for VLLM_V0. - if model_config.is_multimodal_model: - logger.warning( - "--enable-prefix-caching is not supported for multimodal " - "models in V0 and has been disabled.") - self.enable_prefix_caching = False - - # VLLM_V0 only supports builtin hash algo for prefix caching. - if self.prefix_caching_hash_algo == "sha256": - raise ValueError( - "sha256 is not supported for prefix caching in V0 engine. " - "Please use 'builtin'.") + # Disable prefix caching for multimodal models for VLLM_V0. 
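With "builtin" gone, the config now offers only the two byte-digest algorithms, and sha256 becomes the default. Selection goes through get_hash_fn_by_name, shown further below; a usage sketch:

    from vllm.utils import get_hash_fn_by_name

    pickle_fn = get_hash_fn_by_name("sha256")      # pickle + SHA-256 (default)
    cbor_fn = get_hash_fn_by_name("sha256_cbor")   # canonical CBOR + SHA-256

    assert isinstance(pickle_fn((1, 2, 3)), bytes)
    assert isinstance(cbor_fn((1, 2, 3)), bytes)

    try:
        get_hash_fn_by_name("builtin")  # no longer supported
    except ValueError as e:
        print(e)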
+ if self.enable_prefix_caching and model_config.is_multimodal_model: + logger.warning( + "--enable-prefix-caching is not supported for multimodal " + "models in V0 and has been disabled.") + self.enable_prefix_caching = False # Set max_num_seqs to 256 for VLLM_V0. if self.max_num_seqs is None: diff --git a/vllm/envs.py b/vllm/envs.py index 927bea3bf9538..8d199da45b082 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -171,6 +171,7 @@ if TYPE_CHECKING: VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False + VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True def get_default_cache_root(): @@ -1215,6 +1216,11 @@ environment_variables: dict[str, Callable[[], Any]] = { # Add optional custom scopes for profiling, disable to avoid overheads "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))), + + # Represent block hashes in KV cache events as 64-bit integers instead of + # raw bytes. Defaults to True for backward compatibility. + "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": + lambda: bool(int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))), } # --8<-- [end:env-vars-definition] diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9c78e56d580e0..6d0cb3710bb93 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3249,7 +3249,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool: and getattr(cfg.attn_config, "alibi", False))))) -def sha256(input) -> int: +def sha256(input) -> bytes: """Hash any picklable Python object using SHA-256. The input is serialized using pickle before hashing, which allows @@ -3260,16 +3260,15 @@ def sha256(input) -> int: input: Any picklable Python object. Returns: - An integer representing the SHA-256 hash of the serialized input. + Bytes representing the SHA-256 hash of the serialized input. """ input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) - return int.from_bytes(hashlib.sha256(input_bytes).digest(), - byteorder="big") + return hashlib.sha256(input_bytes).digest() -def sha256_cbor_64bit(input) -> int: +def sha256_cbor(input) -> bytes: """ - Hash objects using CBOR serialization and SHA-256, then truncate to 64bits. + Hash objects using CBOR serialization and SHA-256. This option is useful for non-Python-dependent serialization and hashing. @@ -3280,17 +3279,13 @@ def sha256_cbor_64bit(input) -> int: Custom classes must implement CBOR serialization methods. Returns: - An integer in the range [0, 2^64-1] representing the lower 64 bits - of the SHA-256 hash of the CBOR serialized input. + Bytes representing the SHA-256 hash of the CBOR serialized input. """ input_bytes = cbor2.dumps(input, canonical=True) - full_hash = int.from_bytes(hashlib.sha256(input_bytes).digest(), - byteorder="big") - - return full_hash & ((1 << 64) - 1) + return hashlib.sha256(input_bytes).digest() -def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], int]: +def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]: """Get a hash function by name, or raise an error if the function is not found. 
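    For example (illustrative): get_hash_fn_by_name("sha256") returns the
    pickle-based sha256 function above, and calling the returned function on
    any picklable object yields a 32-byte SHA-256 digest.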
Args: @@ -3300,10 +3295,8 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], int]: """ if hash_fn_name == "sha256": return sha256 - if hash_fn_name == "sha256_cbor_64bit": - return sha256_cbor_64bit - if hash_fn_name == "builtin": - return hash + if hash_fn_name == "sha256_cbor": + return sha256_cbor raise ValueError(f"Unsupported hash function: {hash_fn_name}") diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index b537cac8e1d72..d1e1c1c8d0382 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -9,7 +9,11 @@ from vllm.distributed.kv_events import (MEDIUM_GPU, AllBlocksCleared, KVCacheEvent) from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - FreeKVCacheBlockQueue, KVCacheBlock) + ExternalBlockHash, + FreeKVCacheBlockQueue, KVCacheBlock, + get_block_hash, + make_block_hash_with_group_id, + maybe_convert_block_hash) from vllm.v1.request import Request logger = init_logger(__name__) @@ -84,8 +88,10 @@ class BlockPool: """ cached_blocks = [] for group_id in kv_cache_group_ids: + block_hash_with_group_id = make_block_hash_with_group_id( + block_hash, group_id) cached_blocks_one_group = self.cached_block_hash_to_block.get( - BlockHashWithGroupId(block_hash, group_id)) + block_hash_with_group_id) if not cached_blocks_one_group: return None first_block = next(iter(cached_blocks_one_group.values())) @@ -124,28 +130,29 @@ class BlockPool: assert len(request.block_hashes) >= num_full_blocks new_block_hashes = request.block_hashes[num_cached_blocks:] - new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events - else None) + new_hashes: Optional[list[ExternalBlockHash]] = ( + [] if self.enable_kv_cache_events else None) for i, blk in enumerate(new_full_blocks): assert blk.block_hash is None block_hash = new_block_hashes[i] # Update and add the full block to the cache. - block_hash_with_group_id = BlockHashWithGroupId( + block_hash_with_group_id = make_block_hash_with_group_id( block_hash, kv_cache_group_id) blk.block_hash = block_hash_with_group_id self.cached_block_hash_to_block[block_hash_with_group_id][ blk.block_id] = blk if new_hashes is not None: - new_hashes.append(block_hash.hash_value) + new_hashes.append(maybe_convert_block_hash(block_hash)) if self.enable_kv_cache_events: if num_cached_blocks == 0: - parent_block_hash = None + parent_block_hash: Optional[ExternalBlockHash] = None else: parent_block = blocks[num_cached_blocks - 1] assert parent_block.block_hash is not None - parent_block_hash = parent_block.block_hash.get_hash_value() + parent_block_hash = maybe_convert_block_hash( + get_block_hash(parent_block.block_hash)) self.kv_event_queue.append( BlockStored( @@ -220,7 +227,9 @@ class BlockPool: # we disable hybrid kv cache manager when kv cache event is # enabled, so there is only one group. 
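+                # strip the 4-byte group id suffix, then emit the hash in
+                # the external event format (a 64-bit int by default, raw
+                # bytes when VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES=0)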
self.kv_event_queue.append( - BlockRemoved(block_hashes=[block_hash.get_hash_value()], + BlockRemoved(block_hashes=[ + maybe_convert_block_hash(get_block_hash(block_hash)) + ], medium=MEDIUM_GPU)) return True diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index aff1183e499a4..2c0eac3ddd79d 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -6,11 +6,12 @@ import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence from dataclasses import astuple, dataclass -from typing import Any, Callable, NamedTuple, Optional +from typing import Any, Callable, NewType, Optional, Union +from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import GiB_bytes, cdiv, sha256_cbor_64bit +from vllm.utils import GiB_bytes, cdiv, sha256_cbor from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, @@ -18,59 +19,78 @@ from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request -logger = init_logger(__name__) +# BlockHash represents the hash of a single KV-cache block used for +# prefix caching. Treating it as a distinct type from ``bytes`` helps +# catch accidental misuse when passing around raw byte strings. +BlockHash = NewType("BlockHash", bytes) + +# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID. +# It is represented as raw bytes for compactness and efficiency. The helper +# functions below pack/unpack the ``BlockHash`` and group id into/from the key. +BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes) + +# ExternalBlockHash is the externally visible form of a block hash, e.g. as +# emitted in KV cache events. It is a union of ``bytes`` and ``int`` to keep +# backward compatibility now that block hashing defaults to sha256 bytes. +ExternalBlockHash = Union[bytes, int] -class BlockHash(NamedTuple): - """Hash value of a block (int), the token IDs in the block, and extra keys. - We keep a tuple of token IDs and extra keys to reduce the likelihood of - hash collisions when the hash value is the same. By using SHA256 however, - hash collisions are practically impossible. +def make_block_hash_with_group_id(block_hash: BlockHash, + group_id: int) -> BlockHashWithGroupId: + """Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``. + + The group id is encoded using 4 bytes in big-endian order and appended to + the block hash bytes. This representation avoids creating tuples while + still allowing us to recover both components when needed. """ - # Hash value of the block in an integer. - hash_value: int - # Token IDs in the block. - token_ids: tuple[int, ...] - # Extra keys for the block. - extra_keys: Optional[Any] = None + return BlockHashWithGroupId(block_hash + + group_id.to_bytes(4, "big", signed=False)) -class BlockHashWithGroupId(NamedTuple): - # The hash value for the contents (e.g., token_ids) of a block without group - # ID. The value is the same for blocks representing the same tokens but for - # different groups. - block_hash: BlockHash - # The KV cache group ID. 
- group_id: int +def get_block_hash(key: BlockHashWithGroupId) -> BlockHash: + """Extract the ``BlockHash`` from a ``BlockHashWithGroupId``.""" + return BlockHash(key[:-4]) - def get_hash_value(self) -> int: - return self.block_hash.hash_value +def get_group_id(key: BlockHashWithGroupId) -> int: + """Extract the group id from a ``BlockHashWithGroupId``.""" + return int.from_bytes(key[-4:], "big", signed=False) + + +def maybe_convert_block_hash(hash_bytes: BlockHash) -> ExternalBlockHash: + if not envs.VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: + return hash_bytes + return int.from_bytes(hash_bytes, byteorder="big") & ((1 << 64) - 1) + + +logger = init_logger(__name__) # The hash seed for the first block of any prefix block sequence. # # We use a random value to avoid hash collisions or PYTHONHASHSEED environment -# variable if set such that processes can share the seed if needed. -# This aligns with the behavior of Python's hash() function, which also uses -# a random seed if PYTHONHASHSEED is not set. +# variable if set such that processes can share the seed if needed. This aligns +# with the behavior of Python's hash() function, which also uses a random seed +# if PYTHONHASHSEED is not set. # # The function `init_none_hash` initializes this variable globally. -NONE_HASH: int +NONE_HASH: BlockHash -def init_none_hash(hash_fn: Callable): +def init_none_hash(hash_fn: Callable[[Any], bytes]): global NONE_HASH hash_seed = os.getenv("PYTHONHASHSEED") - if hash_seed is None and hash_fn is sha256_cbor_64bit: + if hash_seed is None and hash_fn is sha256_cbor: logger.warning( "PYTHONHASHSEED is not set. This will lead to non-reproducible " - "block-hashes when using sha256_cbor_64bit as the hash function." + "block-hashes when using sha256_cbor as the hash function. " "Consider setting PYTHONHASHSEED to a fixed value for " "reproducibility.") - NONE_HASH = (int.from_bytes(os.urandom(32), byteorder="big") - if hash_seed is None else hash_fn(hash_seed)) + if hash_seed is None: + NONE_HASH = BlockHash(os.urandom(32)) + else: + NONE_HASH = BlockHash(hash_fn(hash_seed)) class PrefixCachingMetrics: @@ -142,8 +162,8 @@ class KVCacheBlock: block_id: int # Reference count. ref_cnt: int = 0 - # The hash of the block composed of (block hash, tuple of token IDs). - # It is only available when the block is full. + # The hash key (block hash + group id) of the block, only available + # when the block is full and cached. _block_hash: Optional[BlockHashWithGroupId] = None # Used to construct a doubly linked list for free blocks. @@ -177,7 +197,7 @@ class KVCacheBlock: if self.next_free_block else None) return (f"KVCacheBlock(block_id={self.block_id}, " f"ref_cnt={self.ref_cnt}, " - f"_block_hash={self._block_hash}, " + f"_block_hash={self._block_hash!r}, " f"prev_free_block={prev_block_id}, " f"next_free_block={next_block_id})") @@ -517,15 +537,14 @@ def generate_block_hash_extra_keys( def hash_block_tokens( - hash_function: Callable, - parent_block_hash: Optional[int], + hash_function: Callable[[Any], bytes], + parent_block_hash: Optional[BlockHash], curr_block_token_ids: Sequence[int], extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. We use an LRU cache for this function to avoid recomputing hash values for the same block contents. - Args: hash_function: The hash function used to compute block hash. parent_block_hash: The hash of the parent block. 
None @@ -533,7 +552,6 @@ def hash_block_tokens( curr_block_token_ids: A list of token ids in the current block. The current block is assumed to be full. extra_keys: Extra keys for the block. - Returns: The hash of the block contents, which is used as the prefix-caching key. @@ -544,26 +562,16 @@ def hash_block_tokens( curr_block_token_ids_tuple = tuple(curr_block_token_ids) return BlockHash( hash_function( - (parent_block_hash, curr_block_token_ids_tuple, extra_keys)), - curr_block_token_ids_tuple, extra_keys) + (parent_block_hash, curr_block_token_ids_tuple, extra_keys))) def get_request_block_hasher( block_size: int, - caching_hash_fn: Callable[[Any], - int]) -> Callable[[Request], list[BlockHash]]: + caching_hash_fn: Callable[[Any], bytes], +) -> Callable[[Request], list[BlockHash]]: """ Returns a function which computes the list of un-computed block hashes - of a request. - - Each request holds a list of its block hashes (request.block_hashes). - When a request is created, it calls the below function to compute - the hashes of all full blocks of the request's initial tokens. - The hashes are then stored in request.block_hashes. - Later, whenever new tokens are appended to the request, it calls - the below function again to compute any new full blocks of tokens. - The returned new hashes are appended to request.block_hashes. - """ + of a request.""" def request_block_hasher(request: Request) -> list[BlockHash]: start_token_idx = len(request.block_hashes) * block_size @@ -577,8 +585,8 @@ def get_request_block_hasher( # last mm input. curr_mm_idx = -1 - prev_block_hash_value = request.block_hashes[-1].hash_value \ - if request.block_hashes else None + prev_block_hash_value = (request.block_hashes[-1] + if request.block_hashes else None) new_block_hashes: list[BlockHash] = [] while True: end_token_idx = start_token_idx + block_size @@ -598,7 +606,7 @@ def get_request_block_hasher( new_block_hashes.append(block_hash) start_token_idx += block_size - prev_block_hash_value = block_hash.hash_value + prev_block_hash_value = block_hash return new_block_hashes From b2f7745774fc3d5105c0e0babbfb656bfcacfcac Mon Sep 17 00:00:00 2001 From: cong-meta <prowindy@hotmail.com> Date: Mon, 8 Sep 2025 21:35:18 -0700 Subject: [PATCH 059/584] Add data_parallel_size to VllmConfig string representation (#24298) Co-authored-by: Cong Chen <congc@meta.com> --- vllm/config/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e84b924ada0a7..e474420e3f04c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3790,6 +3790,7 @@ class VllmConfig: f"load_format={self.load_config.load_format}, " f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa + f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa f"quantization={self.model_config.quantization}, " f"enforce_eager={self.model_config.enforce_eager}, " From 948dd3443bc6b8ffb76cbdddf3f4c5ae0b6637fa Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 9 Sep 2025 12:40:29 +0800 Subject: [PATCH 060/584] [Bugfix] Fix Apertus HF repo name (#24447) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/models/supported_models.md | 1 + tests/models/language/generation/test_common.py | 2 +- tests/models/registry.py 
| 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bdb29aac333c1..d23fdff568fcd 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -322,6 +322,7 @@ th { | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| +| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ | | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 8a04946b2ffb3..6fc8f1301fdb9 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -93,7 +93,7 @@ AITER_MODEL_LIST = [ "allenai/OLMoE-1B-7B-0924-Instruct", marks=[pytest.mark.cpu_model], ), - pytest.param("swiss-ai/Apertus-8B"), # apertus + pytest.param("swiss-ai/Apertus-8B-2509"), # apertus ]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 8bcdeb087c1f9..755a37b109d76 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -158,7 +158,7 @@ class _HfExamplesInfo: # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] - "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B", + "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-2509", min_transformers_version="4.56.0", trust_remote_code=True), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", From 0cdd213641d7adcf7ad0e9cf79867344ea1291d9 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 8 Sep 2025 21:43:48 -0700 Subject: [PATCH 061/584] [Misc] Improve Worker process title and logging prefix (#22205) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/utils/__init__.py | 10 ++---- vllm/v1/engine/core.py | 6 ++-- vllm/v1/engine/utils.py | 2 +- vllm/v1/executor/multiproc_executor.py | 42 ++++++++++++++++++-------- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 6d0cb3710bb93..49c706bc37a84 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3359,7 +3359,7 @@ def has_triton_kernels() -> bool: def set_process_title(name: str, suffix: str = "", - append: bool = False) -> None: + prefix: str = envs.VLLM_PROCESS_NAME_PREFIX) -> None: """ Set the current process title to a specific name with an optional suffix. @@ -3367,15 +3367,11 @@ def set_process_title(name: str, Args: name: The title to assign to the current process. suffix: An optional suffix to append to the base name. - append: Whether to append to the existing process title. + prefix: A prefix to prepend to the front separated by `::`. 
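        Example (illustrative, assuming the default prefix "VLLM"):
        set_process_title("Worker", suffix="TP0") sets the process title
        to "VLLM::Worker_TP0".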
""" if suffix: name = f"{name}_{suffix}" - if append: - name = f"{setproctitle.getproctitle()}_{name}" - else: - name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" - setproctitle.setproctitle(name) + setproctitle.setproctitle(f"{prefix}::{name}") def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e239e6cbba167..b46ae72ccdf1b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -224,7 +224,7 @@ class EngineCore: def add_request(self, request: Request, request_wave: int = 0): """Add request to the scheduler. - + `request_wave`: indicate which wave of requests this is expected to belong to in DP case """ @@ -433,7 +433,7 @@ class EngineCore: def preprocess_add_request( self, request: EngineCoreRequest) -> tuple[Request, int]: """Preprocess the request. - + This function could be directly used in input processing thread to allow request initialization running in parallel with Model forward """ @@ -697,7 +697,7 @@ class EngineCoreProc(EngineCore): parallel_config: ParallelConfig = kwargs[ "vllm_config"].parallel_config if parallel_config.data_parallel_size > 1 or dp_rank > 0: - set_process_title("DPEngineCore", str(dp_rank)) + set_process_title("EngineCore", f"DP{dp_rank}") decorate_logs() # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index ed0129fda9474..df2fd8d9df078 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -116,7 +116,7 @@ class CoreEngineProcManager: local_dp_ranks.append(local_index) self.processes.append( context.Process(target=target_fn, - name=f"EngineCore_{global_index}", + name=f"EngineCore_DP{global_index}", kwargs=common_kwargs | { "dp_rank": global_index, "local_dp_rank": local_index, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index c3d6c20e22e2a..bcf6dda9c1e91 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -26,6 +26,8 @@ from vllm.distributed import (destroy_distributed_environment, from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_pp_group, get_tp_group) from vllm.executor.multiproc_worker_utils import ( set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -397,17 +399,6 @@ class WorkerProc: wrapper.init_worker(all_kwargs) self.worker = wrapper - pp_size = vllm_config.parallel_config.pipeline_parallel_size - tp_size = vllm_config.parallel_config.tensor_parallel_size - pp_str = f"PP{rank // tp_size}" if pp_size > 1 else "" - tp_str = f"TP{rank % tp_size}" if tp_size > 1 else "" - suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}" - process_name = "VllmWorker" - if suffix: - set_process_title(suffix, append=True) - process_name = f"{process_name} {suffix}" - decorate_logs(process_name) - # Initialize MessageQueue for receiving SchedulerOutput self.rpc_broadcast_mq = MessageQueue.create_from_handle( input_shm_handle, self.worker.rank) @@ -425,8 +416,14 @@ class WorkerProc: name="WorkerAsyncOutputCopy") self.async_output_copy_thread.start() - # Initialize device and loads weights + # Initialize device self.worker.init_device() + + # Set process title and log prefix + self.setup_proc_title_and_log_prefix( + 
enable_ep=vllm_config.parallel_config.enable_expert_parallel) + + # Load model self.worker.load_model() @staticmethod @@ -663,3 +660,24 @@ class WorkerProc: if output_rank is None or self.rank == output_rank: self.handle_output(output) + + @staticmethod + def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: + dp_size = get_dp_group().world_size + dp_rank = get_dp_group().rank_in_group + pp_size = get_pp_group().world_size + pp_rank = get_pp_group().rank_in_group + tp_size = get_tp_group().world_size + tp_rank = get_tp_group().rank_in_group + process_name = "Worker" + if dp_size > 1: + process_name += f"_DP{dp_rank}" + if pp_size > 1: + process_name += f"_PP{pp_rank}" + if tp_size > 1: + process_name += f"_TP{tp_rank}" + if enable_ep: + ep_rank = get_ep_group().rank_in_group + process_name += f"_EP{ep_rank}" + set_process_title(name=process_name) + decorate_logs(process_name) From ed16d0f26fbd1fecde8285b82d898bfd79ceaa87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= <mickael@mistral.ai> Date: Tue, 9 Sep 2025 06:46:45 +0200 Subject: [PATCH 062/584] [Doc] mention fpdb for multiprocess breakpoints (#24452) Signed-off-by: Mickael Seznec <mickael@mistral.ai> --- docs/usage/troubleshooting.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 4945927e3d787..a82d97ea222f7 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -40,6 +40,34 @@ If other strategies don't solve the problem, it's likely that the vLLM instance - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. - `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time. +## Breakpoints + +Setting normal `pdb` breakpoints may not work in vLLM's codebase if they are executed in a subprocess. You will experience something like: + +``` text + File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 100, in trace_dispatch + return self.dispatch_line(frame) + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 125, in dispatch_line + if self.quitting: raise BdbQuit + ^^^^^^^^^^^^^ +bdb.BdbQuit +``` + +One solution is using [forked-pdb](https://github.com/Lightning-AI/forked-pdb). Install with `pip install fpdb` and set a breakpoint with something like: + +``` python +__import__('fpdb').ForkedPdb().set_trace() +``` + +Another option is to disable multiprocessing entirely, with the `VLLM_ENABLE_V1_MULTIPROCESSING` environment variable. +This keeps the scheduler in the same process, so you can use stock `pdb` breakpoints: + +``` python +import os +os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +``` + ## Incorrect network setup The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one. 
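As a companion to the forked-pdb tip above: the trick such wrappers use is simply to reattach stdin to the controlling terminal before entering the debugger, since a forked worker inherits a stdin it cannot prompt on. A minimal sketch of the idea (illustrative only, not the actual `fpdb` implementation):

``` python
import pdb
import sys


class ForkedPdbSketch(pdb.Pdb):
    """A pdb variant usable from a forked subprocess: it reopens stdin
    from the terminal before showing the prompt."""

    def interaction(self, *args, **kwargs):
        saved_stdin = sys.stdin
        try:
            # /dev/stdin typically still points at the terminal in the child
            sys.stdin = open("/dev/stdin")
            super().interaction(*args, **kwargs)
        finally:
            sys.stdin = saved_stdin


# usage inside a worker process:
# ForkedPdbSketch().set_trace()
```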
From 1823a00d67f8443f97330056beae57f4659624f7 Mon Sep 17 00:00:00 2001 From: Ming Yang <minos.future@gmail.com> Date: Mon, 8 Sep 2025 22:53:10 -0700 Subject: [PATCH 063/584] [Misc] Support bench serve long context (#24373) Signed-off-by: Ming Yang <minos.future@gmail.com> --- tests/benchmarks/test_serve_cli.py | 31 +++ vllm/benchmarks/lib/endpoint_request_func.py | 212 ++++++++++++------- 2 files changed, 163 insertions(+), 80 deletions(-) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index bfcf274727e27..5471d6b8e4a5f 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -45,3 +45,34 @@ def test_bench_serve(server): print(result.stderr) assert result.returncode == 0, f"Benchmark failed: {result.stderr}" + +@pytest.mark.benchmark +def test_bench_serve_chat(server): + command = [ + "vllm", + "bench", + "serve", + "--model", + MODEL_NAME, + "--host", + server.host, + "--port", + str(server.port), + "--dataset-name", + "random", + "--random-input-len", + "32", + "--random-output-len", + "4", + "--num-prompts", + "5", + "--endpoint", + "/v1/chat/completions", + "--endpoint-type", + "openai-chat", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 6bb2a497119e9..9d67580be26ad 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -17,6 +17,47 @@ from tqdm.asyncio import tqdm AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +class StreamedResponseHandler: + """Handles streaming HTTP responses by accumulating chunks until complete + messages are available.""" + + def __init__(self): + self.buffer = "" + + def add_chunk(self, chunk_bytes: bytes) -> list[str]: + """Add a chunk of bytes to the buffer and return any complete + messages.""" + chunk_str = chunk_bytes.decode("utf-8") + self.buffer += chunk_str + + messages = [] + + # Split by double newlines (SSE message separator) + while "\n\n" in self.buffer: + message, self.buffer = self.buffer.split("\n\n", 1) + message = message.strip() + if message: + messages.append(message) + + # If self.buffer is not empty, check whether it holds a complete + # message by removing the "data: " prefix and checking whether the + # rest is valid JSON + if self.buffer.startswith("data: "): + message_content = self.buffer.removeprefix("data: ").strip() + if message_content == "[DONE]": + messages.append(self.buffer.strip()) + self.buffer = "" + elif message_content: + try: + json.loads(message_content) + messages.append(self.buffer.strip()) + self.buffer = "" + except json.JSONDecodeError: + # Incomplete JSON, wait for more chunks. + pass + + return messages + + @dataclass class RequestFuncInput: """The input for the request function.""" @@ -102,46 +143,50 @@ async def async_request_openai_completions( headers=headers) as response: if response.status == 200: first_chunk_received = False - async for chunk_bytes in response.content: + handler = StreamedResponseHandler() + + async for chunk_bytes in response.content.iter_any(): chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk_bytes = chunk_bytes.decode("utf-8") - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. 
- if chunk_bytes.startswith(":"): - continue - chunk = chunk_bytes.removeprefix("data: ") + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payloads and should + # be skipped. + if message.startswith(":"): + continue - if chunk != "[DONE]": - data = json.loads(chunk) + chunk = message.removeprefix("data: ") - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft + if chunk != "[DONE]": + data = json.loads(chunk) - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # NOTE: Some completion APIs might send a last + # usage summary response without a token, so we + # check that a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -227,41 +272,44 @@ async def async_request_openai_chat_completions( async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: - async for chunk_bytes in response.content: + handler = StreamedResponseHandler() + async for chunk_bytes in response.content.iter_any(): chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk_bytes = chunk_bytes.decode("utf-8") - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payloads and should + # be skipped. 
+ if message.startswith(":"): + continue - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + chunk = message.removeprefix("data: ") - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - most_recent_timestamp = timestamp + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -347,36 +395,40 @@ async def async_request_openai_audio( data=form, headers=headers) as response: if response.status == 200: - async for chunk_bytes in response.content: + handler = StreamedResponseHandler() + + async for chunk_bytes in response.content.iter_any(): chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # add_chunk already decodes, so message is a str here + chunk = message.removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) - if choices := data.get("choices"): - content = choices[0]["delta"].get( - "content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append( - timestamp - most_recent_timestamp) + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True From 46876dff3283cdf7d764bf6133ea3aadcd507b7d Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 9 Sep 2025 08:06:04 +0200 Subject: [PATCH 064/584] [Doc]: fixing typos to improve docs (#24480) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- docs/features/tool_calling.md | 2 +- docs/getting_started/installation/gpu/rocm.inc.md | 2 +- examples/tool_chat_template_phi4_mini.jinja | 4 ++-- tests/engine/test_executor.py | 2 +- tests/entrypoints/offline_mode/test_offline_mode.py | 4 ++-- tests/kernels/utils.py | 2 +- tests/models/language/generation/test_hybrid.py | 4 ++-- tests/tpu/test_quantization_accuracy.py | 2 +- vllm/distributed/parallel_state.py | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/features/tool_calling.md 
b/docs/features/tool_calling.md index afc605a504b3d..5401603832273 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported. The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser. -Other tool calling formats like the built in python tool calling or custom tool calling are not supported. +Other tool calling formats like the built-in python tool calling or custom tool calling are not supported. Known issues: diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 80e99d3034d39..4c70128d0b49a 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. !!! tip - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. diff --git a/examples/tool_chat_template_phi4_mini.jinja b/examples/tool_chat_template_phi4_mini.jinja index 83886762c2893..6f40c38c20644 100644 --- a/examples/tool_chat_template_phi4_mini.jinja +++ b/examples/tool_chat_template_phi4_mini.jinja @@ -9,7 +9,7 @@ <|system|> {{ system_message }} {%- if tools %} -In addition to plain text responses, you can chose to call one or more of the provided functions. +In addition to plain text responses, you can choose to call one or more of the provided functions. Use the following rule to decide when to call a function: * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so @@ -19,7 +19,7 @@ If you decide to call functions: * prefix function calls with functools marker (no closing marker required) * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] * follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples - * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0 + * respect the argument type formatting. 
E.g., if the type is number and format is float, write value 7 as 7.0 * make sure you pick the right functions that match the user intent diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 15c7a97b50e1f..67064aff3ae92 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor): timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict] = None) -> list[Any]: - # Drop marker to show that this was ran + # Drop marker to show that this was run with open(".marker", "w"): ... return super().collective_rpc(method, timeout, args, kwargs) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a154bb1059aae..f8ed5dda260ff 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch): ) # Need to re-import huggingface_hub - # and friends to setup offline mode + # and friends to set up offline mode _re_import_modules() # Cached model files should be used in offline mode for model_config in MODEL_CONFIGS: @@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): disable_connect, ) # Need to re-import huggingface_hub - # and friends to setup offline mode + # and friends to set up offline mode _re_import_modules() engine_args = EngineArgs(model="facebook/opt-125m") LLM(**dataclasses.asdict(engine_args)) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c46db8e307936..c9bf85f6e2a5c 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor, # then we would expand a to: # a = [[1, 1, 2, 2], # [3, 3, 4, 4]] - # NOTE this function this function does not explicitly broadcast dimensions + # NOTE this function does not explicitly broadcast dimensions # with an extent of 1, since this can be done implicitly by pytorch def group_broadcast(t, shape): for i, s in enumerate(shape): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index b44ddc61b6c8c..d0e42062099ec 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -301,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( finished_requests_ids is larger than the maximum mamba block capacity. This could generally happen due to the fact that hybrid does support - statelessness mechanism where it can cleanup new incoming requests in + a statelessness mechanism where it can clean up new incoming requests in a single step. """ try: @@ -322,7 +322,7 @@ def test_state_cleanup( This test is for verifying that the Hybrid state is cleaned up between steps. - If its not cleaned, an error would be expected. + If it's not cleaned, an error would be expected. """ try: with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 6cefbae4bdd18..8d9fbd280317c 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ b/tests/tpu/test_quantization_accuracy.py @@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [ expected_value=0.76), # no bias # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU, # so only one of these tests can run in a single call to pytest. 
As - # a follow up, move this into the LM-EVAL section of the CI. + # a follow-up, move this into the LM-EVAL section of the CI. # GSM8KAccuracyTestConfig( # model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8", # expected_value=0.66), # bias in QKV layers diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 522dfc8d8b5a0..602bcebc017dd 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1117,7 +1117,7 @@ def initialize_model_parallel( "decode context model parallel group is already initialized") # Note(hc): In the current implementation of decode context parallel, # dcp_size must not exceed tp_size, because the world size does not - # change by DCP, it simply reuse the GPUs of TP group, and split one + # change by DCP, it simply reuses the GPUs of TP group, and split one # TP group into tp_size//dcp_size DCP groups. group_ranks = all_ranks.reshape( -1, decode_context_model_parallel_size).unbind(0) From e283976f3a3bacbe10cc22365b149cc11e8c0dff Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:24:11 +0800 Subject: [PATCH 065/584] [Performance][MM] Building the inverse permutation in O(n) time in Qwen2_5_VisionTransformer (#24443) Signed-off-by: Junhong <liujunhong11@huawei.com> Co-authored-by: Junhong <liujunhong11@huawei.com> --- vllm/model_executor/models/qwen2_5_vl.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index afef86fbaa027..a052b2a486f6a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -717,6 +717,15 @@ class Qwen2_5_VisionTransformer(nn.Module): seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() return max_seqlen, seqlens + @staticmethod + def invert_permutation(perm: torch.Tensor) -> torch.Tensor: + # building the inverse permutation in O(n) time + inv = torch.empty_like(perm) + inv[perm] = torch.arange(perm.numel(), + device=perm.device, + dtype=perm.dtype) + return inv + def forward( self, x: torch.Tensor, @@ -760,6 +769,8 @@ class Qwen2_5_VisionTransformer(nn.Module): rotary_pos_emb = torch.cat(rotary_pos_emb) window_index = torch.cat(window_index) + # compute reverse indices + reverse_indices = self.invert_permutation(window_index) cu_window_seqlens = torch.cat(cu_window_seqlens) cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) cu_seqlens = torch.cat(cu_seqlens) @@ -813,7 +824,6 @@ class Qwen2_5_VisionTransformer(nn.Module): # adapter hidden_states = self.merger(hidden_states) - reverse_indices = torch.argsort(window_index) hidden_states = hidden_states[reverse_indices, :] return hidden_states From 45c9cb583564744e25e37b3d4ffd10958788c5b2 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" <yeq@meta.com> Date: Tue, 9 Sep 2025 01:14:45 -0700 Subject: [PATCH 066/584] [Misc] Add claude settings to gitignore (#24492) Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> --- .gitignore | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 465935d488f84..71b1c2387cf7f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* -# triton jit +# triton jit .triton # Byte-compiled / optimized / DLL files @@ -177,6 +177,10 @@ cython_debug/ # VSCode .vscode/ +# Claude +CLAUDE.md +.claude/ + # DS Store .DS_Store @@ -209,4 +213,4 @@ shellcheck*/ 
csrc/moe/marlin_moe_wna16/kernel_* # Ignore ep_kernels_workspace folder -ep_kernels_workspace/ \ No newline at end of file +ep_kernels_workspace/ From ccb97338af73fe878731758040ec93c668d0b1a7 Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Tue, 9 Sep 2025 01:25:44 -0700 Subject: [PATCH 067/584] [Misc] Add Codex settings to gitignore (#24493) Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.me> --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 71b1c2387cf7f..b1df673e83ca8 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,10 @@ cython_debug/ CLAUDE.md .claude/ +# Codex +AGENTS.md +.codex/ + # DS Store .DS_Store From 1116590b16bca58672e63908cb728bbd50b81c6e Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 9 Sep 2025 01:37:48 -0700 Subject: [PATCH 068/584] [gpt-oss] Validate gpt-oss python tool during initialization (#23856) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- vllm/entrypoints/openai/api_server.py | 2 ++ vllm/entrypoints/tool.py | 24 ++++++++++++++++++++++++ vllm/entrypoints/tool_server.py | 5 ++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b6667ebf152e1..c159bcee315f2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1717,6 +1717,8 @@ async def init_app_state( if args.tool_server == "demo": tool_server: Optional[ToolServer] = DemoToolServer() + assert isinstance(tool_server, DemoToolServer) + await tool_server.init_and_validate() elif args.tool_server: tool_server = MCPToolServer() await tool_server.add_tool_server(args.tool_server) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 758789a5e059d..f5f4d7d3b5565 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -4,6 +4,8 @@ import os from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any +from openai_harmony import Author, Message, Role, TextContent + from vllm.logger import init_logger if TYPE_CHECKING: @@ -99,6 +101,28 @@ class HarmonyPythonTool(Tool): return self.python_tool = PythonTool() + + async def validate(self): + if not self.enabled: + return + try: + message = Message( + author=Author(role=Role.ASSISTANT), + content=[TextContent(text="print('Hello, world!')")], + channel="analysis", + recipient="python", + content_type="code", + ) + msgs = [] + async for msg in self.python_tool.process(message): + msgs.append(msg) + assert msgs[0].content[0].text == "Hello, world!\n" + except Exception as e: + self.enabled = False + logger.warning_once( + "Code interpreter tool failed to initialize (%s), code " + "interpreter is disabled", e) + return logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 3f413df7108af..056a571fb2fd1 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -162,10 +162,13 @@ class DemoToolServer(ToolServer): def __init__(self): self.tools: dict[str, Tool] = {} + + async def init_and_validate(self): browser_tool = HarmonyBrowserTool() + python_tool = HarmonyPythonTool() + await python_tool.validate() if browser_tool.enabled: self.tools["browser"] = browser_tool - python_tool = HarmonyPythonTool() if python_tool.enabled: self.tools["python"] = python_tool logger.info("DemoToolServer 
initialized with tools: %s", From 3d2a2de8f75b973927d63b4cab63d9abb1a24722 Mon Sep 17 00:00:00 2001 From: Weixiao Huang <hwx.simle@gmail.com> Date: Tue, 9 Sep 2025 16:57:46 +0800 Subject: [PATCH 069/584] [RL] fast weight update with zmq + ipc handles (#24295) Signed-off-by: huangweixiao <huangweixiao@msh.team> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com> --- examples/offline_inference/rlhf_colocate.py | 95 +++++++++++++++++---- examples/offline_inference/rlhf_utils.py | 90 +++++++++++++++---- 2 files changed, 152 insertions(+), 33 deletions(-) diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 65621023ab6ce..360fd79b55aad 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -28,12 +28,15 @@ Learn more about Ray placement groups: https://docs.ray.io/en/latest/placement-groups.html """ +import gc import os import ray import torch +import zmq from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from torch.multiprocessing.reductions import reduce_tensor from vllm import LLM @@ -86,20 +89,72 @@ class RayTrainingActor: from vllm.platforms import current_platform self.device_uuid = current_platform.get_device_uuid(0) + self.zmq_context = zmq.Context() + self.zmq_address_counter = 0 + self.zmq_handle = None def report_device_id(self) -> str: return self.device_uuid - def get_weight_ipc_handles(self): - from torch.multiprocessing.reductions import reduce_tensor + def get_zmq_handles(self) -> dict[str, str]: + suffix = f"{self.device_uuid}-{self.zmq_address_counter}" + self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock" + self.zmq_address_counter += 1 + return {self.device_uuid: self.zmq_handle} - data = {} - for name, p in self.model.named_parameters(): - # A training actor might hold only a subset of the weights and may - # need to gather weights from other actors. For demonstration - # purposes, each training actor owns the full weight set. 
- data[name] = reduce_tensor(p.detach()) - return {self.device_uuid: data} + def update_weights(self): + # align size to avoid misaligned address + align_size = 256 + + def get_size(p: torch.Tensor) -> int: + return (p.nbytes + align_size - 1) // align_size * align_size + + named_parameters: dict[str, torch.nn.Parameter] = dict( + self.model.named_parameters() + ) + max_tensor_size = max(get_size(p) for p in named_parameters.values()) + # use max_tensor_size * 2 as buffer size + buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0") + s = self.zmq_context.socket(zmq.REQ) + s.bind(self.zmq_handle) + handle = reduce_tensor(buffer) + + offset = 0 + buckets: list[tuple[list[dict], list[torch.Tensor]]] = [] + named_tensors: list[dict] = [] + real_tensors: list[torch.Tensor] = [] + for name, p in named_parameters.items(): + size = get_size(p) + if offset + size > buffer.numel(): + buckets.append((named_tensors, real_tensors)) + named_tensors, real_tensors = [], [] + offset = 0 + # assume tensors are contiguous + named_tensors.append( + {"name": name, "dtype": p.dtype, "shape": p.shape, "offset": offset} + ) + real_tensors.append(p) + offset += size + if named_tensors: + buckets.append((named_tensors, real_tensors)) + s.send_pyobj(handle) + s.recv() + for named_tensors, real_tensors in buckets: + offset = 0 + for p in real_tensors: + buffer[offset : offset + p.nbytes].data.copy_( + p.data.view(-1).view(dtype=torch.uint8), non_blocking=True + ) + offset += get_size(p) + torch.cuda.synchronize() + s.send_pyobj(named_tensors) + s.recv() + s.send_pyobj(None) + s.recv() + s.close() + del buffer + gc.collect() + torch.cuda.empty_cache() # Ray manages four GPUs. @@ -175,18 +230,22 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0] # the second inference engine. 
assert training_actor_device_ids[2:] == inference_engine_device_ids[1] -print("Gather all the IPC handles from the training actors.") -ipc_handles = {} +print("Gather all the ZMQ handles from the training actors.") +zmq_handles = {} for actor in training_actors: - ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote())) + zmq_handles.update(ray.get(actor.get_zmq_handles.remote())) + +print(f"ZMQ handles: {zmq_handles}") print("Update the weights of the inference engines.") -for llm in inference_engines: - ray.get( - llm.collective_rpc.remote( - "update_weights_from_ipc_handles", args=(ipc_handles,) - ) - ) +ray.get( + [actor.update_weights.remote() for actor in training_actors] + + [ + llm.collective_rpc.remote("update_weights_from_ipc", args=(zmq_handles,)) + for llm in inference_engines + ] +) + print("Check if the weights are updated.") for llm in inference_engines: assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple())) diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index d2a8419ffabcd..c0e60b9793407 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +from typing import Callable, Optional, TypedDict + import torch +import zmq def stateless_init_process_group(master_address, master_port, rank, world_size, device): @@ -66,6 +70,27 @@ class WorkerExtension: return weights_updated +def rebuild_ipc( + handle: tuple[Callable, tuple], device_id: Optional[int] = None +) -> torch.Tensor: + func, args = handle + list_args = list(args) + if device_id is not None: + # the key is to change device id to the current device id + # in case two processes have different CUDA_VISIBLE_DEVICES + list_args[6] = device_id + buffer = func(*list_args) + return buffer + + +class FlattenedTensorMetadata(TypedDict): + name: str + shape: torch.Size + dtype: torch.dtype + # specify the start offset of this tensor in shared ipc_buffer tensor + offset: int + + class ColocateWorkerExtension: """ The class for vLLM's worker to inherit from, in the colocate setting. @@ -76,27 +101,62 @@ class ColocateWorkerExtension: should pass the fully qualified name as the `worker_extension_cls` argument. """ + def update_weights_from_ipc(self, zmq_handles: dict[str, str]): + from vllm.model_executor.model_loader.utils import process_weights_after_loading + + assert self.device is not None + if not hasattr(self, "_zmq_ctx") or self._zmq_ctx is None: + self._zmq_ctx = zmq.Context() + socket = self._zmq_ctx.socket(zmq.REP) + socket.connect(zmq_handles[self.report_device_id()]) + buffer: Optional[torch.Tensor] = None + while True: + payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = ( + socket.recv_pyobj() + ) + if payload is None: + # a None payload means the update is done + process_weights_after_loading( + self.model_runner.model, self.model_config, self.device + ) + torch.cuda.synchronize() + socket.send(b"") + break + if isinstance(payload, tuple): + # an IPC handle; vLLM rebuilds the GPU tensor via + # `func, args = handle` followed by `func(*args)`. 
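+                # the rebuilt tensor aliases the trainer's CUDA staging
+                # buffer; the metadata messages that follow carry offsets
+                # into this shared buffer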
+ buffer = rebuild_ipc(payload, self.device.index) + assert buffer.dtype == torch.uint8 + socket.send(b"") + continue + assert isinstance(payload, list) + assert buffer is not None + weights = [] + for item in payload: + shape = item["shape"] + if isinstance(shape, (list, tuple)): + shape = torch.Size(shape) + assert isinstance(shape, torch.Size) + dtype, offset = item["dtype"], item["offset"] + size = dtype.itemsize * shape.numel() + tensor = buffer[offset : offset + size].view(dtype=dtype).view(shape) + weights.append((item["name"], tensor)) + self.model_runner.model.load_weights(weights=weights) + del weights + torch.cuda.synchronize() + socket.send(b"") + + socket.close() + del buffer + gc.collect() + torch.cuda.empty_cache() + def report_device_id(self) -> str: from vllm.platforms import current_platform self.device_uuid = current_platform.get_device_uuid(self.device.index) return self.device_uuid - def update_weights_from_ipc_handles(self, ipc_handles): - handles = ipc_handles[self.device_uuid] - device_id = self.device.index - weights = [] - for name, handle in handles.items(): - func, args = handle - list_args = list(args) - # the key is to change device id to the current device id - # in case two processes have different CUDA_VISIBLE_DEVICES - list_args[6] = device_id - tensor = func(*list_args) - weights.append((name, tensor)) - self.model_runner.model.load_weights(weights=weights) - torch.cuda.synchronize() - def check_weights_changed(self): """ Check if the weights are updated to 0. From 6fb27881634d89c2e70e9e5fbad1b918c0d916cf Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" <yeq@meta.com> Date: Tue, 9 Sep 2025 03:02:35 -0700 Subject: [PATCH 070/584] [CI/Build][Doc] Fully deprecate old bench scripts for serving / throughput / latency (#24411) Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> --- benchmarks/README.md | 4 +- benchmarks/benchmark_latency.py | 200 +---- benchmarks/benchmark_serving.py | 1333 +--------------------------- benchmarks/benchmark_throughput.py | 750 +--------------- 4 files changed, 41 insertions(+), 2246 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 98b3600d13635..957c2f988051d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -694,7 +694,7 @@ python -m vllm.entrypoints.openai.api_server \ Send requests with images: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --dataset-name sharegpt \ @@ -721,7 +721,7 @@ python -m vllm.entrypoints.openai.api_server \ Send requests with videos: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --dataset-name sharegpt \ diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d8b960edaa468..a7892f3f71243 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,191 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark the latency of processing a single batch of requests.""" - -import argparse -import dataclasses -import json -import os -import time -from typing import Any, Optional - -import numpy as np -from tqdm import tqdm -from typing_extensions import deprecated - -import vllm.envs as envs -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs 
import PromptType -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, - ) - if pt_records: - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -@deprecated( - "benchmark_latency.py is deprecated and will be removed in a " - "future version. Please use 'vllm bench latency' instead.", -) -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + args.output_len - ), ( - "Please ensure that max_model_len is greater than" - " the sum of input_len and output_len." - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def llm_generate(): - if not args.use_beam_search: - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - else: - llm.beam_search( - dummy_prompts, - BeamSearchParams( - beam_width=args.n, - max_tokens=args.output_len, - ignore_eos=True, - ), - ) - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - llm.start_profile() - llm_generate() - llm.stop_profile() - else: - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." 
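
For context, the percentile report this removed script produced reduces to a few NumPy calls (`vllm bench latency` prints the same statistics); a sketch with invented latencies:

```python
import numpy as np

latencies = np.array([0.51, 0.55, 0.49, 0.62, 0.53])  # seconds, made up
percentages = [10, 25, 50, 75, 90, 99]
print(f"Avg latency: {latencies.mean():.3f} seconds")
for p, v in zip(percentages, np.percentile(latencies, percentages)):
    print(f"{p}% percentile latency: {v:.3f} seconds")
```
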
- ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", type=int, default=30, help="Number of iterations to run." - ) - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)" - ), - ) - - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. We need to disable prefix caching by default. - parser.set_defaults(enable_prefix_caching=False) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. + +Please use the following command instead: + vllm bench latency + +For help with the new command, run: + vllm bench latency --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench latency --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 934df05efac17..76cf51498020b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,1324 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -r"""Benchmark online serving throughput. - -On the server side, run one of the following commands: - vLLM OpenAI API server - vllm serve <your_model> \ - --swap-space 16 - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend <backend> \ - --model <your_model> \ - --dataset-name sharegpt \ - --dataset-path <path to dataset> \ - --request-rate <request_rate> \ # By default <request_rate> is inf - --num-prompts <num_prompts> # By default <num_prompts> is 1000 - - when using tgi backend, add - --endpoint /generate_stream - to the end of the command above. 
-""" - -import argparse -import asyncio -import gc -import json -import os -import random -import time -import warnings -from collections.abc import Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Literal, Optional - -import numpy as np -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase -from typing_extensions import deprecated - -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput, -) - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import ( - AIMODataset, - ASRDataset, - BurstGPTDataset, - ConversationDataset, - CustomDataset, - HuggingFaceDataset, - InstructCoderDataset, - MTBenchDataset, - NextEditPredictionDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.benchmarks.serve import get_request - -MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. 
- mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - -def calculate_metrics( - input_requests: list[SampleRequest], - outputs: list[RequestFuncOutput], - dur_s: float, - tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - goodput_config_dict: dict[str, float], -) -> tuple[BenchmarkMetrics, list[int]]: - actual_output_lens: list[int] = [] - total_input = 0 - completed = 0 - good_completed = 0 - itls: list[float] = [] - tpots: list[float] = [] - all_tpots: list[float] = [] - ttfts: list[float] = [] - e2els: list[float] = [] - for i in range(len(outputs)): - if outputs[i].success: - output_len = outputs[i].output_tokens - - if not output_len: - # We use the tokenizer to count the number of output tokens - # for some serving backends instead of looking at - # len(outputs[i].itl) since multiple output tokens may be - # bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer( - outputs[i].generated_text, add_special_tokens=False - ).input_ids - ) - actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len - tpot = 0 - if output_len > 1: - latency_minus_ttft = outputs[i].latency - outputs[i].ttft - tpot = latency_minus_ttft / (output_len - 1) - tpots.append(tpot) - # Note: if output_len <= 1, we regard tpot as 0 for goodput - all_tpots.append(tpot) - itls += outputs[i].itl - ttfts.append(outputs[i].ttft) - e2els.append(outputs[i].latency) - completed += 1 - else: - actual_output_lens.append(0) - - if goodput_config_dict: - valid_metrics = [] - slo_values = [] - - if "ttft" in goodput_config_dict: - valid_metrics.append(ttfts) - slo_values.append( - goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "tpot" in goodput_config_dict: - valid_metrics.append(all_tpots) - slo_values.append( - goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "e2el" in goodput_config_dict: - valid_metrics.append(e2els) - slo_values.append( - goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - - for req_metric in zip(*valid_metrics): - is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) - if is_good_req: - good_completed += 1 - - if completed == 0: - warnings.warn( - "All requests failed. 
This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2, - ) - metrics = BenchmarkMetrics( - completed=completed, - total_input=total_input, - total_output=sum(actual_output_lens), - request_throughput=completed / dur_s, - request_goodput=good_completed / dur_s, - output_throughput=sum(actual_output_lens) / dur_s, - total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) - * 1000, # ttfts is empty if streaming is not supported by backend - std_ttft_ms=np.std(ttfts or 0) * 1000, - median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[ - (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles - ], - mean_tpot_ms=np.mean(tpots or 0) * 1000, - std_tpot_ms=np.std(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[ - (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles - ], - mean_itl_ms=np.mean(itls or 0) * 1000, - std_itl_ms=np.std(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[ - (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles - ], - mean_e2el_ms=np.mean(e2els or 0) * 1000, - std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles - ], - ) - - return metrics, actual_output_lens - - -async def benchmark( - backend: str, - api_url: str, - base_url: str, - model_id: str, - model_name: str, - tokenizer: PreTrainedTokenizerBase, - input_requests: list[SampleRequest], - logprobs: Optional[int], - request_rate: float, - burstiness: float, - disable_tqdm: bool, - profile: bool, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - ignore_eos: bool, - goodput_config_dict: dict[str, float], - max_concurrency: Optional[int], - lora_modules: Optional[Iterable[str]], - extra_body: Optional[dict], - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -): - if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[backend] - else: - raise ValueError(f"Unknown backend: {backend}") - - print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, - input_requests[0].expected_output_len, - input_requests[0].multi_modal_data, - ) - - assert ( - test_mm_content is None - or isinstance(test_mm_content, dict) - or ( - isinstance(test_mm_content, list) - and all(isinstance(item, dict) for item in test_mm_content) - ) - ), "multi_modal_data must be a dict or list[dict]" - test_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=api_url, - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - - test_output = await request_func(request_func_input=test_input) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}" - ) - else: - print("Initial test run completed. Starting main benchmark run...") - - if lora_modules: - # For each input request, choose a LoRA module at random. 
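
The request loop further down caps in-flight requests with an optional semaphore (`limited_request_func`); a runnable sketch of that pattern with a stubbed request function:

```python
import asyncio

async def limited(semaphore, coro):
    if semaphore is None:        # no --max-concurrency: no gating
        return await coro
    async with semaphore:        # otherwise at most N in flight
        return await coro

async def main():
    sem = asyncio.Semaphore(2)   # stand-in for --max-concurrency 2

    async def fake_request(i):   # stand-in for the backend request function
        await asyncio.sleep(0.01)
        return i

    results = await asyncio.gather(
        *[limited(sem, fake_request(i)) for i in range(5)]
    )
    print(results)

asyncio.run(main())
```
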
- lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))] - ) - - if profile: - print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler started") - - distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - - if ramp_up_strategy is not None: - print( - f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase " - f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over " - "the duration of the benchmark." - ) - else: - print(f"Traffic request rate: {request_rate} RPS.") - - print(f"Burstiness factor: {burstiness} ({distribution})") - print(f"Maximum request concurrency: {max_concurrency}") - - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None - - async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) - async with semaphore: - return await request_func(request_func_input=request_func_input, pbar=pbar) - - benchmark_start_time = time.perf_counter() - tasks: list[asyncio.Task] = [] - - rps_change_events = [] - last_int_rps = -1 - if ramp_up_strategy is not None and ramp_up_start_rps is not None: - last_int_rps = ramp_up_start_rps - rps_change_events.append( - { - "rps": last_int_rps, - "timestamp": datetime.now().isoformat(), - } - ) - - async for request, current_request_rate in get_request( - input_requests, - request_rate, - burstiness, - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - ): - if ramp_up_strategy is not None: - current_int_rps = int(current_request_rate) - if current_int_rps > last_int_rps: - timestamp = datetime.now().isoformat() - for rps_val in range(last_int_rps + 1, current_int_rps + 1): - rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) - last_int_rps = current_int_rps - - prompt, prompt_len, output_len, mm_content, request_id = ( - request.prompt, - request.prompt_len, - request.expected_output_len, - request.multi_modal_data, - request.request_id, - ) - req_model_id, req_model_name = model_id, model_name - if lora_modules: - req_lora_module = next(lora_modules) - req_model_id, req_model_name = req_lora_module, req_lora_module - - request_func_input = RequestFuncInput( - model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - request_id=request_id, - ) - task = limited_request_func(request_func_input=request_func_input, pbar=pbar) - tasks.append(asyncio.create_task(task)) - outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - - if pbar is not None: - pbar.close() - - benchmark_duration = time.perf_counter() - benchmark_start_time - - metrics, 
actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentile_metrics=selected_percentile_metrics, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - if max_concurrency is not None: - print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) - if request_rate != float("inf"): - print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } - - if rps_change_events: - result["rps_change_events"] = rps_change_events - - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. - if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), - ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), - ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value - - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") - - print("=" * 50) - - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - - return result - - -def check_goodput_args(args): - # Check and parse goodput arguments - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. " - ) - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative." - ) - return goodput_config_dict - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - 'Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds." - ) from err - return goodput_config_dict - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any], file_name: str -) -> None: - metrics = [ - "median_ttft_ms", - "mean_ttft_ms", - "std_ttft_ms", - "p99_ttft_ms", - "mean_tpot_ms", - "median_tpot_ms", - "std_tpot_ms", - "p99_tpot_ms", - "median_itl_ms", - "mean_itl_ms", - "std_itl_ms", - "p99_itl_ms", - ] - # These raw data might be useful, but they are rather big. They can be added - # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={k: [results[k]] for k in metrics}, - extra_info={ - k: results[k] - for k in results - if k not in metrics and k not in ignored_metrics - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -@deprecated( - "benchmark_serving.py is deprecated and will be removed in a future " - "version. Please use 'vllm bench serve' instead.", -) -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - backend = args.backend - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode - - # Validate ramp-up arguments - if args.ramp_up_strategy is not None: - if args.request_rate != float("inf"): - raise ValueError( - "When using ramp-up, do not specify --request-rate. " - "The request rate will be controlled by ramp-up parameters. " - "Please remove the --request-rate argument." 
- ) - if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: - raise ValueError( - "When using --ramp-up-strategy, both --ramp-up-start-rps and " - "--ramp-up-end-rps must be specified" - ) - if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: - raise ValueError("Ramp-up start and end RPS must be non-negative") - if args.ramp_up_start_rps > args.ramp_up_end_rps: - raise ValueError("Ramp-up start RPS must be less than end RPS") - if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: - raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") - - if args.base_url is not None: - api_url = f"{args.base_url}{args.endpoint}" - base_url = f"{args.base_url}" - else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) - - if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required." - ) - - if args.dataset_name == "custom": - dataset = CustomDataset(dataset_path=args.dataset_path) - input_requests = dataset.sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.custom_output_len, - skip_chat_template=args.custom_skip_chat_template, - request_id_prefix=args.request_id_prefix, - ) - - elif args.dataset_name == "sonnet": - dataset = SonnetDataset(dataset_path=args.dataset_path) - # For the "sonnet" dataset, formatting depends on the backend. - if args.backend == "openai-chat": - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False, - request_id_prefix=args.request_id_prefix, - ) - else: - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." 
- ) - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True, - request_id_prefix=args.request_id_prefix, - ) - - elif args.dataset_name == "hf": - # all following datasets are implemented from the - # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_class = VisionArenaDataset - args.hf_split = "train" - args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_class = InstructCoderDataset - args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: - dataset_class = MTBenchDataset - args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ConversationDataset - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_class = AIMODataset - args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 - dataset_class = NextEditPredictionDataset - args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ASRDataset - args.hf_split = "train" - else: - supported_datasets = set( - [ - dataset_name - for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ] - ) - raise ValueError( - f"Unsupported dataset path: {args.dataset_path}. " - "Huggingface dataset only supports dataset_path" - f" from one of following: {supported_datasets}. " - "Please consider contributing if you would " - "like to add support for additional dataset formats." - ) - - if dataset_class.IS_MULTIMODAL and backend not in [ - "openai-chat", - "openai-audio", - ]: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' backend." - ) - input_requests = dataset_class( - dataset_path=args.dataset_path, - dataset_subset=args.hf_subset, - dataset_split=args.hf_split, - random_seed=args.seed, - no_stream=args.no_stream, - ).sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.hf_output_len, - request_id_prefix=args.request_id_prefix, - ) - - else: - # For datasets that follow a similar structure, use a mapping. - dataset_mapping = { - "sharegpt": lambda: ShareGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - request_id_prefix=args.request_id_prefix, - ), - "burstgpt": lambda: BurstGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - request_id_prefix=args.request_id_prefix, - ), - "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - request_id_prefix=args.request_id_prefix, - ), - } - - try: - input_requests = dataset_mapping[args.dataset_name]() - except KeyError as err: - raise ValueError(f"Unknown dataset: {args.dataset_name}") from err - goodput_config_dict = check_goodput_args(args) - - # Collect the sampling parameters. 
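
The HF branch above discovers supported dataset paths by walking `HuggingFaceDataset.__subclasses__()` and unioning their `SUPPORTED_DATASET_PATHS` registries; a toy sketch (all class and path names here are stand-ins, not the real registry):

```python
class HuggingFaceDataset:
    SUPPORTED_DATASET_PATHS: set = set()

class VisionArena(HuggingFaceDataset):
    SUPPORTED_DATASET_PATHS = {"example-org/vision-arena"}

class MTBench(HuggingFaceDataset):
    SUPPORTED_DATASET_PATHS = {"example-org/mt-bench"}

supported = {
    path
    for cls in HuggingFaceDataset.__subclasses__()
    for path in cls.SUPPORTED_DATASET_PATHS
}
print(supported)  # contains both illustrative paths
```
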
- sampling_params = { - k: v - for k, v in { - "top_p": args.top_p, - "top_k": args.top_k, - "min_p": args.min_p, - "temperature": args.temperature, - }.items() - if v is not None - } - - # Sampling parameters are only supported by openai-compatible backend. - if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible backends." - ) - - if "temperature" not in sampling_params: - sampling_params["temperature"] = 0.0 # Default to greedy decoding. - - if args.backend == "llama.cpp": - # Disable prompt caching in llama.cpp backend - sampling_params["cache_prompt"] = False - - # Avoid GC processing "static" data - reduce pause times. - gc.collect() - gc.freeze() - - benchmark_result = asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, - selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ramp_up_strategy=args.ramp_up_strategy, - ramp_up_start_rps=args.ramp_up_start_rps, - ramp_up_end_rps=args.ramp_up_end_rps, - ) - ) - - # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} - - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["backend"] = backend - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts - - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." 
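
The `--metadata` handling is a simple KEY=VALUE split; a sketch (this version uses `maxsplit=1` so values may contain "=", slightly more lenient than the removed code):

```python
def parse_metadata(items):
    out = {}
    for item in items:
        if "=" not in item:
            raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
        key, value = item.split("=", 1)
        out[key.strip()] = value.strip()
    return out

print(parse_metadata(["version=0.3.3", "tp=1"]))  # {'version': '0.3.3', 'tp': '1'}
```
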
- ) - # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf" - ) - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - if args.ramp_up_strategy is not None: - result_json["ramp_up_strategy"] = args.ramp_up_strategy - result_json["ramp_up_start_rps"] = args.ramp_up_start_rps - result_json["ramp_up_end_rps"] = args.ramp_up_end_rps - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] - - # Save to file - base_model_id = model_id.split("/")[-1] - max_concurrency_str = ( - f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None - else "" - ) - if args.ramp_up_strategy is not None: - file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - else: - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - if args.result_filename: - file_name = args.result_filename - if args.result_dir: - os.makedirs(args.result_dir, exist_ok=True) - file_name = os.path.join(args.result_dir, file_name) - with open( - file_name, mode="a+" if args.append_result else "w", encoding="utf-8" - ) as outfile: - # Append a newline. - if args.append_result and outfile.tell() != 0: - outfile.write("\n") - json.dump(result_json, outfile) - save_to_pytorch_benchmark_format(args, result_json, file_name) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput." - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/completions", - help="API endpoint.", - ) - parser.add_argument( - "--dataset-name", - type=str, - default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], - help="Name of the dataset to benchmark on.", - ) - parser.add_argument( - "--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. 
This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--logprobs", - type=int, - default=None, - help=( - "Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed" - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--save-result", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--save-detailed", - action="store_true", - help="When saving the results, whether to include per request " - "information such as response, error, ttfs, tpots, etc.", - ) - parser.add_argument( - "--append-result", - action="store_true", - help="Append the benchmark result to the existing json file.", - ) - parser.add_argument( - "--metadata", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " - "for metadata of this run to be saved in the result JSON file " - "for record keeping purposes.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." 
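
The `--request-rate`/`--burstiness` help above describes gamma-distributed inter-arrival times whose mean stays at 1/request_rate, with burstiness 1.0 degenerating to a Poisson process. The actual arrival sampling lives in `vllm.benchmarks.serve.get_request` (imported earlier); the shape/scale choice below is an assumption consistent with that help text:

```python
import numpy as np

request_rate, burstiness = 10.0, 1.0            # 10 RPS, Poisson process
theta = 1.0 / (request_rate * burstiness)       # assumed scale parameter
# Gamma mean = shape * scale = 1 / request_rate regardless of burstiness;
# shape < 1 clusters arrivals (bursty), shape > 1 spreads them out.
intervals = np.random.gamma(shape=burstiness, scale=theta, size=1000)
print(intervals.mean())                          # ~0.1 s between requests
```
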
- "If not specified, results will be saved in " - "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentiles. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' - 'Default value is "ttft,tpot,itl".', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99". ' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - parser.add_argument( - "--request-id-prefix", - type=str, - required=False, - default="benchmark-serving", - help="Specify the prefix of request id.", - ) - - # group for dataset specific arguments - custom_group = parser.add_argument_group("custom dataset options") - custom_group.add_argument( - "--custom-output-len", - type=int, - default=256, - help="Number of output tokens per request, used only for custom dataset.", - ) - custom_group.add_argument( - "--custom-skip-chat-template", - action="store_true", - help="Skip applying chat template to prompt, used only for custom dataset.", - ) - - sonnet_group = parser.add_argument_group("sonnet dataset options") - sonnet_group.add_argument( - "--sonnet-input-len", - type=int, - default=550, - help="Number of input tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-output-len", - type=int, - default=150, - help="Number of output tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-prefix-len", - type=int, - default=200, - help="Number of prefix tokens per request, used only for sonnet dataset.", - ) - - sharegpt_group = parser.add_argument_group("sharegpt dataset options") - sharegpt_group.add_argument( - "--sharegpt-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.", - ) - - random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( - "--random-input-len", - type=int, - default=1024, - help="Number of input tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for random sampling. 
Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - random_group.add_argument( - "--random-prefix-len", - type=int, - default=0, - help=( - "Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]." - ), - ) - - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - sampling_group = parser.add_argument_group("sampling parameters") - sampling_group.add_argument( - "--top-p", - type=float, - default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--top-k", - type=int, - default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--min-p", - type=float, - default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--temperature", - type=float, - default=None, - help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", - ) - - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) - - parser.add_argument( - "--lora-modules", - nargs="+", - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.", - ) - - parser.add_argument( - "--ramp-up-strategy", - type=str, - default=None, - choices=["linear", "exponential"], - help="The ramp-up strategy. This would be used to " - "ramp up the request rate from initial RPS to final " - "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). " - "over the duration of the benchmark.", - ) - parser.add_argument( - "--ramp-up-start-rps", - type=int, - default=None, - help="The starting request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) - parser.add_argument( - "--ramp-up-end-rps", - type=int, - default=None, - help="The ending request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. 
+ +Please use the following command instead: + vllm bench serve + +For help with the new command, run: + vllm bench serve --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench serve --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 34a525f00d910..b6dc0918fd4d1 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,741 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline inference throughput.""" - -import argparse -import dataclasses -import json -import os -import random -import time -import warnings -from typing import Any, Optional, Union - -import torch -import uvloop -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase -from typing_extensions import deprecated - -from benchmark_dataset import ( - AIMODataset, - BurstGPTDataset, - ConversationDataset, - InstructCoderDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args, -) -from vllm.inputs import TextPrompt, TokensPrompt -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -def run_vllm( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - # Add the requests to the engine. - prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests: Optional[list[LoRARequest]] = None - if engine_args.enable_lora: - lora_requests = [request.lora_request for request in requests] - - use_beam_search = False - - outputs = None - if not use_beam_search: - start = time.perf_counter() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - # output_len should be the same for all requests. 
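
`run_vllm` below wraps each request as a `TokensPrompt` when pre-tokenized IDs are present and a `TextPrompt` otherwise; both are TypedDict-style inputs, so plain dicts can illustrate the shape (a sketch, not vLLM's actual classes):

```python
def to_prompt(request_prompt, multi_modal_data=None):
    # Token-id prompts carry "prompt_token_ids"; everything else is text.
    if isinstance(request_prompt, dict) and "prompt_token_ids" in request_prompt:
        return {"prompt_token_ids": request_prompt["prompt_token_ids"],
                "multi_modal_data": multi_modal_data}
    return {"prompt": request_prompt, "multi_modal_data": multi_modal_data}

print(to_prompt({"prompt_token_ids": [1, 2, 3]}))
print(to_prompt("Hello, world"))
```
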
- output_len = requests[0].expected_output_len - for request in requests: - assert request.expected_output_len == output_len - start = time.perf_counter() - llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - ), - ) - end = time.perf_counter() - return end - start, outputs - - -def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, list[RequestOutput]]: - """ - Run vLLM chat benchmark. This function is recommended ONLY for benchmarking - multimodal models as it properly handles multimodal inputs and chat - formatting. For non-multimodal models, use run_vllm() instead. - """ - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests." - ) - - prompts = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - start = time.perf_counter() - outputs = llm.chat(prompts, sampling_params, use_tqdm=True) - end = time.perf_counter() - return end - start, outputs - - -async def run_vllm_async( - requests: list[SampleRequest], - n: int, - engine_args: AsyncEngineArgs, - disable_frontend_multiprocessing: bool = False, - disable_detokenize: bool = False, -) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, - disable_frontend_multiprocessing=disable_frontend_multiprocessing, - ) as llm: - model_config = await llm.get_model_config() - assert all( - model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - - # Add the requests to the engine. 
- prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - lora_requests: list[Optional[LoRARequest]] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests.append(request.lora_request) - - generators = [] - start = time.perf_counter() - for i, (prompt, sp, lr) in enumerate( - zip(prompts, sampling_params, lora_requests) - ): - generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - end = time.perf_counter() - return end - start - - -def run_hf( - requests: list[SampleRequest], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - max_batch_size: int, - trust_remote_code: bool, - disable_detokenize: bool = False, -) -> float: - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code - ) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. - tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: list[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt = requests[i].prompt - prompt_len = requests[i].prompt_len - output_len = requests[i].expected_output_len - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - next_prompt_len = requests[i + 1].prompt_len - next_output_len = requests[i + 1].expected_output_len - if ( - max(max_prompt_len, next_prompt_len) - + max(max_output_len, next_output_len) - ) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=True, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - if not disable_detokenize: - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. 
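
The HF batching heuristic above admits the next request only while the padded batch (longest prompt plus longest output) still fits a 2048-token budget; two worked checks:

```python
def fits(max_prompt_len, max_output_len, next_prompt_len, next_output_len):
    # Same condition as the run_hf loop: padded prompt + padded output <= 2048.
    return (max(max_prompt_len, next_prompt_len)
            + max(max_output_len, next_output_len)) <= 2048

print(fits(512, 128, 900, 128))    # True: padded batch is 900 + 128 = 1028
print(fits(1500, 600, 1400, 600))  # False: 1500 + 600 = 2100 > 2048
```
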
- batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def run_mii( - requests: list[SampleRequest], - model: str, - tensor_parallel_size: int, - output_len: int, -) -> float: - from mii import client, serve - - llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [request.prompt for request in requests] - - start = time.perf_counter() - llm.generate(prompts, max_new_tokens=output_len) - end = time.perf_counter() - client = client(model) - client.terminate_server() - return end - start - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": [results["requests_per_second"]], - "tokens_per_second": [results["tokens_per_second"]], - }, - extra_info={ - k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def get_requests(args, tokenizer): - # Common parameters for all dataset types. - common_kwargs = { - "dataset_path": args.dataset_path, - "random_seed": args.seed, - } - sample_kwargs = { - "tokenizer": tokenizer, - "lora_path": args.lora_path, - "max_loras": args.max_loras, - "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - } - - if args.dataset_path is None or args.dataset_name == "random": - sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len - dataset_cls = RandomDataset - elif args.dataset_name == "sharegpt": - dataset_cls = ShareGPTDataset - if args.backend == "vllm-chat": - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_name == "sonnet": - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." - ) - dataset_cls = SonnetDataset - sample_kwargs["prefix_len"] = args.prefix_len - sample_kwargs["return_prompt_formatted"] = True - elif args.dataset_name == "burstgpt": - dataset_cls = BurstGPTDataset - elif args.dataset_name == "hf": - common_kwargs["no_stream"] = args.no_stream - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = VisionArenaDataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = InstructCoderDataset - common_kwargs["dataset_split"] = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = ConversationDataset - common_kwargs["dataset_subset"] = args.hf_subset - common_kwargs["dataset_split"] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_cls = AIMODataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - else: - raise ValueError(f"Unknown dataset name: {args.dataset_name}") - # Remove None values - sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} - return dataset_cls(**common_kwargs).sample(**sample_kwargs) - - -@deprecated( - "benchmark_throughput.py is deprecated and will be removed in a " - "future version. 
Please use 'vllm bench throughput' instead.", -) -def main(args: argparse.Namespace): - if args.seed is None: - args.seed = 0 - print(args) - random.seed(args.seed) - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None for request in requests) - request_outputs: Optional[list[RequestOutput]] = None - if args.backend == "vllm": - if args.async_engine: - elapsed_time = uvloop.run( - run_vllm_async( - requests, - args.n, - AsyncEngineArgs.from_cli_args(args), - args.disable_frontend_multiprocessing, - args.disable_detokenize, - ) - ) - else: - elapsed_time, request_outputs = run_vllm( - requests, - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - elapsed_time = run_hf( - requests, - args.model, - tokenizer, - args.n, - args.hf_max_batch_size, - args.trust_remote_code, - args.disable_detokenize, - ) - elif args.backend == "mii": - elapsed_time = run_mii( - requests, args.model, args.tensor_parallel_size, args.output_len - ) - elif args.backend == "vllm-chat": - elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize - ) - else: - raise ValueError(f"Unknown backend: {args.backend}") - - if request_outputs: - # Note: with the vllm and vllm-chat backends, - # we have request_outputs, which we use to count tokens. - total_prompt_tokens = 0 - total_output_tokens = 0 - for ro in request_outputs: - if not isinstance(ro, RequestOutput): - continue - total_prompt_tokens += ( - len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 - ) - total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) - total_num_tokens = total_prompt_tokens + total_output_tokens - else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) - total_output_tokens = sum(r.expected_output_len for r in requests) - total_prompt_tokens = total_num_tokens - total_output_tokens - - if is_multi_modal and args.backend != "vllm-chat": - print( - "\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details." - ) - # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. - # vllm-chat backend counts the image tokens now - - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s" - ) - print(f"Total num prompt tokens: {total_prompt_tokens}") - print(f"Total num output tokens: {total_output_tokens}") - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def validate_args(args): - """ - Validate command-line arguments. - """ - - # === Deprecation and Defaulting === - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next release. 
" - "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2, - ) - args.dataset_path = args.dataset - - if not getattr(args, "tokenizer", None): - args.tokenizer = args.model - - # === Backend Validation === - valid_backends = {"vllm", "hf", "mii", "vllm-chat"} - if args.backend not in valid_backends: - raise ValueError(f"Unsupported backend: {args.backend}") - - # === Dataset Configuration === - if not args.dataset and not args.dataset_path: - print("When dataset path is not set, it will default to random dataset") - args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") - - # === Dataset Name Specific Checks === - # --hf-subset and --hf-split: only used - # when dataset_name is 'hf' - if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None - ): - warnings.warn( - "--hf-subset and --hf-split will be ignored \ - since --dataset-name is not 'hf'.", - stacklevel=2, - ) - elif args.dataset_name == "hf": - if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm-chat", ( - f"{args.dataset_path} needs to use vllm-chat as the backend." - ) # noqa: E501 - elif args.dataset_path in ( - InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm", ( - f"{args.dataset_path} needs to use vllm as the backend." - ) # noqa: E501 - else: - raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: - warnings.warn( - "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", - stacklevel=2, - ) - - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. - if ( - args.dataset_name not in {"random", "sonnet", None} - and args.prefix_len is not None - ): - warnings.warn( - "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", - stacklevel=2, - ) - - # === LoRA Settings === - if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError("LoRA benchmarking is only supported for vLLM backend") - if getattr(args, "enable_lora", False) and args.lora_path is None: - raise ValueError("LoRA path must be provided when enable_lora is True") - - # === Backend-specific Validations === - if args.backend == "hf" and args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend") - if args.backend != "hf" and args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - - if ( - args.backend in {"hf", "mii"} - and getattr(args, "quantization", None) is not None - ): - raise ValueError("Quantization is only for vLLM backend.") - - if args.backend == "mii" and args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.backend == "mii" and args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII backend.") - - # --data-parallel is not supported currently. 
- # https://github.com/vllm-project/vllm/issues/16222 - if args.data_parallel_size > 1: - raise ValueError( - "Data parallel is not supported in offline benchmark, " - "please use benchmark serving instead" - ) - - -def create_argument_parser(): - parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument( - "--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm", - ) - parser.add_argument( - "--dataset-name", - type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], - help="Name of the dataset to benchmark on.", - default="sharegpt", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--dataset", - type=str, - default=None, - help="Path to the ShareGPT dataset, will be deprecated in\ - the next release. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: <prompt_or_response>]]]]", - ) - parser.add_argument( - "--dataset-path", type=str, default=None, help="Path to the dataset" - ) - parser.add_argument( - "--input-len", - type=int, - default=None, - help="Input prompt length for each request", - ) - parser.add_argument( - "--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.", - ) - parser.add_argument( - "--n", type=int, default=1, help="Number of generated sequences per prompt." - ) - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) - parser.add_argument( - "--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the throughput results in JSON format.", - ) - parser.add_argument( - "--async-engine", - action="store_true", - default=False, - help="Use vLLM async engine rather than LLM class.", - ) - parser.add_argument( - "--disable-frontend-multiprocessing", - action="store_true", - default=False, - help="Disable decoupled async engine frontend.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)" - ), - ) - # LoRA - parser.add_argument( - "--lora-path", - type=str, - default=None, - help="Path to the LoRA adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.", - ) - parser.add_argument( - "--prefix-len", - type=int, - default=None, - help=f"Number of prefix tokens to be used in RandomDataset " - "and SonnetDataset. For RandomDataset, the total input " - "length is the sum of prefix-len (default: " - f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " - "sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]. For SonnetDataset, " - f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " - "controls how much of the input is fixed lines versus " - "random lines, but the total input length remains approximately " - "input_len tokens.", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=None, - help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " - "for sampling input/output length, " - "used only for RandomDataset. 
Must be in the range [0, 1) to " - "define a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - - # hf dataset - parser.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - parser.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - - parser = AsyncEngineArgs.add_cli_args(parser) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - validate_args(args) - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. + +Please use the following command instead: + vllm bench throughput + +For help with the new command, run: + vllm bench throughput --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench throughput --help +""") + sys.exit(1) From a55cf41a0971f606413194a03dbe8b091104c309 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 9 Sep 2025 10:21:10 -0400 Subject: [PATCH 071/584] [Compilation][WideEP] Enable Piecewise CUDAGraph for DeepEPHT (#24123) --- vllm/config/compilation.py | 15 ++++++++++++++- vllm/platforms/cuda.py | 16 +++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 677fb069bc07a..09600e96a1c62 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -546,7 +546,8 @@ class CompilationConfig: # full cudagraph outside the fx graph. This reduces some cpu # overhead when the runtime batch_size is not cudagraph captured. # see https://github.com/vllm-project/vllm/pull/20059 for details. - self.splitting_ops = self._attention_ops + # make a copy to avoid mutating the class-level list via reference. + self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: logger.warning_once("Using piecewise compilation with empty " "splitting_ops.") @@ -561,6 +562,18 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": + # exclude MoE dispatch/combine from capture by ensuring + # piecewise splitting includes them, so communication remains + # outside CUDA graphs while compute can still be graphed. + moe_ops = [ + "vllm.moe_forward", + "vllm.moe_forward_shared", + ] + for op in moe_ops: + if op not in self.splitting_ops: + self.splitting_ops.append(op) + def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 1b0a298352cbf..fc1a399d6f43f 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -183,16 +183,14 @@ class CudaPlatformBase(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 - and compilation_config.cudagraph_mode != CUDAGraphMode.NONE): + and compilation_config.cudagraph_mode + not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]): logger.info( - "Data Parallel: disabling cudagraphs since DP " - "with DeepEP high-throughput kernels are not CUDA Graph " - "compatible. The DeepEP low-latency kernels are CUDA Graph " - "compatible. 
Set the all_to_all backend to deepep_low_latency " - "to use those kernels instead.") - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - if model_config is not None: - model_config.enforce_eager = True + "Data Parallel with DeepEP high-throughput: using PIECEWISE " + "CUDA graphs and excluding MoE ops from capture. Set " + "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE " + "graphs captured as well.") + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE @classmethod def get_current_memory_usage(cls, From 19332c0479bbc8e5389afd53e9d15a028bdaf5f8 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Tue, 9 Sep 2025 22:29:50 +0800 Subject: [PATCH 072/584] [Model] Systematic support for fp32 head, pooling models part (#23810) Signed-off-by: wang.yuqi <noooop@126.com> --- tests/models/language/pooling/mteb_utils.py | 37 ++++++++++--- .../pooling/test_bge_reranker_v2_gemma.py | 1 + vllm/config/__init__.py | 53 ++++++++++++++++++- vllm/model_executor/layers/pooler.py | 38 +++++++------ vllm/model_executor/models/adapters.py | 10 ++-- vllm/model_executor/models/bert.py | 4 +- vllm/model_executor/models/bert_with_rope.py | 16 +++--- vllm/model_executor/models/gpt2.py | 10 ++-- vllm/model_executor/models/internlm2.py | 19 ++++--- vllm/model_executor/models/jamba.py | 2 +- vllm/model_executor/models/jina_vl.py | 13 +++-- vllm/model_executor/models/modernbert.py | 4 +- vllm/model_executor/models/qwen2_rm.py | 4 ++ vllm/model_executor/models/roberta.py | 16 ++++-- 14 files changed, 166 insertions(+), 61 deletions(-) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 68b1cc80303ad..7336c30bdda33 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -9,6 +9,7 @@ import mteb import numpy as np import pytest import requests +import torch from tests.models.utils import (EmbedModelInfo, RerankModelInfo, check_embeddings_close) @@ -165,16 +166,19 @@ def mteb_test_embed_models(hf_runner, vllm_extra_kwargs=None, hf_model_callback=None, atol=MTEB_EMBED_TOL): + # A model family has many models with the same architecture, + # and we don't need to test each one. if not model_info.enable_test: - # A model family has many models with the same architecture, - # and we don't need to test each one. pytest.skip("Skipping test.") - example_prompts = ["The chef prepared a delicious meal."] + # Test embed_dims, isnan and whether to use normalize + example_prompts = ["The chef prepared a delicious meal." 
* 1000] + # Allow vllm to test using the given dtype, such as float32 vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype + # Allow vllm to test using hf_overrides if model_info.hf_overrides is not None: vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides @@ -186,21 +190,32 @@ def mteb_test_embed_models(hf_runner, model_config = vllm_model.llm.llm_engine.model_config + # Confirm whether vllm is using the correct architecture if model_info.architecture: assert model_info.architecture in model_config.architectures + + # Confirm whether vllm uses the correct default_pooling_type, which + # relates to whether chunked prefill and prefix caching are enabled assert (model_config._model_info.default_pooling_type == model_info.default_pooling_type) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype - vllm_outputs = vllm_model.embed(example_prompts) + # Test embed_dims, isnan and whether to use normalize + vllm_outputs = vllm_model.embed(example_prompts, + truncate_prompt_tokens=-1) + assert not torch.any(torch.isnan(torch.tensor(vllm_outputs))) + + # Accelerate mteb test by setting + # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: with hf_runner(model_info.name, is_sentence_transformer=True, dtype="float32") as hf_model: + # e.g. setting default parameters for the encode method of hf_runner if hf_model_callback is not None: hf_model_callback(hf_model) @@ -299,14 +314,16 @@ def mteb_test_rerank_models(hf_runner, hf_model_callback=None, vllm_mteb_encoder=VllmMtebEncoder, atol=MTEB_RERANK_TOL): + # A model family has many models with the same architecture, + # and we don't need to test each one. if not model_info.enable_test: - # A model family has many models with the same architecture, - # and we don't need to test each one. 
pytest.skip("Skipping test.") + # Allow vllm to test using the given dtype, such as float32 vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype + # Allow vllm to test using hf_overrides if model_info.hf_overrides is not None: vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides @@ -319,9 +336,15 @@ def mteb_test_rerank_models(hf_runner, model_config = vllm_model.llm.llm_engine.model_config + # Confirm whether vllm is using the correct architecture if model_info.architecture: assert (model_info.architecture in model_config.architectures) + + # Score API is only enabled for num_labels == 1 assert model_config.hf_config.num_labels == 1 + + # Confirm whether vllm uses the correct default_pooling_type, which + # relates to whether chunked prefill and prefix caching are enabled assert (model_config._model_info.default_pooling_type == model_info.default_pooling_type) @@ -330,6 +353,8 @@ def mteb_test_rerank_models(hf_runner, languages=MTEB_RERANK_LANGS) vllm_dtype = model_config.dtype + # Accelerate mteb test by setting + # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: st_main_score, st_dtype = mteb_test_rerank_models_hf( hf_runner, model_info.name, hf_model_callback) diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index eaa8bfb84ffdd..fc888157b402a 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -14,6 +14,7 @@ from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", architecture="GemmaForSequenceClassification", + mteb_score=0.33757, hf_overrides={ "architectures": ["GemmaForSequenceClassification"], diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e474420e3f04c..4f4673ac6e67a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -745,7 +745,7 @@ class ModelConfig: self.pooler_config = self._init_pooler_config() - self.dtype = _get_and_verify_dtype( + self.dtype: torch.dtype = _get_and_verify_dtype( self.model, self.hf_config, self.dtype, @@ -1751,6 +1751,32 @@ class ModelConfig: # `llm as reranker` models defaults to not using pad_token. return getattr(self.hf_config, "use_pad_token", True) + @property + def head_dtype(self) -> torch.dtype: + """ + "head" refers to the last Linear layer(s) of an LLM, + such as the lm_head in a generation model, + or the score or classifier in a classification model. + + The default head_dtype based on runner_type.\n + - The pooling model defaults to using fp32 head, + you can use --hf-overrides '{"head_dtype": "model"}' to disable it.\n + - The generate model defaults to not using fp32 head, + you can use --hf-overrides '{"head_dtype": "float32"}' to enable it. + """ + head_dtype = _get_head_dtype(config=self.hf_config, + dtype=self.dtype, + runner_type=self.runner_type) + + if head_dtype not in current_platform.supported_dtypes: + logger.warning_once( + "The current platform does not support [%s] head dtype, " + "fallback to model dtype [%s].", head_dtype, self.dtype) + return self.dtype + + logger.debug_once("head dtype: %s", head_dtype) + return head_dtype + def get_and_verify_max_len(self, max_model_len: int): # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. 
@@ -2893,6 +2919,31 @@ def _get_and_verify_dtype( return torch_dtype +def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype, + runner_type: str) -> torch.dtype: + head_dtype: Optional[Union[str, + torch.dtype]] = getattr(config, "head_dtype", + None) + + if head_dtype == "model": + return dtype + elif isinstance(head_dtype, str): + head_dtype = head_dtype.lower() + if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {head_dtype!r}") + return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] + elif isinstance(head_dtype, torch.dtype): + return head_dtype + elif head_dtype is None: + if torch.float32 not in current_platform.supported_dtypes: + return dtype + if runner_type == "pooling": + return torch.float32 + return dtype + else: + raise ValueError(f"Unknown dtype: {head_dtype}") + + def _get_and_verify_max_len( hf_config: PretrainedConfig, tokenizer_config: Optional[dict], diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index afe7ea7b83924..b571a8f866990 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -5,7 +5,7 @@ from collections.abc import Mapping, Set from dataclasses import dataclass from enum import IntEnum from itertools import groupby -from typing import Callable, Optional, TypeVar, Union, cast +from typing import Callable, Optional, TypeVar, Union import torch import torch.nn as nn @@ -362,14 +362,13 @@ class PoolerIdentity(PoolerActivation): class PoolerNormalize(PoolerActivation): def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: - x = F.normalize(pooled_data.float(), p=2, dim=-1) - return x.to(pooled_data.dtype) + return F.normalize(pooled_data, p=2, dim=-1) class PoolerMultiLabelClassify(PoolerActivation): def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: - return F.sigmoid(pooled_data.float()).to(pooled_data.dtype) + return F.sigmoid(pooled_data) class PoolerClassify(PoolerActivation): @@ -394,9 +393,9 @@ class PoolerClassify(PoolerActivation): pooled_data.shape[-1]) if num_labels < 2: - return F.sigmoid(pooled_data.float()).to(pooled_data.dtype) + return F.sigmoid(pooled_data) - return F.softmax(pooled_data.float(), dim=-1).to(pooled_data.dtype) + return F.softmax(pooled_data, dim=-1) class LambdaPoolerActivation(PoolerActivation): @@ -432,8 +431,9 @@ class EmbeddingPoolerHead(PoolerHead): from vllm.model_executor.models.adapters import _load_st_projector vllm_config = get_current_vllm_config() - self.projector = _load_st_projector( + self.projector: Optional[nn.Module] = _load_st_projector( vllm_config.model_config) if vllm_config else None + self.head_dtype = vllm_config.model_config.head_dtype def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): @@ -442,16 +442,11 @@ class EmbeddingPoolerHead(PoolerHead): pooled_data = torch.stack(pooled_data) # pooled_data shape: [batchsize, hidden_dimension] + pooled_data = pooled_data.to(self.head_dtype) + # Apply ST projector if self.projector is not None: - projector = cast(nn.Module, self.projector) - - def _proj(x: torch.Tensor) -> torch.Tensor: - orig_dtype = x.dtype - y = projector(x.to(torch.float32)) - return y.to(orig_dtype) - - pooled_data = _proj(pooled_data) + pooled_data = self.projector(pooled_data) # pooled_data shape: [batchsize, embedding_dimension] pooling_params = get_pooling_params(pooling_metadata) @@ -494,8 +489,18 @@ class RewardPoolerHead(PoolerHead): def __init__(self) -> None: 
super().__init__(activation=PoolerClassify(static_num_labels=False)) + from vllm.config import get_current_vllm_config + vllm_config = get_current_vllm_config() + self.head_dtype = vllm_config.model_config.head_dtype + def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): + + if isinstance(pooled_data, list): + pooled_data = [p.to(self.head_dtype) for p in pooled_data] + else: + pooled_data = pooled_data.to(self.head_dtype) + pooling_params = get_pooling_params(pooling_metadata) # for softmax @@ -641,6 +646,7 @@ class ClassifierPooler(Pooler): self.act_fn = act_fn or PoolerClassify() self.logit_bias: Optional[ float] = vllm_config.model_config.pooler_config.logit_bias + self.head_dtype = vllm_config.model_config.head_dtype def get_supported_tasks(self) -> Set[PoolingTask]: return {"classify", "score"} @@ -655,6 +661,8 @@ class ClassifierPooler(Pooler): pooled_data = torch.stack(pooled_data) # pooled_data shape: [batchsize, hidden_size] + pooled_data = pooled_data.to(self.head_dtype) + if self.classifier is not None: pooled_data = self.classifier(pooled_data) # pooled_data shape: [batchsize, num_labels] diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index bb96bc559200c..c189208fa075b 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -62,7 +62,7 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]: linear = nn.Linear(layer_config.get("in_features", 768), layer_config.get("out_features", 768), bias=layer_config.get("bias", True), - dtype=torch.float32) + dtype=model_config.head_dtype) if not _load_dense_weights(linear, folder, model_config): continue @@ -70,7 +70,7 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]: layers.append(linear) if act_name := layer_config.get("activation_function"): layers.append(get_act_fn(act_name)) - return nn.Sequential(*layers).to(dtype=torch.float32) + return nn.Sequential(*layers).to(dtype=model_config.head_dtype) except Exception: logger.exception("ST projector loading failed") @@ -105,15 +105,13 @@ def _load_dense_weights(linear: nn.Linear, folder: str, if weight_key in state_dict: weight_loader = getattr(linear.weight, "weight_loader", default_weight_loader) - weight_loader(linear.weight, - state_dict[weight_key].to(torch.float32)) + weight_loader(linear.weight, state_dict[weight_key]) bias_key = weight_key.replace("weight", "bias") if linear.bias is not None and bias_key in state_dict: bias_loader = getattr(linear.bias, "weight_loader", default_weight_loader) - bias_loader(linear.bias, - state_dict[bias_key].to(torch.float32)) + bias_loader(linear.bias, state_dict[bias_key]) return True except Exception: logger.exception("Failed to load %s", filename) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 8f23439655ed7..c07e5364814ac 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -562,7 +562,9 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, self.bert = BertPoolingModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "bert"), embedding_class=BertEmbedding) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.classifier = nn.Linear(config.hidden_size, + config.num_labels, + dtype=vllm_config.model_config.head_dtype) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None diff --git 
a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 3be7e11d947d5..b758cbf28d893 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -637,14 +637,14 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding): self.new = GteNewModel(vllm_config=vllm_config, prefix=prefix, add_pooling_layer=True) - self.classifier = RowParallelLinear(config.hidden_size, - config.num_labels, - input_is_parallel=False, - bias=True, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "classifier"), - return_bias=False) + self.classifier = ReplicatedLinear( + config.hidden_size, + config.num_labels, + bias=True, + quant_config=quant_config, + params_dtype=vllm_config.model_config.head_dtype, + prefix=maybe_prefix(prefix, "classifier"), + return_bias=False) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 4446b5ab181c1..0f6521e44e6be 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -339,7 +339,10 @@ class GPT2ForSequenceClassification(nn.Module): config = vllm_config.model_config.hf_config self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "gpt2")) - self.score = nn.Linear(config.n_embd, config.num_labels, bias=False) + self.score = nn.Linear(config.n_embd, + config.num_labels, + bias=False, + dtype=vllm_config.model_config.head_dtype) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None @@ -348,7 +351,7 @@ class GPT2ForSequenceClassification(nn.Module): "encode": Pooler.for_encode(pooler_config), "classify": - Pooler.for_classify(pooler_config, classifier=None), + Pooler.for_classify(pooler_config, classifier=self.score), }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): @@ -367,8 +370,7 @@ class GPT2ForSequenceClassification(nn.Module): position_ids=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) - logits = self.score(hidden_states) - return logits + return hidden_states def _add_transformer_prefix( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 320e8d9d480c3..ce94328797ed6 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -423,13 +423,15 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM): delattr(self, attr) config = vllm_config.model_config.hf_config - self.v_head = RowParallelLinear( - config.hidden_size, - 1, - bias=False, - input_is_parallel=False, - prefix=maybe_prefix(prefix, "v_head"), - ) + self.head_dtype = vllm_config.model_config.head_dtype + + self.v_head = RowParallelLinear(config.hidden_size, + 1, + bias=False, + input_is_parallel=False, + params_dtype=self.head_dtype, + prefix=maybe_prefix(prefix, "v_head"), + return_bias=False) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None @@ -446,5 +448,6 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM): ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) - logits, _ = self.v_head(hidden_states) + hidden_states = hidden_states.to(self.head_dtype) + logits = self.v_head(hidden_states) return logits diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 
aebd2cbe2e999..550fde17b6c53 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -613,7 +613,7 @@ class JambaForSequenceClassification(JambaForCausalLM): config.hidden_size, num_labels, bias=score_bias, - dtype=torch.float32, + dtype=vllm_config.model_config.head_dtype, ) pooler_config = vllm_config.model_config.pooler_config diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 140b0d1674728..f8c2a1e507a74 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -5,9 +5,9 @@ from typing import Optional import torch import torch.nn as nn -from transformers import BatchFeature, PretrainedConfig +from transformers import BatchFeature -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -28,13 +28,17 @@ logger = init_logger(__name__) class JinaVLScorer(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, model_config: "ModelConfig"): super().__init__() + config = model_config.hf_config + head_dtype = model_config.head_dtype self.dense = ColumnParallelLinear(config.hidden_size, config.hidden_size, + params_dtype=head_dtype, bias=True) self.out_proj = RowParallelLinear(config.hidden_size, config.num_labels, + params_dtype=head_dtype, bias=True) def forward(self, x, **kwargs): @@ -88,11 +92,10 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "qwen2_vl")) - config = vllm_config.model_config.hf_config pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.score = JinaVLScorer(config) + self.score = JinaVLScorer(vllm_config.model_config) self.pooler = DispatchPooler({ "encode": Pooler.for_encode(pooler_config), diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 776287589808a..1d5da3139de92 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -306,7 +306,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): self.config = config self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.classifier = nn.Linear(config.hidden_size, + config.num_labels, + dtype=vllm_config.model_config.head_dtype) self.pooling = ModernBertPooler(config) pooler_config = vllm_config.model_config.pooler_config diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 421b43563bade..2bd9d2b52628a 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -53,15 +53,18 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): self.quant_config = quant_config self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.head_dtype = vllm_config.model_config.head_dtype self.score = nn.Sequential( ColumnParallelLinear(config.hidden_size, config.hidden_size, quant_config=quant_config, + params_dtype=self.head_dtype, return_bias=False), nn.ReLU(), RowParallelLinear(config.hidden_size, config.num_labels, + params_dtype=self.head_dtype, 
quant_config=quant_config, return_bias=False), ) @@ -80,6 +83,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + hidden_states = hidden_states.to(self.head_dtype) logits = self.score(hidden_states) return logits diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 2bfa51162910b..ba405be416876 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -8,7 +8,7 @@ import torch from torch import nn from transformers import RobertaConfig -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool, DispatchPooler, Pooler) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -73,10 +73,16 @@ class RobertaEmbedding(nn.Module): class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" - def __init__(self, config: RobertaConfig): + def __init__(self, model_config: "ModelConfig"): super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + config = model_config.hf_config + head_dtype = model_config.head_dtype + self.dense = nn.Linear(config.hidden_size, + config.hidden_size, + dtype=head_dtype) + self.out_proj = nn.Linear(config.hidden_size, + config.num_labels, + dtype=head_dtype) def forward(self, x: torch.Tensor) -> torch.Tensor: # CLSPool has already been applied in `pooling` @@ -184,7 +190,7 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): self.roberta = BertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "bert"), embedding_class=RobertaEmbedding) - self.classifier = RobertaClassificationHead(config) + self.classifier = RobertaClassificationHead(vllm_config.model_config) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None From 922d3b401ba616e6bb1b2292058e178f365e1d13 Mon Sep 17 00:00:00 2001 From: "d.transposed" <damian.bogunowicz@gmail.com> Date: Tue, 9 Sep 2025 16:30:24 +0200 Subject: [PATCH 073/584] [Bugfix] Handle the edge case in detokenizer where processed tokens contain both `stop` str and `eos` token (#23938) Signed-off-by: dtransposed <damian.bogunowicz@gmail.com> --- ...stop_string_while_stop_model_terminates.py | 103 ++++++++++++++++++ vllm/v1/engine/detokenizer.py | 9 +- 2 files changed, 106 insertions(+), 6 deletions(-) create mode 100644 tests/detokenizer/test_stop_string_while_stop_model_terminates.py diff --git a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py new file mode 100644 index 0000000000000..9b32a2927f2de --- /dev/null +++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer + + +@pytest.fixture(params=[True, False]) +def include_stop_str_in_output(request): + return request.param + + +class _DummyDetokenizer(BaseIncrementalDetokenizer): + + def __init__(self, request: EngineCoreRequest): + super().__init__(request) + + 
    def decode_next(self, next_token_id: int) -> str:
+        # Map token id to single ASCII character for deterministic testing.
+        return chr(next_token_id)
+
+
+def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
+    params = SamplingParams(
+        stop=stop,
+        include_stop_str_in_output=include_stop_str_in_output,
+        min_tokens=min_tokens)
+    # Keep other fields minimal for unit test purposes.
+    req = EngineCoreRequest(
+        request_id="test",
+        prompt_token_ids=[],
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
+    return req
+
+
+def test_stop_string_while_stop_token_terminates(
+        include_stop_str_in_output: bool):
+    """
+    This test verifies that the detokenizer correctly handles the case where
+    the generated token sequence contains both:
+    - a stop token
+    - an <eos> token
+
+    The detokenizer should respect the stop string and truncate the output
+    accordingly.
+
+    Imagine the following sequence:
+    - "abcdeZ" is generated, where "Z" is the <eos> token.
+    - "cd" is the stop string.
+
+    If include_stop_str_in_output=False, the detokenizer should truncate the
+    output to "ab" because the stop string "cd" is excluded.
+    If include_stop_str_in_output=True, the detokenizer should include the stop
+    string "cd" in the output, resulting in "abcd".
+
+    This verifies the behavioral change introduced in
+    BaseIncrementalDetokenizer where stop-string evaluation occurs before the
+    early-return on stop_terminated.
+    """
+
+    # Generate text "abcdeZ" and tokenize it.
+    generated_text = "abcde"
+    eos_token = "Z"
+    stop_string = "cd"
+    generated_text = generated_text + eos_token
+    token_ids = [ord(c) for c in generated_text]
+
+    # Create a request with the stop string and initialize the detokenizer.
+    req = _make_request(stop=[stop_string],
+                        include_stop_str_in_output=include_stop_str_in_output)
+    detok = _DummyDetokenizer(req)
+
+    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
+    result = detok.update(new_token_ids=token_ids, stop_terminated=True)
+
+    # The update should report the stop string that was matched.
+    assert result == stop_string
+
+    # Output text should reflect stop-string handling:
+    # - include_stop_str_in_output=False => exclude "cd" => "ab"
+    # - include_stop_str_in_output=True => include "cd" => "abcd"
+    expected_text = "abcd" if include_stop_str_in_output else "ab"
+    assert detok.output_text == expected_text
+
+    # The skipped final token should still be recorded in token_ids.
+    assert detok.output_token_ids == token_ids
+
+    # get_next_output_text should return the full text when finished=True.
+    # (Buffering only applies during streaming when finished=False.)
+    assert detok.get_next_output_text(finished=True,
+                                      delta=False) == expected_text
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 38f435f5166e0..cf4b06db843bd 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -121,12 +121,9 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
                 self.output_token_ids) <= self.min_tokens:
             stop_check_offset = len(self.output_text)

-        if stop_terminated:
-            if skipped_stop_token_id is not None:
-                # Cleanup after skipping detokenization.
-                self.token_ids.append(skipped_stop_token_id)
-            # Stop token triggered; skip stop string check.
-            return None
+        if skipped_stop_token_id is not None:
+            # Cleanup after skipping detokenization.
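+            # (We no longer early-return when a stop token terminated the
+            # request: control falls through to the stop-string check below,
+            # so a stop string that completes in the same step still
+            # truncates the output; see the new test above.)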
+ self.token_ids.append(skipped_stop_token_id) # 2) Evaluate stop strings. stop_string = None From 1c63a16b653d1f3f4260e862f01eaffed283c4c8 Mon Sep 17 00:00:00 2001 From: Micah Williamson <micah.williamson@amd.com> Date: Tue, 9 Sep 2025 09:38:10 -0500 Subject: [PATCH 074/584] [Core] Run garbage collector after CUDA graph capture to fix throughput regression (#24128) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- vllm/v1/worker/gpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 549c5dd2bbb2c..897c3a6213207 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2885,6 +2885,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): finally: if should_freeze: gc.unfreeze() + gc.collect() # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes From 1aa427fdc1d4ddb7a0c966f7c46badd1c3952c14 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Wed, 10 Sep 2025 00:04:41 +0800 Subject: [PATCH 075/584] [Kernels] Add Flash Linear Attention Kernels (#24518) Signed-off-by: youkaichao <youkaichao@gmail.com> --- tools/mypy.sh | 2 +- vllm/model_executor/layers/fla/__init__.py | 8 + .../model_executor/layers/fla/ops/__init__.py | 17 + vllm/model_executor/layers/fla/ops/chunk.py | 225 +++++++++++ .../layers/fla/ops/chunk_delta_h.py | 289 ++++++++++++++ vllm/model_executor/layers/fla/ops/chunk_o.py | 176 +++++++++ .../layers/fla/ops/chunk_scaled_dot_kkt.py | 138 +++++++ vllm/model_executor/layers/fla/ops/cumsum.py | 226 +++++++++++ .../layers/fla/ops/fused_recurrent.py | 366 ++++++++++++++++++ vllm/model_executor/layers/fla/ops/index.py | 39 ++ vllm/model_executor/layers/fla/ops/l2norm.py | 143 +++++++ .../layers/fla/ops/layernorm_guard.py | 337 ++++++++++++++++ vllm/model_executor/layers/fla/ops/op.py | 44 +++ .../layers/fla/ops/solve_tril.py | 365 +++++++++++++++++ vllm/model_executor/layers/fla/ops/utils.py | 180 +++++++++ vllm/model_executor/layers/fla/ops/wy_fast.py | 114 ++++++ vllm/triton_utils/__init__.py | 4 +- 17 files changed, 2671 insertions(+), 2 deletions(-) create mode 100644 vllm/model_executor/layers/fla/__init__.py create mode 100644 vllm/model_executor/layers/fla/ops/__init__.py create mode 100644 vllm/model_executor/layers/fla/ops/chunk.py create mode 100644 vllm/model_executor/layers/fla/ops/chunk_delta_h.py create mode 100644 vllm/model_executor/layers/fla/ops/chunk_o.py create mode 100644 vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py create mode 100644 vllm/model_executor/layers/fla/ops/cumsum.py create mode 100644 vllm/model_executor/layers/fla/ops/fused_recurrent.py create mode 100644 vllm/model_executor/layers/fla/ops/index.py create mode 100644 vllm/model_executor/layers/fla/ops/l2norm.py create mode 100644 vllm/model_executor/layers/fla/ops/layernorm_guard.py create mode 100644 vllm/model_executor/layers/fla/ops/op.py create mode 100644 vllm/model_executor/layers/fla/ops/solve_tril.py create mode 100644 vllm/model_executor/layers/fla/ops/utils.py create mode 100644 vllm/model_executor/layers/fla/ops/wy_fast.py diff --git a/tools/mypy.sh b/tools/mypy.sh index 781d8fc02884b..63e3b9a916634 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -29,7 +29,7 @@ run_mypy vllm/engine run_mypy vllm/executor run_mypy vllm/inputs run_mypy vllm/lora -run_mypy vllm/model_executor 
+run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor run_mypy vllm/plugins run_mypy vllm/worker run_mypy vllm/v1 diff --git a/vllm/model_executor/layers/fla/__init__.py b/vllm/model_executor/layers/fla/__init__.py new file mode 100644 index 0000000000000..0e89cf9f79439 --- /dev/null +++ b/vllm/model_executor/layers/fla/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang diff --git a/vllm/model_executor/layers/fla/ops/__init__.py b/vllm/model_executor/layers/fla/ops/__init__.py new file mode 100644 index 0000000000000..c19cc14ba6928 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/__init__.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +from .chunk import chunk_gated_delta_rule +from .fused_recurrent import fused_recurrent_gated_delta_rule +from .layernorm_guard import RMSNormGated + +__all__ = [ + "RMSNormGated", + "chunk_gated_delta_rule", + "fused_recurrent_gated_delta_rule", +] diff --git a/vllm/model_executor/layers/fla/ops/chunk.py b/vllm/model_executor/layers/fla/ops/chunk.py new file mode 100644 index 0000000000000..e7d295aff2392 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/chunk.py @@ -0,0 +1,225 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import warnings +from typing import Optional + +import torch +from einops import rearrange + +from .chunk_delta_h import chunk_gated_delta_rule_fwd_h +from .chunk_o import chunk_fwd_o +from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd +from .cumsum import chunk_local_cumsum +from .l2norm import l2norm_fwd +from .solve_tril import solve_tril +from .utils import SUPPRESS_LEVEL, input_guard +from .wy_fast import recompute_w_u_fwd + + +def chunk_gated_delta_rule_fwd(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: Optional[torch.LongTensor] = None): + g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens) + # obtain WY representation. u is actually the new v. 
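+    # The remaining steps mirror the calls below:
+    #   1. A: per-chunk lower-triangular system built from the beta-scaled,
+    #      gate-weighted K K^T, then inverted (chunk_scaled_dot_kkt_fwd +
+    #      solve_tril);
+    #   2. w, u: WY-representation factors recomputed from A
+    #      (recompute_w_u_fwd);
+    #   3. h, v_new: inter-chunk recurrent states and updated values
+    #      (chunk_gated_delta_rule_fwd_h);
+    #   4. o: final outputs computed from q, k, v_new and h (chunk_fwd_o).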
+ A = chunk_scaled_dot_kkt_fwd(k=k, + beta=beta, + g_cumsum=g, + cu_seqlens=cu_seqlens, + output_dtype=torch.float32) + A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype) + w, u = recompute_w_u_fwd( + k=k, + v=v, + beta=beta, + A=A, + g_cumsum=g, + cu_seqlens=cu_seqlens, + ) + h, v_new, final_state = chunk_gated_delta_rule_fwd_h( + k=k, + w=w, + u=u, + g=g, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + o = chunk_fwd_o( + q=q, + k=k, + v=v_new, + h=h, + g=g, + scale=scale, + cu_seqlens=cu_seqlens, + ) + if SUPPRESS_LEVEL < 3: + return g, o, A, final_state, None, None, None + elif SUPPRESS_LEVEL >= 3: + return g, o, A, final_state, w, h, v_new + + +class ChunkGatedDeltaRuleFunction(torch.autograd.Function): + + @staticmethod + @input_guard + @torch.amp.custom_fwd(device_type='cuda') + def forward(ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: Optional[torch.LongTensor] = None, + use_qk_l2norm_in_kernel: bool = False): + if use_qk_l2norm_in_kernel: + q = l2norm_fwd(q) + k = l2norm_fwd(k) + + g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd( + q=q, + k=k, + v=v, + g=g, + beta=beta, + scale=scale, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + ctx.scale = scale + ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel + return o.to(q.dtype), final_state + + +@torch.compiler.disable +def chunk_gated_delta_rule(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = False): + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + v (torch.Tensor): + values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + g (torch.Tensor): + (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + beta (torch.Tensor): + betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, H, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + output_final_state (Optional[bool]): + Whether to output the final state of shape `[N, H, K, V]`. Default: `False`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + head_first (Optional[bool]): + Whether the inputs are in the head-first format, which is not supported for variable-length inputs. + Default: `False`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + final_state (torch.Tensor): + Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`. 
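+
+    Note:
+        When `use_qk_l2norm_in_kernel=True`, `q` and `k` are L2-normalized
+        inside the kernel (via `l2norm_fwd`) before the recurrence, so
+        unnormalized projections can be passed directly.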
+ + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, K, V = 4, 2048, 4, 512, 512 + >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda') + >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid() + >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda')) + >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda') + >>> o, ht = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True, + cu_seqlens=cu_seqlens + ) + """ + assert q.dtype == k.dtype == v.dtype + assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16." + assert len( + beta.shape + ) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise." + + if head_first: + raise DeprecationWarning( + "head_first is deprecated and will be removed in a future version. " + "Please use head_first=False for now instead.", + stacklevel=2) + q, k, v, beta, g = map( + lambda x: rearrange(x, 'b h t ... -> b t h ...'), + (q, k, v, beta, g)) + if not head_first and q.shape[1] < q.shape[2]: + warnings.warn( + f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). " + "This may indicate the inputs were passed in head-first format [B, H, T, ...] " + "when head_first=False was specified. " + "Please verify your input tensor format matches the expected shape [B, T, H, ...].", + stacklevel=2) + if cu_seqlens is not None: + if q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." + f"Please flatten variable-length inputs before processing.") + if initial_state is not None and initial_state.shape[0] != len( + cu_seqlens) - 1: + raise ValueError( + f"The number of initial states is expected to be equal to the number of input sequences, " + f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}." + ) + if scale is None: + scale = k.shape[-1]**-0.5 + o, final_state = ChunkGatedDeltaRuleFunction.apply( + q, k, v, g, beta, scale, initial_state, output_final_state, cu_seqlens, + use_qk_l2norm_in_kernel) + if head_first: + o = rearrange(o, 'b t h ... 
-> b h t ...') + return o, final_state diff --git a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py new file mode 100644 index 0000000000000..eac56ef352e70 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py @@ -0,0 +1,289 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices, prepare_chunk_offsets +from .op import exp, safe_exp +from .utils import is_nvidia_hopper, use_cuda_graph + +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16] + + +@triton.heuristics({ + 'USE_G': lambda args: args['g'] is not None, + 'USE_INITIAL_STATE': lambda args: args['h0'] is not None, + 'STORE_FINAL_STATE': lambda args: args['ht'] is not None, + 'SAVE_NEW_VALUE': lambda args: args['v_new'] is not None, + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] for num_stages in [2, 3, 4] for BV in [32, 64] + ], + key=['H', 'K', 'V', 'BT', 'USE_G'], + use_cuda_graph=use_cuda_graph, +) +@triton.jit(do_not_specialize=['T']) +def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( + k, + v, + w, + v_new, + g, + h, + h0, + ht, + cu_seqlens, + chunk_offsets, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + SAVE_NEW_VALUE: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_h1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_h2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_h3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_h4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + h += (boh * H + i_h) * K * V + v += (bos * H + i_h) * V + k += (bos * Hg + i_h // (H // Hg)) * K + w += (bos * H + i_h) * K + if SAVE_NEW_VALUE: + v_new += (bos * H + i_h) * V + stride_v = H * V + stride_h = H * K * V + stride_k = Hg * K + stride_w = H * K + if USE_INITIAL_STATE: + h0 = h0 + i_nh * K * V + if STORE_FINAL_STATE: + ht = ht + i_nh * K * V + + # load initial state + if USE_INITIAL_STATE: + p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), + (1, 0)) + b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32) + if K > 64: + p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), + (64, BV), (1, 0)) + b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32) + if K > 128: + p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), + (64, BV), (1, 0)) + b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32) + 
if K > 192: + p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), + (64, BV), (1, 0)) + b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32) + + # main recurrence + for i_t in range(NT): + p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), + (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), + (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h2, + b_h2.to(p_h2.dtype.element_ty), + boundary_check=(0, 1)) + if K > 128: + p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), + (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h3, + b_h3.to(p_h3.dtype.element_ty), + boundary_check=(0, 1)) + if K > 192: + p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), + (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h4, + b_h4.to(p_h4.dtype.element_ty), + boundary_check=(0, 1)) + + p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), + (BT, BV), (1, 0)) + p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), + (i_t * BT, i_v * BV), (BT, BV), + (1, 0)) if SAVE_NEW_VALUE else None + b_v_new = tl.zeros([BT, BV], dtype=tl.float32) + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 0), + (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype)) + if K > 64: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 64), + (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype)) + if K > 128: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 128), + (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype)) + if K > 192: + p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 192), + (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype)) + b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1)) + + if SAVE_NEW_VALUE: + p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), + (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + tl.store(p_v_new, + b_v_new.to(p_v_new.dtype.element_ty), + boundary_check=(0, 1)) + + if USE_G: + last_idx = min((i_t + 1) * BT, T) - 1 + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + b_g = tl.load(p_g, boundary_check=(0, )) + b_v_new = b_v_new * safe_exp(b_g_last - b_g)[:, None] + b_g_last = exp(b_g_last) + b_h1 = b_h1 * b_g_last + if K > 64: + b_h2 = b_h2 * b_g_last + if K > 128: + b_h3 = b_h3 * b_g_last + if K > 192: + b_h4 = b_h4 * b_g_last + b_v_new = b_v_new.to(k.dtype.element_ty) + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT), + (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h1 += tl.dot(b_k, b_v_new) + if K > 64: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT), + (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h2 += tl.dot(b_k, b_v_new) + if K > 128: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT), + (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h3 += tl.dot(b_k, b_v_new) + if K > 192: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT), + (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h4 += tl.dot(b_k, b_v_new) + + # epilogue + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, 
BV), + (1, 0)) + tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), + (64, BV), (1, 0)) + tl.store(p_ht, + b_h2.to(p_ht.dtype.element_ty), + boundary_check=(0, 1)) + if K > 128: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), + (64, BV), (1, 0)) + tl.store(p_ht, + b_h3.to(p_ht.dtype.element_ty), + boundary_check=(0, 1)) + if K > 192: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), + (64, BV), (1, 0)) + tl.store(p_ht, + b_h4.to(p_ht.dtype.element_ty), + boundary_check=(0, 1)) + + +def chunk_gated_delta_rule_fwd_h( + k: torch.Tensor, + w: torch.Tensor, + u: torch.Tensor, + g: Optional[torch.Tensor] = None, + initial_state: Optional[torch.Tensor] = None, + output_final_state: bool = False, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + save_new_value: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, Hg, K, V = *k.shape, u.shape[-1] + H = u.shape[-2] + BT = chunk_size + + chunk_indices = prepare_chunk_indices( + cu_seqlens, chunk_size) if cu_seqlens is not None else None + # N: the actual number of sequences in the batch with either equal or variable lengths + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + else: + N, NT, chunk_offsets = len(cu_seqlens) - 1, len( + chunk_indices), prepare_chunk_offsets(cu_seqlens, BT) + assert K <= 256, "current kernel does not support head dimension larger than 256." + + h = k.new_empty(B, NT, H, K, V) + final_state = k.new_empty( + N, H, K, V, dtype=torch.float32) if output_final_state else None + + v_new = torch.empty_like(u) if save_new_value else None + + def grid(meta): + return (triton.cdiv(V, meta['BV']), N * H) + + chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid]( + k=k, + v=u, + w=w, + v_new=v_new, + g=g, + h=h, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT) + return h, v_new, final_state diff --git a/vllm/model_executor/layers/fla/ops/chunk_o.py b/vllm/model_executor/layers/fla/ops/chunk_o.py new file mode 100644 index 0000000000000..5a36d313320fa --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/chunk_o.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
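+# [Editor's note, not part of the upstream header] chunk_o.py produces the
+# chunked attention output: an inter-chunk term q @ h against the running
+# state plus a causal intra-chunk term tril(q @ k^T) @ v, both decayed by the
+# gates. A rough single-chunk PyTorch sketch of the math (editorial only,
+# assumed shapes b_q/b_k: [BT, K], b_h: [K, V], b_v: [BT, V], b_g: [BT]):
+#     b_o = (b_q @ b_h) * torch.exp(b_g)[:, None]
+#     b_A = torch.tril((b_q @ b_k.T) * torch.exp(b_g[:, None] - b_g[None, :]))
+#     b_o = scale * (b_o + b_A @ b_v)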
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +# ruff: noqa: E501 + +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import exp, safe_exp +from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper + +BKV_LIST = [64, 128] if check_shared_mem() else [32, 64] +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8] + + +@triton.heuristics({ + 'USE_G': lambda args: args['g'] is not None, + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({ + 'BK': BK, + 'BV': BV + }, + num_warps=num_warps, + num_stages=num_stages) for BK in BKV_LIST + for BV in BKV_LIST for num_warps in NUM_WARPS + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_fwd_kernel_o( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + # offset calculation + q += (bos * Hg + i_h // (H // Hg)) * K + k += (bos * Hg + i_h // (H // Hg)) * K + v += (bos * H + i_h) * V + o += (bos * H + i_h) * V + h += (i_tg * H + i_h).to(tl.int64) * K * V + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + b_A = tl.zeros([BT, BT], dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), + (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), + (BK, BT), (0, 1)) + p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), + (BK, BV), (1, 0)) + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + + # [BT, BK] @ [BK, BV] -> [BT, BV] + b_o += tl.dot(b_q, b_h) + # [BT, BK] @ [BK, BT] -> [BT, BT] + b_A += tl.dot(b_q, b_k) + + if USE_G: + g += bos * H + i_h + p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, )) + b_g = tl.load(p_g, boundary_check=(0, )) + b_o = b_o * exp(b_g)[:, None] + b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :]) + + o_i = tl.arange(0, BT) + m_A = o_i[:, None] >= o_i[None, :] + b_A = tl.where(m_A, b_A, 0) + + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), + (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), + (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # to fix mma -> mma layout conversion + # already solved by triton v3.2 or higher + b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_fwd_o( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: Optional[torch.Tensor] = None, # cumsum 
of log decay + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + chunk_size: int = 64) -> torch.Tensor: + B, T, Hg, K, V = *q.shape, v.shape[-1] + H = v.shape[-2] + if FLA_GDN_FIX_BT: + BT = 64 + else: + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + if scale is None: + scale = k.shape[-1]**-0.5 + + o = torch.empty_like(v) + + def grid(meta): + return (triton.cdiv(V, meta['BV']), NT, B * H) + + chunk_fwd_kernel_o[grid]( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + ) + return o diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py new file mode 100644 index 0000000000000..9938eae52db75 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import safe_exp + + +@triton.heuristics({ + 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None, + 'USE_G': lambda args: args['g_cumsum'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64, 128] for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'BT', 'IS_VARLEN'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_scaled_dot_kkt_fwd_kernel( + k, + beta, + g_cumsum, + A, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, + USE_G: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + o_t = tl.arange(0, BT) + + p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + b_beta = tl.load(p_beta, boundary_check=(0, )) + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K), + (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), + (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_beta[:, None] + b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k)) + + if USE_G: + p_g = tl.make_block_ptr(g_cumsum + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + b_g = tl.load(p_g, boundary_check=(0, )) + b_g_diff = b_g[:, None] - b_g[None, :] + b_A = b_A * safe_exp(b_g_diff) + + b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), + (i_t * BT, 0), 
(BT, BT), (1, 0)) + tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_scaled_dot_kkt_fwd( + k: torch.Tensor, + beta: torch.Tensor, + g_cumsum: Optional[torch.Tensor] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + chunk_size: int = 64, + output_dtype: torch.dtype = torch.float32) -> torch.Tensor: + r""" + Compute beta * K * K^T. + + Args: + k (torch.Tensor): + The key tensor of shape `[B, T, H, K]`. + beta (torch.Tensor): + The beta tensor of shape `[B, T, H]`. + g_cumsum (torch.Tensor): + The cumulative sum of the gate tensor of shape `[B, T, H]`. + Default: None + cu_seqlens (torch.LongTensor): + The cumulative sequence lengths of the input tensor. + Default: None + chunk_size (int): + The chunk size. Default: 64. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float32` + + Returns: + beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size. + """ + + B, T, Hg, K = k.shape + + H = beta.shape[-1] + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype) + chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)]( + k=k, + beta=beta, + g_cumsum=g_cumsum, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + Hg=Hg, + K=K, + BT=BT, + ) + return A diff --git a/vllm/model_executor/layers/fla/ops/cumsum.py b/vllm/model_executor/layers/fla/ops/cumsum.py new file mode 100644 index 0000000000000..59152e2c845ad --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/cumsum.py @@ -0,0 +1,226 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
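+# [Editor's note, not part of the upstream header] cumsum.py computes
+# chunk-local (per-BT-block) cumulative sums of the log-decay gates, in scalar
+# ([B, T, H]) and vector ([B, T, H, S]) variants, optionally reversed. For
+# equal-length inputs with T divisible by BT, the scalar kernel matches this
+# editorial sketch:
+#     g.view(B, T // BT, BT, H).cumsum(2).view(B, T, H)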
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import warnings +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .utils import check_shared_mem, input_guard + +BS_LIST = [32, 64] if check_shared_mem() else [16, 32] + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune(configs=[ + triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8] +], + key=['B', 'H', 'BT', 'IS_VARLEN', 'REVERSE']) +@triton.jit(do_not_specialize=['T']) +def chunk_local_cumsum_scalar_kernel( + s, + o, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T, ), (1, ), + (i_t * BT, ), (BT, ), (0, )) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T, ), (1, ), + (i_t * BT, ), (BT, ), (0, )) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), + (BT, ), (0, )) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), + (BT, ), (0, )) + # [BT] + b_s = tl.load(p_s, boundary_check=(0, )).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + if REVERSE: + b_z = tl.sum(b_s, axis=0) + b_o = -b_o + b_z[None] + b_s + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, )) + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune(configs=[ + triton.Config({'BS': BS}, num_warps=num_warps) for BS in BS_LIST + for num_warps in [2, 4, 8] +], + key=['B', 'H', 'S', 'BT', 'IS_VARLEN', 'REVERSE']) +@triton.jit(do_not_specialize=['T']) +def chunk_local_cumsum_vector_kernel( + s, + o, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + o_i = tl.arange(0, BT) + if REVERSE: + m_s = tl.where(o_i[:, None] <= o_i[None, :], 1., 0.) + else: + m_s = tl.where(o_i[:, None] >= o_i[None, :], 1., 0.) 
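+    # [Editor's note] The chunk-local cumsum below is realized as a masked
+    # matmul rather than a serial scan: with m_s lower-triangular (or
+    # upper-triangular when REVERSE), row i of m_s @ b_s is the inclusive
+    # prefix (resp. suffix) sum of b_s, so the whole scan maps onto one tl.dot.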
+ + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), + (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + b_o = tl.dot(m_s, b_s, allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_local_cumsum_scalar( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor: + if head_first: + B, H, T = g.shape + else: + B, T, H = g.shape + assert chunk_size == 2**(chunk_size.bit_length() - + 1), "chunk_size must be a power of 2" + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + grid = (NT, B * H) + chunk_local_cumsum_scalar_kernel[grid](g_org, + g, + cu_seqlens, + chunk_indices, + T=T, + B=B, + H=H, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse) + return g + + +def chunk_local_cumsum_vector( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor: + if head_first: + B, H, T, S = g.shape + else: + B, T, H, S = g.shape + BT = chunk_size + chunk_indices = prepare_chunk_indices( + cu_seqlens, chunk_size) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + assert chunk_size == 2**(chunk_size.bit_length() - + 1), "chunk_size must be a power of 2" + + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + + def grid(meta): + return (triton.cdiv(meta['S'], meta['BS']), NT, B * H) + + # keep cummulative normalizer in fp32 + # this kernel is equivalent to + # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1) + chunk_local_cumsum_vector_kernel[grid](g_org, + g, + cu_seqlens, + chunk_indices, + T=T, + B=B, + H=H, + S=S, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse) + return g + + +@input_guard +def chunk_local_cumsum(g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: Optional[torch.Tensor] = None, + head_first: bool = False, + output_dtype: Optional[torch.dtype] = torch.float, + **kwargs) -> torch.Tensor: + if not head_first and g.shape[1] < g.shape[2]: + warnings.warn( + f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). " + "This may indicate the inputs were passed in head-first format [B, H, T, ...] " + "when head_first=False was specified. 
" + "Please verify your input tensor format matches the expected shape [B, T, H, ...].", + stacklevel=2) + if cu_seqlens is not None: + assert g.shape[ + 0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(g.shape) == 3: + return chunk_local_cumsum_scalar(g, chunk_size, reverse, cu_seqlens, + head_first, output_dtype) + elif len(g.shape) == 4: + return chunk_local_cumsum_vector(g, chunk_size, reverse, cu_seqlens, + head_first, output_dtype) + else: + raise ValueError(f"Unsupported input shape {g.shape}. " + f"which should be (B, T, H, D) if `head_first=False` " + f"or (B, H, T, D) otherwise") diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py new file mode 100644 index 0000000000000..25a615fe12441 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py @@ -0,0 +1,366 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .op import exp + + +@triton.heuristics({ + 'USE_INITIAL_STATE': + lambda args: args['h0'] is not None, + 'IS_VARLEN': + lambda args: args['cu_seqlens'] is not None, + "IS_CONTINUOUS_BATCHING": + lambda args: args['ssm_state_indices'] is not None, + "IS_SPEC_DECODING": + lambda args: args['num_accepted_tokens'] is not None, +}) +@triton.jit(do_not_specialize=['N', 'T']) +def fused_recurrent_gated_delta_rule_fwd_kernel( + q, + k, + v, + g, + beta, + o, + h0, + ht, + cu_seqlens, + ssm_state_indices, + num_accepted_tokens, + scale, + N: tl.constexpr, # num of sequences + T: tl.constexpr, # num of tokens + B: tl.constexpr, + H: tl.constexpr, + HV: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + stride_init_state_token: tl.constexpr, + stride_final_state_token: tl.constexpr, + stride_indices_seq: tl.constexpr, + stride_indices_tok: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, # whether to use initial state + INPLACE_FINAL_STATE: tl.constexpr, # whether to store final state inplace + IS_BETA_HEADWISE: tl. 
+ constexpr, # whether beta is headwise vector or scalar, + USE_QK_L2NORM_IN_KERNEL: tl.constexpr, + IS_VARLEN: tl.constexpr, + IS_CONTINUOUS_BATCHING: tl.constexpr, + IS_SPEC_DECODING: tl.constexpr, +): + i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_n, i_hv = i_nh // HV, i_nh % HV + i_h = i_hv // (HV // H) + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64) + all = T + T = eos - bos + else: + bos, eos = i_n * T, i_n * T + T + all = B * T + + if T == 0: + # no tokens to process for this sequence + return + + o_k = i_k * BK + tl.arange(0, BK) + o_v = i_v * BV + tl.arange(0, BV) + + p_q = q + (bos * H + i_h) * K + o_k + p_k = k + (bos * H + i_h) * K + o_k + p_v = v + (bos * HV + i_hv) * V + o_v + if IS_BETA_HEADWISE: + p_beta = beta + (bos * HV + i_hv) * V + o_v + else: + p_beta = beta + bos * HV + i_hv + p_g = g + bos * HV + i_hv + p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v + + mask_k = o_k < K + mask_v = o_v < V + mask_h = mask_k[:, None] & mask_v[None, :] + + b_h = tl.zeros([BK, BV], dtype=tl.float32) + if USE_INITIAL_STATE: + if IS_CONTINUOUS_BATCHING: + if IS_SPEC_DECODING: + i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1 + else: + i_t = 0 + p_h0 = h0 + tl.load(ssm_state_indices + i_n * stride_indices_seq + + i_t).to(tl.int64) * stride_init_state_token + else: + p_h0 = h0 + bos * HV * K * V + p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :] + b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32) + + for i_t in range(0, T): + b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32) + b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32) + b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32) + b_g = tl.load(p_g).to(tl.float32) + + if USE_QK_L2NORM_IN_KERNEL: + b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6) + b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6) + b_q = b_q * scale + # [BK, BV] + b_h *= exp(b_g) + # [BV] + b_v -= tl.sum(b_h * b_k[:, None], 0) + if IS_BETA_HEADWISE: + b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32) + else: + b_beta = tl.load(p_beta).to(tl.float32) + b_v *= b_beta + # [BK, BV] + b_h += b_k[:, None] * b_v[None, :] + # [BV] + b_o = tl.sum(b_h * b_q[:, None], 0) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v) + + # keep the states for multi-query tokens + if INPLACE_FINAL_STATE: + p_ht = ht + tl.load(ssm_state_indices + i_n * stride_indices_seq + + i_t).to(tl.int64) * stride_final_state_token + else: + p_ht = ht + (bos + i_t) * stride_final_state_token + p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :] + tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h) + + p_q += H * K + p_k += H * K + p_o += HV * V + p_v += HV * V + p_g += HV + p_beta += HV * (V if IS_BETA_HEADWISE else 1) + + +def fused_recurrent_gated_delta_rule_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + inplace_final_state: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, + ssm_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + use_qk_l2norm_in_kernel: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K, V = *k.shape, v.shape[-1] + HV = v.shape[2] + N = B if cu_seqlens is None else len(cu_seqlens) - 1 + BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8) + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + assert NK == 1, "NK > 1 is 
not supported yet" + num_stages = 3 + num_warps = 1 + + o = q.new_empty(NK, *v.shape) + if inplace_final_state: + final_state = initial_state + else: + final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype) + + stride_init_state_token = initial_state.stride(0) + stride_final_state_token = final_state.stride(0) + + if ssm_state_indices is None: + stride_indices_seq, stride_indices_tok = 1, 1 + elif ssm_state_indices.ndim == 1: + stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1 + else: + stride_indices_seq, stride_indices_tok = ssm_state_indices.stride() + + grid = (NK, NV, N * HV) + fused_recurrent_gated_delta_rule_fwd_kernel[grid]( + q=q, + k=k, + v=v, + g=g, + beta=beta, + o=o, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + ssm_state_indices=ssm_state_indices, + num_accepted_tokens=num_accepted_tokens, + scale=scale, + N=N, + T=T, + B=B, + H=H, + HV=HV, + K=K, + V=V, + BK=BK, + BV=BV, + stride_init_state_token=stride_init_state_token, + stride_final_state_token=stride_final_state_token, + stride_indices_seq=stride_indices_seq, + stride_indices_tok=stride_indices_tok, + IS_BETA_HEADWISE=beta.ndim == v.ndim, + USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel, + INPLACE_FINAL_STATE=inplace_final_state, + num_warps=num_warps, + num_stages=num_stages, + ) + o = o.squeeze(0) + return o, final_state + + +class FusedRecurrentFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + inplace_final_state: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, + ssm_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + use_qk_l2norm_in_kernel: bool = False): + o, final_state = fused_recurrent_gated_delta_rule_fwd( + q=q.contiguous(), + k=k.contiguous(), + v=v.contiguous(), + g=g.contiguous(), + beta=beta.contiguous(), + scale=scale, + initial_state=initial_state, + inplace_final_state=inplace_final_state, + cu_seqlens=cu_seqlens, + ssm_state_indices=ssm_state_indices, + num_accepted_tokens=num_accepted_tokens, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + ) + + return o, final_state + + +def fused_recurrent_gated_delta_rule( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor = None, + scale: float = None, + initial_state: torch.Tensor = None, + inplace_final_state: bool = True, + cu_seqlens: Optional[torch.LongTensor] = None, + ssm_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, + use_qk_l2norm_in_kernel: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]`. + v (torch.Tensor): + values of shape `[B, T, HV, V]`. + GVA is applied if `HV > H`. + g (torch.Tensor): + g (decays) of shape `[B, T, HV]`. + beta (torch.Tensor): + betas of shape `[B, T, HV]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, HV, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + inplace_final_state: bool: + Whether to store the final state in-place to save memory. + Default: `True`. 
+ cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + ssm_state_indices (Optional[torch.Tensor]): + Indices to map the input sequences to the initial/final states. + num_accepted_tokens (Optional[torch.Tensor]): + Number of accepted tokens for each sequence during decoding. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, HV, V]`. + final_state (torch.Tensor): + Final state of shape `[N, HV, K, V]`. + + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512 + >>> q = torch.randn(B, T, H, K, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, HV, V, device='cuda') + >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda')) + >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid() + >>> h0 = torch.randn(B, HV, K, V, device='cuda') + >>> o, ht = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + cu_seqlens=cu_seqlens + ) + """ + if cu_seqlens is not None and q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." + f"Please flatten variable-length inputs before processing.") + if scale is None: + scale = k.shape[-1]**-0.5 + else: + assert scale > 0, "scale must be positive" + if beta is None: + beta = torch.ones_like(q[..., 0]) + o, final_state = FusedRecurrentFunction.apply( + q, + k, + v, + g, + beta, + scale, + initial_state, + inplace_final_state, + cu_seqlens, + ssm_state_indices, + num_accepted_tokens, + use_qk_l2norm_in_kernel, + ) + return o, final_state diff --git a/vllm/model_executor/layers/fla/ops/index.py b/vllm/model_executor/layers/fla/ops/index.py new file mode 100644 index 0000000000000..9eca32bc31a04 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/index.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
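+# [Editor's note, not part of the upstream header] index.py builds the
+# (sequence id, chunk id) bookkeeping used by the variable-length kernels.
+# Worked editorial example with cu_seqlens = [0, 5, 12] and chunk_size = 4:
+#     prepare_lens          -> [5, 7]
+#     prepare_chunk_indices -> [[0, 0], [0, 1], [1, 0], [1, 1]]
+#     prepare_chunk_offsets -> [0, 2, 4]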
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import torch + +from vllm.triton_utils import triton + +from .utils import tensor_cache + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return cu_seqlens[1:] - cu_seqlens[:-1] + + +@tensor_cache +def prepare_chunk_indices(cu_seqlens: torch.LongTensor, + chunk_size: int) -> torch.LongTensor: + indices = torch.cat([ + torch.arange(n) + for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist() + ]) + return torch.stack([indices.eq(0).cumsum(0) - 1, indices], + 1).to(cu_seqlens) + + +@tensor_cache +def prepare_chunk_offsets(cu_seqlens: torch.LongTensor, + chunk_size: int) -> torch.LongTensor: + return torch.cat([ + cu_seqlens.new_tensor([0]), + triton.cdiv(prepare_lens(cu_seqlens), chunk_size) + ]).cumsum(-1) diff --git a/vllm/model_executor/layers/fla/ops/l2norm.py b/vllm/model_executor/layers/fla/ops/l2norm.py new file mode 100644 index 0000000000000..b89c67871d071 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/l2norm.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import os +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +BT_LIST = [8, 16, 32, 64, 128] + +USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0")) + + +@triton.autotune(configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8, 16, 32] +], + key=['D']) +@triton.jit +def l2norm_fwd_kernel1( + x, + y, + D, + BD: tl.constexpr, + eps, +): + i_t = tl.program_id(0) + x += i_t * D + y += i_t * D + # Compute mean and variance + cols = tl.arange(0, BD) + mask = cols < D + b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=0) + b_rstd = 1 / tl.sqrt(b_var + eps) + # tl.store(Rstd + i_t, rstd) + # Normalize and apply linear transformation + b_y = b_x * b_rstd + tl.store(y + cols, b_y, mask=mask) + + +@triton.autotune(configs=[ + triton.Config({'BT': BT}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8, 16] for BT in BT_LIST +], + key=['D']) +@triton.jit(do_not_specialize=["NB"]) +def l2norm_fwd_kernel( + x, + y, + eps, + NB, + T, + D: tl.constexpr, + BT: tl.constexpr, + BD: tl.constexpr, +): + i_t = tl.program_id(0) + p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=1) + b_y = b_x / tl.sqrt(b_var + eps)[:, None] + p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit +def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr): + xoffset = tl.program_id(0) * MBLOCK + row_idx = xoffset + tl.arange(0, MBLOCK)[:, None] + xmask = row_idx < M + rindex = tl.arange(0, N)[None, :] + xs = tl.load(X + (rindex + N * row_idx), None).to(tl.float32) + square = tl.broadcast_to(xs * xs, [MBLOCK, N]) + square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None] + rsqrt = tl.rsqrt(square_sum 
+ eps) + tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask) + + +def l2norm_fwd(x: torch.Tensor, + eps: float = 1e-6, + output_dtype: Optional[torch.dtype] = None): + x_shape_og = x.shape + x = x.view(-1, x.shape[-1]) + # allocate output + if output_dtype is None: + y = torch.empty_like(x) + else: + y = torch.empty_like(x, dtype=output_dtype) + assert y.stride(-1) == 1 + T, D = x.shape[0], x.shape[-1] + # rstd = torch.empty((T,), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) + if D > BD: + raise RuntimeError("This layer doesn't support feature dim >= 64KB.") + + if not USE_DEFAULT_FLA_NORM: + MBLOCK = 32 + # M, N = x.shape + l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK), )]( + x, + y, + eps, + T, + D, + MBLOCK, + ) + else: + if D <= 512: + NB = triton.cdiv(T, 2048) + + def grid(meta): + return (triton.cdiv(T, meta['BT']), ) + + l2norm_fwd_kernel[grid]( + x, + y, + eps, + NB=NB, + T=T, + D=D, + BD=BD, + ) + else: + l2norm_fwd_kernel1[(T, )]( + x, + y, + eps=eps, + D=D, + BD=BD, + ) + + return y.view(x_shape_og) diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py new file mode 100644 index 0000000000000..a733c6c81e369 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py @@ -0,0 +1,337 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Tri Dao +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2024, Tri Dao. + +# ruff: noqa: E501 +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. + +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from vllm.triton_utils import tl, triton + +from .utils import input_guard + + +def rms_norm_ref(x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + upcast=True): + dtype = x.dtype + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * + weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + + eps) + out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) + + +@triton.heuristics({ + "HAS_BIAS": lambda args: args["B"] is not None, + "HAS_Z": lambda args: args["Z"] is not None, +}) +@triton.jit +def layer_norm_fwd_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_z_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + BLOCK_N: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. + row = tl.program_id(0) + group = tl.program_id(1) + X += row * stride_x_row + group * N + Y += row * stride_y_row + group * N + if HAS_Z: + Z += row * stride_z_row + group * N + if not IS_RMS_NORM: + Mean += group * M + Rstd += group * M + W += group * N + if HAS_BIAS: + B += group * N + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) + if HAS_Z and not NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=cols < N).to(tl.float32) + x *= z * tl.sigmoid(z) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w + b if HAS_BIAS else x_hat * w + if HAS_Z and NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask).to(tl.float32) + y *= z * tl.sigmoid(z) + # Write output + tl.store(Y + cols, y, mask=mask) + + +def layer_norm_fwd( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + z: torch.Tensor = None, + out: torch.Tensor = None, + group_size: int = None, + norm_before_gate: bool = True, + is_rms_norm: bool = False, +): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + assert weight.shape == (N, ) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N, ) + # allocate output + if out is not None: + assert out.shape == x.shape + else: + out = torch.empty_like(x) + assert out.stride(-1) == 1 + mean = torch.empty((ngroups * M, ), dtype=torch.float32, + device=x.device) if not is_rms_norm else None + rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + grid = (M, ngroups) + 
layer_norm_fwd_kernel[grid](x, + out, + weight, + bias, + z, + mean, + rstd, + x.stride(0), + out.stride(0), + z.stride(0) if z is not None else 0, + M, + group_size, + eps, + BLOCK_N=BLOCK_N, + NORM_BEFORE_GATE=norm_before_gate, + IS_RMS_NORM=is_rms_norm, + num_warps=num_warps) + return out, mean, rstd + + +class LayerNormFn(torch.autograd.Function): + + @input_guard + @staticmethod + def forward(ctx, + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if z is not None: + assert z.shape == x_shape_og + z = z.reshape(-1, z.shape[-1]) + if z.stride(-1) != 1: + z = z.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + y, mean, rstd = layer_norm_fwd( + x, + weight, + bias, + eps, + z=z, + group_size=group_size, + norm_before_gate=norm_before_gate, + is_rms_norm=is_rms_norm, + ) + ctx.save_for_backward(x, weight, bias, mean, rstd, z) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.group_size = group_size + ctx.norm_before_gate = norm_before_gate + ctx.is_rms_norm = is_rms_norm + return y.reshape(x_shape_og) + + +def layernorm_fn(x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False): + return LayerNormFn.apply(x, weight, bias, z, eps, group_size, + norm_before_gate, is_rms_norm) + + +def rmsnorm_fn(x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True): + return LayerNormFn.apply(x, weight, bias, z, eps, group_size, + norm_before_gate, True) + + +class LayerNormGated(nn.Module): + + def __init__( + self, + hidden_size, + eps: float = 1e-5, + group_size: Optional[int] = None, + norm_before_gate: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). + """ + + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + torch.nn.init.zeros_(self.bias) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + return layernorm_fn(x, + self.weight, + self.bias, + z=z, + group_size=self.group_size, + eps=self.eps, + norm_before_gate=self.norm_before_gate) + + +class RMSNormGated(nn.Module): + + def __init__( + self, + hidden_size, + eps: float = 1e-5, + group_size: Optional[int] = None, + norm_before_gate: bool = False, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). 
+ """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + return rmsnorm_fn(x, + self.weight, + self.bias, + z=z, + eps=self.eps, + group_size=self.group_size, + norm_before_gate=self.norm_before_gate) diff --git a/vllm/model_executor/layers/fla/ops/op.py b/vllm/model_executor/layers/fla/ops/op.py new file mode 100644 index 0000000000000..05c424b437f48 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/op.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import os + +from vllm.triton_utils import tl, tldevice, triton + +if os.environ.get('FLA_USE_FAST_OPS', '0') == '1': + div = tldevice.fast_dividef + exp = tldevice.fast_expf + log = tldevice.fast_logf + log2 = tldevice.fast_log2f +else: + + @triton.jit + def div_normal(x, y): + return x / y + + div = div_normal + exp = tl.exp + log = tl.log + log2 = tl.log2 + + +@triton.jit +def safe_exp(x): + return exp(tl.where(x <= 0, x, float('-inf'))) + + +if not hasattr(tl, 'gather'): + + @triton.jit + def gather(src, index, axis, _builder=None): + # This is a fallback implementation when tl.gather is not supported + # In order to pass triton compiler, there is no actual gather operation + return src +else: + gather = tl.gather diff --git a/vllm/model_executor/layers/fla/ops/solve_tril.py b/vllm/model_executor/layers/fla/ops/solve_tril.py new file mode 100644 index 0000000000000..97cb0d800d411 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/solve_tril.py @@ -0,0 +1,365 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .utils import input_guard + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5] + ], + key=['BT'], +) +@triton.jit(do_not_specialize=['T']) +def solve_tril_16x16_kernel( + A, + Ad, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + BT: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + A = A + (bos * H + i_h) * BT + Ad = Ad + (bos * H + i_h) * 16 + + offset = (i_t * 16) % BT + p_A = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * 16, offset), + (16, 16), (1, 0)) + p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), + (1, 0)) + b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32) + b_A = -tl.where( + tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0) + + o_i = tl.arange(0, 16) + for i in range(1, min(16, T - i_t * 16)): + b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset) + b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) + mask = o_i == i + b_A = tl.where(mask[:, None], b_a, b_A) + b_A += o_i[:, None] == o_i[None, :] + tl.store(p_Ai, + b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5] + ], + key=['H', 'BT', 'IS_VARLEN'], +) +@triton.jit(do_not_specialize=['T']) +def merge_16x16_to_32x32_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices, + T, H: tl.constexpr, BT: tl.constexpr, + IS_VARLEN: tl.constexpr): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + A += (bos * H + i_h) * 32 + Ad += (bos * H + i_h) * 16 + Ai += (bos * H + i_h) * 32 + + p_A_21 = tl.make_block_ptr(A, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), + (16, 16), (1, 0)) + p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32, 0), + (16, 16), (1, 0)) + p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32 + 16, 0), + (16, 16), (1, 0)) + p_Ai_11 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32, 0), + (16, 16), (1, 0)) + p_Ai_22 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 16), + (16, 16), (1, 0)) + p_Ai_21 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), + (16, 16), (1, 0)) + + A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32) + Ai_11 = tl.load(p_Ad_11, 
boundary_check=(0, 1)).to(tl.float32) + Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32) + Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'), + Ai_11, + input_precision='ieee') + tl.store(p_Ai_11, + Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_22, + Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_21, + Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] for num_stages in [2, 3, 4, 5] + ], + key=['H', 'BT', 'IS_VARLEN'], +) +@triton.jit(do_not_specialize=['T']) +def merge_16x16_to_64x64_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices, + T, H: tl.constexpr, BT: tl.constexpr, + IS_VARLEN: tl.constexpr): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + A += (bos * H + i_h) * 64 + Ad += (bos * H + i_h) * 16 + Ai += (bos * H + i_h) * 64 + + p_A_21 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), + (16, 16), (1, 0)) + p_A_32 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), + (16, 16), (1, 0)) + p_A_31 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), + (16, 16), (1, 0)) + p_A_43 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), + (16, 16), (1, 0)) + p_A_42 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), + (16, 16), (1, 0)) + p_A_41 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), + (16, 16), (1, 0)) + p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64, 0), + (16, 16), (1, 0)) + p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 16, 0), + (16, 16), (1, 0)) + p_Ad_33 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 32, 0), + (16, 16), (1, 0)) + p_Ad_44 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 48, 0), + (16, 16), (1, 0)) + + A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32) + A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32) + A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32) + A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32) + A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32) + A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32) + + Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32) + Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32) + Ai_33 = tl.load(p_Ad_33, boundary_check=(0, 1)).to(tl.float32) + Ai_44 = tl.load(p_Ad_44, boundary_check=(0, 1)).to(tl.float32) + + Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'), + Ai_11, + input_precision='ieee') + Ai_32 = -tl.dot(tl.dot(Ai_33, A_32, input_precision='ieee'), + Ai_22, + input_precision='ieee') + Ai_43 = -tl.dot(tl.dot(Ai_44, A_43, input_precision='ieee'), + Ai_33, + input_precision='ieee') + + Ai_31 = -tl.dot(Ai_33, + tl.dot(A_31, Ai_11, input_precision='ieee') + + tl.dot(A_32, Ai_21, input_precision='ieee'), + input_precision='ieee') + Ai_42 = -tl.dot(Ai_44, + tl.dot(A_42, Ai_22, 
input_precision='ieee') + + tl.dot(A_43, Ai_32, input_precision='ieee'), + input_precision='ieee') + Ai_41 = -tl.dot(Ai_44, + tl.dot(A_41, Ai_11, input_precision='ieee') + + tl.dot(A_42, Ai_21, input_precision='ieee') + + tl.dot(A_43, Ai_31, input_precision='ieee'), + input_precision='ieee') + + p_Ai_11 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 0), + (16, 16), (1, 0)) + p_Ai_22 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 16), + (16, 16), (1, 0)) + p_Ai_33 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 32), + (16, 16), (1, 0)) + p_Ai_44 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 48), + (16, 16), (1, 0)) + p_Ai_21 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), + (16, 16), (1, 0)) + p_Ai_31 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), + (16, 16), (1, 0)) + p_Ai_32 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), + (16, 16), (1, 0)) + p_Ai_41 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), + (16, 16), (1, 0)) + p_Ai_42 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), + (16, 16), (1, 0)) + p_Ai_43 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), + (16, 16), (1, 0)) + tl.store(p_Ai_11, + Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_22, + Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_33, + Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_44, + Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_21, + Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_31, + Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_32, + Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_41, + Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_42, + Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_43, + Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + + fill_zeros = tl.zeros((16, 16), dtype=tl.float32) + p_Ai_12 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 16), + (16, 16), (1, 0)) + p_Ai_13 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 32), + (16, 16), (1, 0)) + p_Ai_14 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 48), + (16, 16), (1, 0)) + p_Ai_23 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 32), + (16, 16), (1, 0)) + p_Ai_24 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 48), + (16, 16), (1, 0)) + p_Ai_34 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 48), + (16, 16), (1, 0)) + tl.store(p_Ai_12, + fill_zeros.to(p_Ai_12.dtype.element_ty, + fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_13, + fill_zeros.to(p_Ai_13.dtype.element_ty, + fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_14, + fill_zeros.to(p_Ai_14.dtype.element_ty, + fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_23, + fill_zeros.to(p_Ai_23.dtype.element_ty, + fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_24, + fill_zeros.to(p_Ai_24.dtype.element_ty, + 
fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + tl.store(p_Ai_34, + fill_zeros.to(p_Ai_34.dtype.element_ty, + fp_downcast_rounding="rtne"), + boundary_check=(0, 1)) + + +@input_guard +def solve_tril(A: torch.Tensor, + cu_seqlens: Optional[torch.Tensor] = None, + output_dtype: torch.dtype = torch.float) -> torch.Tensor: + """ + Compute the inverse of the lower triangular matrix + A should be strictly lower triangular, i.e., A.triu() == 0. + + Args: + A (torch.Tensor): + [B, T, H, K] + cu_seqlens (torch.Tensor): + The cumulative sequence lengths of the input tensor. + Default: None. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float` + + Returns: + (I + A)^-1 with the same shape as A + """ + assert A.shape[-1] in [16, 32, 64] + + B, T, H, BT = A.shape + Ad = torch.empty(B, + T, + H, + 16, + device=A.device, + dtype=torch.float if BT != 16 else output_dtype) + + chunk_indices = prepare_chunk_indices( + cu_seqlens, 16) if cu_seqlens is not None else None + NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16) + solve_tril_16x16_kernel[NT, B * H]( + A=A, + Ad=Ad, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + BT=BT, + ) + if BT == 16: + return Ad + + Ai = torch.empty(B, T, H, BT, device=A.device, dtype=output_dtype) + merge_fn = merge_16x16_to_32x32_inverse_kernel if BT == 32 else merge_16x16_to_64x64_inverse_kernel + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT) + merge_fn[NT, B * H]( + A=A, + Ad=Ad, + Ai=Ai, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + BT=BT, + ) + return Ai diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py new file mode 100644 index 0000000000000..7fd90cee45d0e --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/utils.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import contextlib +import functools +import logging +import os +from enum import Enum +from typing import Any, Callable, Literal, Optional + +import torch + +from vllm.triton_utils import triton + +logger = logging.getLogger(__name__) + +COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1" +FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1" +FLA_GDN_FIX_BT = os.getenv("FLA_GDN_FIX_BT", "0") == "1" + +SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0")) + + +def tensor_cache( + fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent results of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed. + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. 
+ """ + + cache_entries: tuple[Optional[tuple], Optional[dict], Any] = [] + cache_size = 4 + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal cache_entries, cache_size + for i, entry in enumerate(cache_entries): + last_args, last_kwargs, last_result = entry + if len(args) == len(last_args) and len(kwargs) == len(last_kwargs) \ + and all(a is b for a, b in zip(args, last_args)) \ + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()): + cache_entries = cache_entries[:i] + cache_entries[i + 1:] + [ + (args, kwargs, last_result) + ] + return last_result + + result = fn(*args, **kwargs) + + if len(cache_entries) >= cache_size: + cache_entries = cache_entries[1:] + cache_entries.append((args, kwargs, result)) + return result + + return wrapper + + +def input_guard( + fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: + """ + A decorator to make sure all input tensors are contiguous and set the device based on input tensors. + """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + contiguous_args = (i if not isinstance(i, torch.Tensor) else + i.contiguous() for i in args) + contiguous_kwargs = { + k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) + for k, v in kwargs.items() + } + + tensor = None + for arg in args: + if isinstance(arg, torch.Tensor): + tensor = arg + break + if tensor is None: + for value in kwargs.values(): + if isinstance(value, torch.Tensor): + tensor = value + break + + if tensor is not None: + ctx = torch.cuda.device(tensor.device.index) + else: + ctx = contextlib.nullcontext() + + with ctx: + return fn(*contiguous_args, **contiguous_kwargs) + + return wrapper + + +@functools.cache +def get_available_device() -> str: + try: + return triton.runtime.driver.active.get_current_target().backend + except BaseException: + return 'cpu' + + +@functools.cache +def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']: + device = get_available_device() + mapping = { + "cuda": "nvidia", + "hip": "amd", + "xpu": "intel", + } + # return the mapped value, or the original if not found + return mapping.get(device, device) + + +# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'. +# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs. +# Therefore, we need to check the triton backend to determine the actual GPU vendor. 
+device = get_available_device() if get_available_device() != 'hip' else 'cuda' +device_torch_lib = getattr(torch, device) +device_platform = _check_platform() + +is_amd = (device_platform == 'amd') +is_intel = (device_platform == 'intel') +is_nvidia = (device_platform == 'nvidia') +is_intel_alchemist = (is_intel + and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0)) +is_nvidia_hopper = (is_nvidia + and ('NVIDIA H' in torch.cuda.get_device_name(0) + or torch.cuda.get_device_capability()[0] >= 9)) +use_cuda_graph = (is_nvidia + and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1') + + +def get_all_max_shared_mem(): + try: + return [ + triton.runtime.driver.active.utils.get_device_properties(i) + ['max_shared_mem'] for i in range(device_torch_lib.device_count()) + ] + except BaseException: + return [-1] + + +class Backend(Enum): + ADA = 101376 # RTX 4090 + AMPERE = 166912 # A100 + HOPPER = 232448 # H100 + DEFAULT = 102400 # Default + + @classmethod + def get_shared_memory(cls, arch: str) -> int: + try: + return cls[arch.upper()].value + except KeyError: + return cls.DEFAULT.value + + +@functools.cache +def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool: + try: + device_shared_mem_list = get_all_max_shared_mem() + max_shared_memory = device_shared_mem_list[tensor_idx] + return max_shared_memory >= Backend.get_shared_memory(arch) + except Exception: + return False diff --git a/vllm/model_executor/layers/fla/ops/wy_fast.py b/vllm/model_executor/layers/fla/ops/wy_fast.py new file mode 100644 index 0000000000000..70374eb650642 --- /dev/null +++ b/vllm/model_executor/layers/fla/ops/wy_fast.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +# ruff: noqa: E501 +from typing import Optional + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices + + +@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'IS_VARLEN'], +) +@triton.jit(do_not_specialize=['T']) +def recompute_w_u_fwd_kernel(k, v, beta, w, u, A, g, cu_seqlens, chunk_indices, + T, H: tl.constexpr, Hg: tl.constexpr, + K: tl.constexpr, V: tl.constexpr, + BT: tl.constexpr, BK: tl.constexpr, + BV: tl.constexpr, IS_VARLEN: tl.constexpr): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to( + tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to( + tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ), + (i_t * BT, ), (BT, ), (0, )) + p_g = tl.make_block_ptr(g + (bos * H + i_h), (T, ), (H, ), (i_t * BT, ), + (BT, ), (0, )) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), + (i_t * BT, 0), (BT, BT), (1, 0)) + b_beta = tl.load(p_beta, boundary_check=(0, )) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_g = tl.exp(tl.load(p_g, boundary_check=(0, ))) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), + (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_u = tl.make_block_ptr(u + (bos * H + i_h) * V, (T, V), (H * V, 1), + (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_beta[:, None]).to(b_v.dtype) + b_u = tl.dot(b_A, b_vb, allow_tf32=False) + tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1)) + + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K), + (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), + (1, 0)) + p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H * K, 1), + (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype) + b_w = tl.dot(b_A, b_kb) + tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1)) + + +def recompute_w_u_fwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + g_cumsum: torch.Tensor, + A: torch.Tensor, + cu_seqlens: Optional[torch.LongTensor], +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, Hg, K, V = *k.shape, v.shape[-1] + H = v.shape[-2] + BT = A.shape[-1] + + chunk_indices = prepare_chunk_indices( + cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + BK = 64 + BV = 64 + u = torch.empty_like(v) + w = k.new_empty(B, T, H, K) + recompute_w_u_fwd_kernel[(NT, B * H)]( + k=k, + v=v, + beta=beta, + w=w, + u=u, + A=A, + g=g_cumsum, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + ) + return w, u diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 0fcf5d15afd1d..828536e6408b1 100644 --- 
a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -7,8 +7,10 @@ from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, if HAS_TRITON: import triton import triton.language as tl + import triton.language.extra.libdevice as tldevice else: triton = TritonPlaceholder() tl = TritonLanguagePlaceholder() + tldevice = TritonLanguagePlaceholder() -__all__ = ["HAS_TRITON", "triton", "tl"] +__all__ = ["HAS_TRITON", "triton", "tl", "tldevice"] From b9a1c4c8a2430555f1959938b87b10640c751e57 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 9 Sep 2025 12:21:56 -0400 Subject: [PATCH 076/584] [ROCm][CI/Build] Sync ROCm dockerfiles with the ROCm fork (#24279) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- docker/Dockerfile.rocm | 4 ++- docker/Dockerfile.rocm_base | 62 ++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index f164857325043..ff8ad1607e34a 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -47,6 +47,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite # ----------------------- @@ -71,7 +72,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace RUN cd /vllm-workspace \ && rm -rf vllm \ && python3 -m pip install -e tests/vllm_test_utils \ - && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \ + && python3 -m pip install lm-eval[api]==0.4.4 \ && python3 -m pip install pytest-shard # ----------------------- @@ -100,6 +101,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 3414c0aa845cb..2ba5461dfe551 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,18 +1,16 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="db8e93b4" -ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete +ARG HIPBLASLT_BRANCH="aa0bda7b" +ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" ARG LEGACY_HIPBLASLT_OPTION= -ARG RCCL_BRANCH="648a58d" -ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="295f2ed4" +ARG PYTORCH_BRANCH="f717b2af" ARG PYTORCH_VISION_BRANCH="v0.21.0" -ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="916bf3c" +ARG AITER_BRANCH="4822e675" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -45,7 +43,7 @@ RUN 
apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython +RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython FROM base AS build_hipblaslt ARG HIPBLASLT_BRANCH @@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH # Set to "--legacy_hipblas_direct" for ROCm<=6.2 ARG LEGACY_HIPBLASLT_OPTION RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y RUN cd hipBLAS-common \ && git checkout ${HIPBLAS_COMMON_BRANCH} \ && mkdir build \ @@ -69,24 +68,17 @@ RUN cd hipBLASLt \ && make package RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install -FROM base AS build_rccl -ARG RCCL_BRANCH -ARG RCCL_REPO -RUN git clone ${RCCL_REPO} -RUN cd rccl \ - && git checkout ${RCCL_BRANCH} \ - && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} -RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install - FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO RUN git clone ${TRITON_REPO} RUN cd triton \ && git checkout ${TRITON_BRANCH} \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=dist -RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + && if [ ! -f setup.py ]; then cd python; fi \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && mkdir -p /app/install && cp dist/*.whl /app/install +RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \ + && python3 -m build --wheel && cp dist/*.whl /app/install; fi FROM base AS build_amdsmi RUN cd /opt/rocm/share/amd_smi \ @@ -132,15 +124,25 @@ RUN cd aiter \ RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install +FROM base AS debs +RUN mkdir /app/debs +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + cp /install/*.deb /app/debs +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs + FROM base AS final RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ dpkg -i /install/*deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status -RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ - && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status + && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN 
--mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -154,8 +156,6 @@ ARG BASE_IMAGE ARG HIPBLAS_COMMON_BRANCH ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION -ARG RCCL_BRANCH -ARG RCCL_REPO ARG TRITON_BRANCH ARG TRITON_REPO ARG PYTORCH_BRANCH @@ -170,8 +170,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ - && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ - && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ @@ -180,4 +178,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file From 9ad0688e436a41e386fa49e81ee344cb59f7d23c Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 10 Sep 2025 01:37:25 +0800 Subject: [PATCH 077/584] [Bugfix] Fix hidden_size for multimodal classification model (#24501) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/adapters.py | 5 +++-- vllm/model_executor/models/utils.py | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index c189208fa075b..78ad9a433e314 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -255,7 +255,7 @@ def as_seq_cls_model(cls: _T) -> _T: from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.sequence import IntermediateTensors - from .utils import maybe_prefix + from .utils import get_model_hidden_size, maybe_prefix class ModelForSequenceClassification(_create_pooling_model_cls(cls), SupportsCrossEncoding): @@ -263,9 +263,10 @@ def as_seq_cls_model(cls: _T) -> _T: def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + hidden_size = get_model_hidden_size(config) self.score = ReplicatedLinear( - config.hidden_size, + hidden_size, config.num_labels, bias=False, params_dtype=torch.float32, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 28cfefac30ddb..e716ec582baab 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -761,3 +761,10 @@ def fast_topk(values: torch.Tensor, topk: int, else: # Use topk for efficiency with larger k values return torch.topk(values, topk, dim=dim) + + +def get_model_hidden_size(hf_config: PretrainedConfig) -> int: + if hasattr(hf_config, "hidden_size"): + return hf_config.hidden_size + text_config = hf_config.get_text_config() + return text_config.hidden_size From 15cb047e25e58e79273567d3448563cf43bb39e7 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Tue, 9 Sep 2025 10:46:46 -0700 Subject: [PATCH 078/584] Extend renderer with embedding support and integrate 
completion endpoint (#24405) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- .../openai/test_prompt_validation.py | 14 +- tests/entrypoints/test_renderer.py | 133 +++++++++ .../v1/entrypoints/openai/test_completion.py | 2 +- vllm/entrypoints/openai/protocol.py | 15 +- vllm/entrypoints/openai/serving_completion.py | 53 ++-- vllm/entrypoints/openai/serving_embedding.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 217 +------------- vllm/entrypoints/renderer.py | 267 +++++++++++++++--- vllm/inputs/data.py | 3 + 9 files changed, 410 insertions(+), 309 deletions(-) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 4197583074dfe..bfa3f983cd87e 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -10,7 +10,7 @@ import pytest import regex as re import torch -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.renderer import BaseRenderer from ...utils import RemoteOpenAIServer @@ -27,12 +27,16 @@ async def test_empty_prompt(): with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - with pytest.raises(openai.BadRequestError, - match="decoder prompt cannot be empty"): + with pytest.raises( + openai.BadRequestError, + match= + "Either prompt or prompt_embeds must be provided and non-empty." + ): await client.completions.create(model=model_name, prompt="", max_tokens=5, - temperature=0.0) + temperature=0.0, + extra_body={"prompt_embeds": []}) @pytest.mark.asyncio @@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, buffer.seek(0) encoded_tensor = pybase64.b64encode(buffer.getvalue()) - loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) + loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor) assert len(loaded_prompt_embeds) == 1 loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] assert loaded_tensor.device.type == "cpu" diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py index 1d80ea6cb4917..d3caee4b8cec9 100644 --- a/tests/entrypoints/test_renderer.py +++ b/tests/entrypoints/test_renderer.py @@ -1,13 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io from dataclasses import dataclass from typing import Optional from unittest.mock import AsyncMock, MagicMock +import pybase64 import pytest +import torch from vllm.entrypoints.renderer import CompletionRenderer +from vllm.inputs.data import is_embeds_prompt @dataclass @@ -178,3 +182,132 @@ class TestRenderPrompt: with pytest.raises(ValueError, match="No tokenizer available"): await renderer_no_tokenizer.render_prompt( prompt_or_prompts="Hello world", max_length=100) + + @pytest.mark.asyncio + async def test_token_input_with_needs_detokenization( + self, renderer, mock_async_tokenizer): + # When needs_detokenization=True for token inputs, renderer should + # use the async tokenizer to decode and include the original text + # in the returned prompt object. 
+ mock_async_tokenizer.decode = AsyncMock(return_value="decoded text") + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + tokens = [1, 2, 3, 4] + results = await renderer.render_prompt( + prompt_or_prompts=tokens, + needs_detokenization=True, + ) + + assert len(results) == 1 + assert results[0]["prompt_token_ids"] == tokens + assert results[0]["prompt"] == "decoded text" + mock_async_tokenizer.decode.assert_awaited_once() + + +class TestRenderEmbedPrompt: + + def _create_test_embed_bytes(self, tensor: torch.Tensor) -> bytes: + """Helper to create base64-encoded tensor bytes""" + buffer = io.BytesIO() + torch.save(tensor, buffer) + buffer.seek(0) + return pybase64.b64encode(buffer.read()) + + @pytest.mark.asyncio + async def test_single_prompt_embed(self, renderer): + # Create a test tensor + test_tensor = torch.randn(10, 768, dtype=torch.float32) + embed_bytes = self._create_test_embed_bytes(test_tensor) + + results = await renderer.render_prompt_and_embeds( + prompt_embeds=embed_bytes, cache_salt="test_salt") + + assert len(results) == 1 + assert is_embeds_prompt(results[0]) + assert torch.allclose(results[0]["prompt_embeds"], test_tensor) + assert results[0]["cache_salt"] == "test_salt" + + @pytest.mark.asyncio + async def test_multiple_prompt_embeds(self, renderer): + # Create multiple test tensors + test_tensors = [ + torch.randn(8, 512, dtype=torch.float32), + torch.randn(12, 512, dtype=torch.float32), + ] + embed_bytes_list = [ + self._create_test_embed_bytes(t) for t in test_tensors + ] + + results = await renderer.render_prompt_and_embeds( + prompt_embeds=embed_bytes_list) + + assert len(results) == 2 + for i, result in enumerate(results): + assert is_embeds_prompt(result) + assert torch.allclose(result["prompt_embeds"], test_tensors[i]) + + @pytest.mark.asyncio + async def test_prompt_embed_truncation(self, renderer): + # Create tensor with more tokens than truncation limit + test_tensor = torch.randn(20, 768, dtype=torch.float32) + embed_bytes = self._create_test_embed_bytes(test_tensor) + + results = await renderer.render_prompt_and_embeds( + prompt_embeds=embed_bytes, truncate_prompt_tokens=10) + + assert len(results) == 1 + # Should keep last 10 tokens + expected = test_tensor[-10:] + assert torch.allclose(results[0]["prompt_embeds"], expected) + + @pytest.mark.asyncio + async def test_prompt_embed_different_dtypes(self, renderer): + # Test different supported dtypes + dtypes = [torch.float32, torch.float16, torch.bfloat16] + + for dtype in dtypes: + test_tensor = torch.randn(5, 256, dtype=dtype) + embed_bytes = self._create_test_embed_bytes(test_tensor) + + results = await renderer.render_prompt_and_embeds( + prompt_embeds=embed_bytes) + + assert len(results) == 1 + assert results[0]["prompt_embeds"].dtype == dtype + + @pytest.mark.asyncio + async def test_prompt_embed_squeeze_batch_dim(self, renderer): + # Test tensor with batch dimension gets squeezed + test_tensor = torch.randn(1, 10, 768, dtype=torch.float32) + embed_bytes = self._create_test_embed_bytes(test_tensor) + + results = await renderer.render_prompt_and_embeds( + prompt_embeds=embed_bytes) + + assert len(results) == 1 + # Should be squeezed to 2D + assert results[0]["prompt_embeds"].shape == (10, 768) + + @pytest.mark.asyncio + async def test_both_prompts_and_embeds(self, renderer, + mock_async_tokenizer): + # Set up text tokenization + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 102, 103]) + renderer.async_tokenizer_pool[ + renderer.tokenizer] = 
mock_async_tokenizer + + # Create embed + test_tensor = torch.randn(5, 256, dtype=torch.float32) + embed_bytes = self._create_test_embed_bytes(test_tensor) + + results = await renderer.render_prompt_and_embeds( + prompt_or_prompts="Hello world", prompt_embeds=embed_bytes) + + assert len(results) == 2 + # First should be embed prompt + assert is_embeds_prompt(results[0]) + # Second should be tokens prompt + assert "prompt_token_ids" in results[1] + assert results[1]["prompt_token_ids"] == [101, 102, 103] diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3a65583fab8d3..3114d7639f045 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -686,7 +686,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): async def test_completion_with_empty_prompt_embeds( client: openai.AsyncOpenAI) -> None: """Test completion with empty prompt embeds.""" - payload: dict[str, list] = {"prompt_embeds": []} + payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []} headers: dict[str, str] = {"Content-Type": "application/json"} # base_url = http://localhost:8000/v1/completions response = requests.post(f"{client.base_url}completions", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index c56c68cf76442..c8ecbd28e7db5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1270,9 +1270,20 @@ class CompletionRequest(OpenAIBaseModel): @model_validator(mode="before") @classmethod def validate_prompt_and_prompt_embeds(cls, data): - if data.get("prompt") is None and data.get("prompt_embeds") is None: + prompt = data.get("prompt") + prompt_embeds = data.get("prompt_embeds") + + prompt_is_empty = (prompt is None + or (isinstance(prompt, str) and prompt == "")) + embeds_is_empty = (prompt_embeds is None + or (isinstance(prompt_embeds, list) + and len(prompt_embeds) == 0)) + + if prompt_is_empty and embeds_is_empty: raise ValueError( - "At least one of `prompt` or `prompt_embeds` must be set.") + "Either prompt or prompt_embeds must be provided and non-empty." 
+ ) + return data @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b26140d4b9d7a..e3e945e22cd4e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -26,12 +26,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs, PromptTokenUsageInfo, RequestResponseMetadata, UsageInfo) -from vllm.entrypoints.openai.serving_engine import ( - EmbedsPrompt as ServingEngineEmbedsPrompt) from vllm.entrypoints.openai.serving_engine import (OpenAIServing, - TextTokensPrompt, - clamp_prompt_logprobs, - is_text_tokens_prompt) + clamp_prompt_logprobs) # yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.utils import get_max_tokens @@ -132,12 +128,19 @@ class OpenAIServingCompletion(OpenAIServing): else: tokenizer = await self.engine_client.get_tokenizer(lora_request ) + renderer = self._get_renderer(tokenizer) + max_input_tokens_len = self.max_model_len - (request.max_tokens + or 0) - request_prompts, engine_prompts = await self._preprocess_completion( - request, - tokenizer, - request.prompt, + engine_prompts = await renderer.render_prompt_and_embeds( + prompt_or_prompts=request.prompt, + prompt_embeds=request.prompt_embeds, + max_length=max_input_tokens_len, + truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, + cache_salt=request.cache_salt, + needs_detokenization=bool(request.echo + and not request.return_token_ids), ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") @@ -198,7 +201,7 @@ class OpenAIServingCompletion(OpenAIServing): self._log_inputs( request_id_item, - request_prompts[i], + engine_prompt, params=sampling_params, lora_request=lora_request, ) @@ -249,7 +252,7 @@ class OpenAIServingCompletion(OpenAIServing): if stream: return self.completion_stream_generator( request, - request_prompts, + engine_prompts, result_generator, request_id, created_time, @@ -273,11 +276,9 @@ class OpenAIServingCompletion(OpenAIServing): # We did not pass it into vLLM engine to avoid being redundant # with the inputs token IDs if final_res.prompt is None: - request_prompt = request_prompts[i] - if is_text_tokens_prompt(request_prompt): - final_res.prompt = request_prompt["prompt"] - else: - final_res.prompt = None + engine_prompt = engine_prompts[i] + final_res.prompt = None if is_embeds_prompt( + engine_prompt) else engine_prompt.get("prompt") final_res_batch_checked = cast(list[RequestOutput], final_res_batch) @@ -313,8 +314,7 @@ class OpenAIServingCompletion(OpenAIServing): async def completion_stream_generator( self, request: CompletionRequest, - request_prompts: list[Union[TextTokensPrompt, - ServingEngineEmbedsPrompt]], + engine_prompts: list[Union[TokensPrompt, EmbedsPrompt]], result_generator: AsyncIterator[tuple[int, RequestOutput]], request_id: str, created_time: int, @@ -350,14 +350,11 @@ class OpenAIServingCompletion(OpenAIServing): num_cached_tokens = res.num_cached_tokens first_iteration = False - if res.prompt is not None: - prompt_text = res.prompt - else: - request_prompt = request_prompts[prompt_idx] - if is_text_tokens_prompt(request_prompt): - prompt_text = request_prompt["prompt"] - else: - prompt_text = None + prompt_text = res.prompt + if prompt_text is None: + engine_prompt = engine_prompts[prompt_idx] + prompt_text = None if is_embeds_prompt( + engine_prompt) else engine_prompt.get("prompt") 
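As context for the `prompt_embeds` handling above: clients send a torch-saved tensor, base64-encoded, which the server decodes in the renderer (see `load_prompt_embeds` later in this patch). A sketch of the client-side encoding, mirroring the test helper `_create_test_embed_bytes`; the model name and request shape here are illustrative, not prescribed by the patch:

import io

import pybase64
import torch

# Shape [num_tokens, hidden_size]; dtype must be float32/float16/bfloat16,
# matching the checks in BaseRenderer.load_prompt_embeds.
embeds = torch.randn(10, 768, dtype=torch.float32)

buf = io.BytesIO()
torch.save(embeds, buf)
payload = pybase64.b64encode(buf.getvalue()).decode("utf-8")

# Hypothetical body for a /v1/completions request; after this patch the
# server rejects requests where both prompt and prompt_embeds are empty.
request_body = {
    "model": "my-model",  # placeholder model name
    "prompt_embeds": payload,
    "max_tokens": 5,
}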
# Prompt details are excluded from later streamed outputs if prompt_token_ids is not None: @@ -378,6 +375,8 @@ class OpenAIServingCompletion(OpenAIServing): assert request.max_tokens is not None if request.echo and not has_echoed[i]: assert prompt_token_ids is not None + if request.return_token_ids: + prompt_text = "" assert prompt_text is not None if request.max_tokens == 0: # only return the prompt @@ -525,6 +524,8 @@ class OpenAIServingCompletion(OpenAIServing): for output in final_res.outputs: assert request.max_tokens is not None if request.echo: + if request.return_token_ids: + prompt_text = "" assert prompt_text is not None if request.max_tokens == 0: token_ids = prompt_token_ids diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index c375f9e7c5064..8acd4a1e01fb1 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -28,7 +28,6 @@ from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, TextTokensPrompt) # yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, @@ -290,7 +289,7 @@ class EmbeddingMixin(OpenAIServing): async def _create_single_prompt_generator( self, ctx: EmbeddingServeContext, - engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt: EngineTokensPrompt, pooling_params: PoolingParams, trace_headers: Optional[Mapping[str, str]], prompt_index: int, @@ -303,12 +302,6 @@ class EmbeddingMixin(OpenAIServing): params=pooling_params, lora_request=ctx.lora_request) - # Mypy has an existing bug related to inferring the variance - # of TypedDicts with `builtins.enumerate`: - # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 - engine_prompt = cast(Union[EngineTokensPrompt, EngineEmbedsPrompt], - engine_prompt) - # Return the original generator without wrapping return self.engine_client.encode( engine_prompt, @@ -375,12 +368,8 @@ class EmbeddingMixin(OpenAIServing): continue # Normal processing for short prompts or non-token prompts - # Cast engine_prompt to the expected type for mypy - engine_prompt_typed = cast( - Union[EngineTokensPrompt, EngineEmbedsPrompt], - engine_prompt) generator = await self._create_single_prompt_generator( - ctx, engine_prompt_typed, pooling_params, trace_headers, i) + ctx, engine_prompt, pooling_params, trace_headers, i) generators.append(generator) from vllm.utils import merge_async_iterators diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d6e8d93a57e14..23d9b706ee628 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import io import json import sys import time @@ -9,10 +7,8 @@ import traceback from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence from concurrent.futures import ThreadPoolExecutor from http import HTTPStatus -from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional, - TypeVar, Union, cast, overload) +from typing import Any, Callable, ClassVar, Generic, Optional, TypeVar, Union -import pybase64 import torch from fastapi import 
Request from pydantic import BaseModel, ConfigDict, Field @@ -64,10 +60,8 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer # yapf: enable -from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import PromptType from vllm.inputs.data import TokensPrompt as EngineTokensPrompt -from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.logprobs import Logprob, PromptLogprobs from vllm.lora.request import LoRARequest @@ -149,8 +143,7 @@ class RequestProcessingMixin(BaseModel): """ request_prompts: Optional[Sequence[RequestPrompt]] = [] - engine_prompts: Optional[Union[list[EngineTokensPrompt], - list[EngineEmbedsPrompt]]] = [] + engine_prompts: Optional[list[EngineTokensPrompt]] = [] model_config = ConfigDict(arbitrary_types_allowed=True) @@ -368,13 +361,6 @@ class OpenAIServing: for i, engine_prompt in enumerate(ctx.engine_prompts): request_id_item = f"{ctx.request_id}-{i}" - # Mypy has an existing bug related to inferring the variance of - # TypedDicts with `builtins.enumerate`: - # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 - engine_prompt = cast( - Union[EngineTokensPrompt, EngineEmbedsPrompt], - engine_prompt) - self._log_inputs( request_id_item, engine_prompt, @@ -737,170 +723,6 @@ class OpenAIServing: tokenizer=tokenizer, ) - async def _tokenize_prompt_input_or_inputs_async( - self, - request: AnyRequest, - tokenizer: Optional[AnyTokenizer], - input_or_inputs: Optional[Union[str, list[str], list[int], - list[list[int]]]], - add_special_tokens: bool = True, - ) -> tuple[list[TextTokensPrompt], list[EmbedsPrompt]]: - """ - Tokenize/detokenize depending on the input format. - - According to `OpenAI API <https://platform.openai.com/docs/api-reference/embeddings/create>`_ - , each input can be a string or array of tokens. Note that each request - can pass one or more inputs. 
- """ - inputs_embeds = list[EmbedsPrompt]() - inputs_text = list[TextTokensPrompt]() - - truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", - None) - - if (truncate_prompt_tokens or 0) < 0: - truncate_prompt_tokens = self.max_model_len - - if (isinstance(request, CompletionRequest) - and request.prompt_embeds is not None): - inputs_embeds.extend( - self._load_prompt_embeds(request.prompt_embeds, - truncate_prompt_tokens)) - - # Empty prompts are okay as long as there are prompt embeddings - if input_or_inputs is None or (inputs_embeds - and input_or_inputs == ""): - return [], inputs_embeds - - # Although our type checking is based on mypy, - # VSCode Pyright extension should still work properly - # "is False" is required for Pyright to perform type narrowing - # See: https://github.com/microsoft/pyright/issues/7672 - - # Parse and batch the input prompts - batch_inputs = parse_and_batch_prompt(input_or_inputs) - - # Process each input in the batch concurrently - tasks = [] - for prompt_input in batch_inputs: - if prompt_input["is_tokens"] is False: - assert tokenizer is not None, ( - "Tokenizer is required for text prompts") - task = self._normalize_prompt_text_to_input( - request, - prompt_input["content"], - tokenizer=tokenizer, - add_special_tokens=add_special_tokens, - ) - else: - task = self._normalize_prompt_tokens_to_input( - request, prompt_input["content"], tokenizer=tokenizer) - tasks.append(task) - - # Wait for all tokenization tasks to complete - results = await asyncio.gather(*tasks) - inputs_text.extend(results) - - return inputs_text, inputs_embeds - - @overload - async def _preprocess_completion( - self, - request: Union[ - DetokenizeRequest, - EmbeddingCompletionRequest, - RerankRequest, - ClassificationRequest, - ScoreRequest, - TokenizeCompletionRequest, - ], - tokenizer: Optional[AnyTokenizer], - input_or_inputs: Union[str, list[str], list[int], list[list[int]]], - add_special_tokens: bool = ..., - ) -> tuple[list[TextTokensPrompt], list[EngineTokensPrompt]]: - ... - - @overload - async def _preprocess_completion( - self, - request: CompletionRequest, - tokenizer: Optional[AnyTokenizer], - input_or_inputs: Optional[Union[str, list[str], list[int], - list[list[int]]]], - add_special_tokens: bool = ..., - ) -> tuple[ - list[Union[TextTokensPrompt, EmbedsPrompt]], - list[Union[EngineTokensPrompt, EngineEmbedsPrompt]], - ]: - ... 
- - async def _preprocess_completion( - self, - request: CompletionLikeRequest, - tokenizer: Optional[AnyTokenizer], - input_or_inputs: Optional[Union[str, list[str], list[int], - list[list[int]]]], - add_special_tokens: bool = True, - ) -> tuple[ - Union[list[TextTokensPrompt], list[Union[TextTokensPrompt, - EmbedsPrompt]]], - Union[ - list[EngineTokensPrompt], - list[Union[EngineTokensPrompt, EngineEmbedsPrompt]], - ], - ]: - if (not isinstance(request, CompletionRequest) - and input_or_inputs is None): - raise ValueError( - "Prompt embeds with non-completion requests is not" - " currently supported.") - - ( - request_prompts_text, - request_prompts_embeds, - ) = await self._tokenize_prompt_input_or_inputs_async( - request, - tokenizer, - input_or_inputs, - add_special_tokens=add_special_tokens, - ) - - engine_prompts_text = [ - EngineTokensPrompt( - prompt_token_ids=request_prompt_text["prompt_token_ids"]) - for request_prompt_text in request_prompts_text - ] - cache_salt = (request.cache_salt if - (hasattr(request, "cache_salt") - and request.cache_salt is not None) else None) - if cache_salt: - for prompt_text in engine_prompts_text: - prompt_text["cache_salt"] = cache_salt - - # This check is equivalent to simply checking if - # `request_prompts_embeds` is empty, but it's difficult to propagate - # overloads to the private helper functions to enable this check. - # This overload is needed because only TextPrompts are allowed for - # non-completion requests and if we don't add the overload here, - # everywhere this function is used outside of serving_completion will - # need logic asserting that only text prompts are in the request. - if (not isinstance(request, CompletionRequest) - and input_or_inputs is not None): - return request_prompts_text, engine_prompts_text - - engine_prompts_embeds = [ - EngineEmbedsPrompt( - prompt_embeds=request_prompt_embeds["prompt_embeds"]) - for request_prompt_embeds in request_prompts_embeds - ] - if cache_salt: - for prompt_embed in engine_prompts_embeds: - prompt_embed["cache_salt"] = cache_salt - - request_prompts = request_prompts_embeds + request_prompts_text - engine_prompts = engine_prompts_embeds + engine_prompts_text - return request_prompts, engine_prompts - async def _preprocess_chat( self, request: Union[ChatLikeRequest, ResponsesRequest], @@ -1073,41 +895,6 @@ class OpenAIServing: # OPTIMIZATION priority = orig_priority - 1 - @staticmethod - def _load_prompt_embeds( - prompt_embeds: Optional[Union[bytes, list[bytes]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, - ) -> list[EmbedsPrompt]: - - def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: - tensor = torch.load( - io.BytesIO(pybase64.b64decode(embed, validate=True)), - weights_only=True, - map_location=torch.device("cpu"), - ) - assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( - torch.float32, - torch.bfloat16, - torch.float16, - ) - tensor = tensor.to_dense() - if tensor.dim() > 2: - tensor = tensor.squeeze(0) - assert tensor.dim() == 2 - if truncate_prompt_tokens is not None: - tensor = tensor[-truncate_prompt_tokens:] - return {"prompt_embeds": tensor} - - if prompt_embeds: - if isinstance(prompt_embeds, list): - return [ - _load_and_validate_embed(embed) for embed in prompt_embeds - ] - else: - return [_load_and_validate_embed(prompt_embeds)] - else: - return [] - def _log_inputs( self, request_id: str, diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index d3f3a8cfa5aa9..441b89ec4803e 100644 --- 
a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -2,12 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import io from abc import ABC, abstractmethod from typing import Annotated, Optional, Union +import pybase64 +import torch from pydantic import Field from vllm.config import ModelConfig +from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -49,37 +53,121 @@ class BaseRenderer(ABC): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: Optional[bool] = True, cache_salt: Optional[str] = None, + needs_detokenization: Optional[bool] = False, ) -> list[EngineTokensPrompt]: """ - Convert input prompts into tokenized format for engine processing. - - This is the core method that transforms various input formats into - standardized TokensPrompt objects. Implementations should handle - tokenization, special token insertion, truncation, and validation - according to model requirements. - + Convert text or token inputs into engine-ready TokensPrompt objects. + + This method accepts text or token inputs and produces a + list of [`TokensPrompt`][vllm.inputs.data.TokensPrompt] objects + for the engine. + Args: - prompt_or_prompts: Input data in various formats: - - str: Single text prompt - - list[str]: Batch of text prompts - - list[int]: Pre-tokenized sequence - - list[list[int]]: Batch of pre-tokenized sequences - max_length: Maximum sequence length (endpoint-specific behavior) - truncate_prompt_tokens: Truncate to last N tokens - (None=no truncation, 0=empty) - add_special_tokens: Add model-specific tokens (e.g., [CLS], [SEP]) - to text inputs - cache_salt: Optional string to disambiguate cached prompts - + prompt_or_prompts: One of: + - ``str``: Single text prompt. + - ``list[str]``: Batch of text prompts. + - ``list[int]``: Single pre-tokenized sequence. + - ``list[list[int]]``: Batch of pre-tokenized sequences. + max_length: Maximum allowable total input token length. If provided, + token inputs longer than this raise ``ValueError``. + truncate_prompt_tokens: Number of tokens to keep. ``None`` means no + truncation. ``0`` yields an empty list (and skips embeds). + ``-1`` maps to ``model_config.max_model_len``. + add_special_tokens: Whether to add model-specific special tokens + during text tokenization. + cache_salt: Optional string to disambiguate prefix cache entries. + needs_detokenization: If True and ``prompt_or_prompts`` is token + input, detokenize IDs back to text for inclusion in outputs. + Returns: - list[EngineTokensPrompt]: Tokenized prompts ready for engine - consumption - + list[EngineTokensPrompt]: Engine-ready token prompts. + Raises: - ValueError: If input format is invalid or length limits exceeded + ValueError: If input formats are invalid or length limits exceeded. 
""" raise NotImplementedError + @abstractmethod + async def render_prompt_and_embeds( + self, + prompt_or_prompts: Optional[Union[str, list[str], list[int], + list[list[int]]]] = None, + prompt_embeds: Optional[Union[bytes, list[bytes]]] = None, + max_length: Optional[int] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, + add_special_tokens: Optional[bool] = True, + cache_salt: Optional[str] = None, + needs_detokenization: Optional[bool] = False, + ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: + """ + Convert text/token and/or base64-encoded embeddings inputs into + engine-ready prompt objects. + + At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be + provided and non-empty. If both are omitted or empty (e.g., empty + string and empty list), a ``ValueError`` is raised. + + Args: + prompt_or_prompts: Text or token inputs to include. + prompt_embeds: Base64-encoded bytes (or list thereof) containing a + torch-saved tensor to be used as prompt embeddings. + max_length: Maximum allowable total input token length. If provided, + inputs longer than this raise ``ValueError``. + truncate_prompt_tokens: Number of tokens/rows to keep from the end + of the sequence. ``-1`` maps to ``model_config.max_model_len``. + add_special_tokens: Whether to add model-specific special tokens + during text tokenization. + cache_salt: Optional string to disambiguate prefix cache entries. + needs_detokenization: If True and ``prompt_or_prompts`` is token + input, detokenize IDs back to text for inclusion in outputs. + + Returns: + list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: + Engine-ready prompt objects. + + Raises: + ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds`` + are omitted or empty (decoder prompt cannot be empty), or if + length limits are exceeded. + """ + raise NotImplementedError + + @classmethod + def load_prompt_embeds( + cls, + prompt_embeds: Union[bytes, list[bytes]], + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None, + cache_salt: Optional[str] = None, + ) -> list[EngineEmbedsPrompt]: + """Load and validate base64-encoded embeddings into prompt objects.""" + + def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt: + tensor = torch.load( + io.BytesIO(pybase64.b64decode(embed, validate=True)), + weights_only=True, + map_location=torch.device("cpu"), + ) + assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( + torch.float32, + torch.bfloat16, + torch.float16, + ) + tensor = tensor.to_dense() + if tensor.dim() > 2: + tensor = tensor.squeeze(0) + assert tensor.dim() == 2 + if truncate_prompt_tokens is not None: + tensor = tensor[-truncate_prompt_tokens:] + embeds_prompt = EngineEmbedsPrompt(prompt_embeds=tensor) + if cache_salt is not None: + embeds_prompt["cache_salt"] = cache_salt + return embeds_prompt + + if isinstance(prompt_embeds, list): + return [_load_and_validate_embed(embed) for embed in prompt_embeds] + else: + return [_load_and_validate_embed(prompt_embeds)] + class CompletionRenderer(BaseRenderer): @@ -101,50 +189,110 @@ class CompletionRenderer(BaseRenderer): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: Optional[bool] = True, cache_salt: Optional[str] = None, + needs_detokenization: Optional[bool] = False, ) -> list[EngineTokensPrompt]: """Implementation of prompt rendering for completion-style requests. Uses async tokenizer pooling for improved performance. See base class for detailed parameter documentation. 
""" - if truncate_prompt_tokens is not None: - if truncate_prompt_tokens == 0: - return [] - if truncate_prompt_tokens < 0: - truncate_prompt_tokens = self.model_config.max_model_len - if max_length is not None and truncate_prompt_tokens > max_length: - raise ValueError( - f"truncate_prompt_tokens ({truncate_prompt_tokens}) " - f"cannot be greater than max_length ({max_length}). " - f"Please select a smaller truncation size.") + truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens( + truncate_prompt_tokens, max_length) + if truncate_prompt_tokens == 0: + return [] # Parse and batch the input prompts batch_inputs = parse_and_batch_prompt(prompt_or_prompts) - rendered_prompts: list[EngineTokensPrompt] = [] - tokenize_tasks = [] + tasks = [] for prompt_input in batch_inputs: if prompt_input["is_tokens"] is True: # Token input - token_ids = self._maybe_apply_truncation( - prompt_input["content"], truncate_prompt_tokens) - rendered_prompts.append( - self._create_tokens_prompt(token_ids, max_length, - cache_salt)) + detokenize_task = asyncio.create_task( + # Note: detokenization is needed when echo is enabled, + # where the input token IDs are decoded back to text. + self._maybe_detokenize(prompt_input["content"], max_length, + truncate_prompt_tokens, cache_salt, + needs_detokenization)) + tasks.append(detokenize_task) else: # Text input tokenize_task = asyncio.create_task( self._tokenize(prompt_input["content"], max_length, truncate_prompt_tokens, add_special_tokens, cache_salt)) - tokenize_tasks.append(tokenize_task) + tasks.append(tokenize_task) # Wait for all text tokenization to finish - if tokenize_tasks: - tokenized_text_prompts = await asyncio.gather(*tokenize_tasks) - rendered_prompts.extend(tokenized_text_prompts) + if tasks: + tokenized_text_prompts = await asyncio.gather(*tasks) + return tokenized_text_prompts - return rendered_prompts + return [] + + async def render_prompt_and_embeds( + self, + prompt_or_prompts: Optional[Union[str, list[str], list[int], + list[list[int]]]] = None, + prompt_embeds: Optional[Union[bytes, list[bytes]]] = None, + max_length: Optional[int] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, + add_special_tokens: Optional[bool] = True, + cache_salt: Optional[str] = None, + needs_detokenization: Optional[bool] = False, + ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: + """ + Render text/token prompts and/or precomputed embedding prompts. At + least one of `prompt_or_prompts` or `prompt_embeds` must be provided. 
+ """ + truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens( + truncate_prompt_tokens, max_length) + if truncate_prompt_tokens == 0: + return [] + + rendered: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]] = [] + + if prompt_embeds is not None: + rendered.extend( + self.load_prompt_embeds(prompt_embeds, truncate_prompt_tokens, + cache_salt)) + if prompt_or_prompts is None or prompt_or_prompts == "": + return rendered + + token_prompts = await self.render_prompt( + prompt_or_prompts=prompt_or_prompts, + max_length=max_length, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + cache_salt=cache_salt, + needs_detokenization=needs_detokenization, + ) + rendered.extend(token_prompts) + + return rendered + + def _validate_and_normalize_truncate_tokens( + self, + truncate_prompt_tokens: Optional[int], + max_length: Optional[int], + ) -> Optional[int]: + """Validate and normalize truncate_prompt_tokens parameter.""" + if truncate_prompt_tokens is None: + return None + + if truncate_prompt_tokens == 0: + return 0 + + if truncate_prompt_tokens < 0: + truncate_prompt_tokens = self.model_config.max_model_len + + if max_length is not None and truncate_prompt_tokens > max_length: + raise ValueError( + f"truncate_prompt_tokens ({truncate_prompt_tokens}) " + f"cannot be greater than max_length ({max_length}). " + f"Please select a smaller truncation size.") + + return truncate_prompt_tokens def _maybe_apply_truncation( self, token_ids: list[int], @@ -186,7 +334,29 @@ class CompletionRenderer(BaseRenderer): max_length=truncate_prompt_tokens) return self._create_tokens_prompt(encoded.input_ids, max_length, - cache_salt) + cache_salt, text) + + async def _maybe_detokenize( + self, + token_ids: list[int], + max_length: Optional[int], + truncate_prompt_tokens: Optional[int], + cache_salt: Optional[str], + needs_detokenization: Optional[bool] = False, + ) -> EngineTokensPrompt: + """Optionally detokenize token IDs and build a tokens prompt.""" + token_ids = self._maybe_apply_truncation(token_ids, + truncate_prompt_tokens) + + prompt = None + if needs_detokenization is True: + async_tokenizer = self._get_async_tokenizer() + prompt = await async_tokenizer.decode(token_ids) + + return self._create_tokens_prompt(token_ids=token_ids, + max_length=max_length, + cache_salt=cache_salt, + prompt=prompt) def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer: """Get or create async tokenizer using shared pool.""" @@ -210,6 +380,7 @@ class CompletionRenderer(BaseRenderer): token_ids: list[int], max_length: Optional[int] = None, cache_salt: Optional[str] = None, + prompt: Optional[str] = None, ) -> EngineTokensPrompt: """Create validated EngineTokensPrompt.""" if max_length is not None and len(token_ids) > max_length: @@ -221,4 +392,6 @@ class CompletionRenderer(BaseRenderer): tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids) if cache_salt is not None: tokens_prompt["cache_salt"] = cache_salt - return tokens_prompt + if prompt is not None: + tokens_prompt["prompt"] = prompt + return tokens_prompt \ No newline at end of file diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 065d0ab59291a..6a005aa634e85 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -52,6 +52,9 @@ class TokensPrompt(TypedDict): prompt_token_ids: list[int] """A list of token IDs to pass to the model.""" + prompt: NotRequired[str] + """The prompt text corresponding to the token IDs, if available.""" + token_type_ids: NotRequired[list[int]] """A list of 
token type IDs to pass to the cross encoder model.""" From 920ed46b09fbce1967e068be2fa6345768118752 Mon Sep 17 00:00:00 2001 From: Kazuhiro Serizawa <nserihiro@gmail.com> Date: Wed, 10 Sep 2025 02:59:46 +0900 Subject: [PATCH 079/584] [Misc] bump outlines_core to fix the version conflicts with outlines >= 1.2.0 (#24368) Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com> Signed-off-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Simon Mo <simon.mo@hey.com> --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 8f5bc9176d90a..b8665104bd09a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -20,7 +20,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.11.3 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" -outlines_core == 0.2.10 +outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 From 3707cb2505e93cca217831f5483e20b988b5a5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Tue, 9 Sep 2025 20:03:32 +0200 Subject: [PATCH 080/584] [Docs] Gemma3n `transcriptions` endpoint support (#24512) Signed-off-by: NickLucche <nlucches@redhat.com> --- docs/models/supported_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d23fdff568fcd..bf4e243203489 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -766,6 +766,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ | +| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | ### Pooling Models From c3f9773b2c5002a3ba4421f5d7fedf036905d369 Mon Sep 17 00:00:00 2001 From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:34:25 +0530 Subject: [PATCH 081/584] [TPU] Fix tpu structured decoding in mixed batches (#24458) Signed-off-by: Chenyaaang <chenyangli@google.com> --- vllm/v1/worker/tpu_model_runner.py | 34 ++++++++++++------------------ 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5947b54d33ce0..15af7ffac8095 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1769,28 +1769,22 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.grammar_bitmask_cpu.zero_() self.require_structured_out_cpu.zero_() - # We receive the structured output bitmask from the scheduler, but the - # indices of the requests in the batch may not match the indices of - # the bitmask since the scheduler doesn't know how the tpu runner is - # ordering the requests in the batch. 
We need to match the order of - # bitmask with the order of requests - struct_out_indices: list[int] = [] - mask_indices: list[int] = [] - for req_id in self.input_batch.req_ids: - mask_index = scheduler_output.structured_output_request_ids.get( - req_id) - if mask_index is None: + sorted_struct_requests = sorted( + scheduler_output.structured_output_request_ids.items(), + key=lambda item: item[1]) + cumulative_mask_idx = 0 + for req_id, _ in sorted_struct_requests: + if req_id not in self.input_batch.req_id_to_index: continue batch_index = self.input_batch.req_id_to_index[req_id] - struct_out_indices.append(batch_index) - mask_indices.append(mask_index) - self.grammar_bitmask_cpu[struct_out_indices] = torch.from_numpy( - grammar_bitmask[mask_indices]) - # It's not guaranteed that all requests in this batch require - # structured output, so create a bool tensor to represent - # the requests that need structured output. - struct_out_indices = torch.tensor(struct_out_indices, dtype=torch.long) - self.require_structured_out_cpu[struct_out_indices] = True + self.grammar_bitmask_cpu[batch_index] = torch.from_numpy( + grammar_bitmask[cumulative_mask_idx]) + # It's not guaranteed that all requests in this batch require + # structured output, so create a bool tensor to represent + # the requests that need structured output. + self.require_structured_out_cpu[batch_index] = True + cumulative_mask_idx += 1 + return self.require_structured_out_cpu[:num_reqs].to(logits.device), \ self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) From b8a93076d36eff5cff8a89f99a7370d0cc6f0e98 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Wed, 10 Sep 2025 02:05:25 +0800 Subject: [PATCH 082/584] [CI] execute all piecewise compilation tests together (#24502) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- .buildkite/test-pipeline.yaml | 6 +- .../compile/piecewise/test_multiple_graphs.py | 28 +-------- tests/compile/piecewise/test_simple.py | 43 +++---------- tests/compile/piecewise/test_toy_llama.py | 27 +------- tests/compile/silly_attention.py | 63 +++++++++++++++++++ tests/compile/test_decorator.py | 31 ++------- 6 files changed, 81 insertions(+), 117 deletions(-) create mode 100644 tests/compile/silly_attention.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b0d4c4456d339..8f2f6083b0305 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -379,11 +379,7 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py - # these tests need to be separated, cannot combine - - pytest -v -s compile/piecewise/test_simple.py - - pytest -v -s compile/piecewise/test_toy_llama.py - - pytest -v -s compile/piecewise/test_full_cudagraph.py - - pytest -v -s compile/piecewise/test_multiple_graphs.py + - pytest -v -s compile/piecewise/ - label: PyTorch Fullgraph Test # 20min timeout_in_minutes: 30 diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index aee2acbd490ee..5cfebfce9ea2a 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -4,9 +4,9 @@ Test (piecewise) compilation with a simple model where multiple submodules are compiled and graph captured separately. 
""" + import torch from torch import nn -from torch.library import Library from vllm.compilation.backends import set_model_tag from vllm.compilation.counter import compilation_counter @@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile, from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.forward_context import BatchDescriptor, set_forward_context -from vllm.utils import direct_register_custom_op -# create a library to hold the custom op -silly_lib = Library("silly", "FRAGMENT") # noqa +# This import automatically registers `torch.ops.silly.attention` +from .. import silly_attention # noqa: F401 BATCH_SIZE = 32 MLP_SIZE = 128 @@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024 RANDOM_SEED = 0 -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - out.copy_(q) - out += k - out += v - - -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - return - - -direct_register_custom_op( - op_name="attention", - op_func=silly_attention, - mutates_args=["out"], - fake_impl=silly_attention_fake, - target_lib=silly_lib, -) - - @support_torch_compile class ParentModel(nn.Module): diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 2d1a72d44ec70..84f4945c82725 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -4,10 +4,10 @@ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. """ + import pytest import torch from torch import nn -from torch.library import Library from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile @@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context -from vllm.utils import direct_register_custom_op -global_counter = 0 - -# create a library to hold the custom op -silly_lib = Library("silly", "FRAGMENT") # noqa - - -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - global global_counter - global_counter += 1 - print(f"{global_counter=}") - out.copy_(q) - out[0] += 1 - - -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - return - - -direct_register_custom_op( - op_name="attention", - op_func=silly_attention, - mutates_args=["out"], - fake_impl=silly_attention_fake, - target_lib=silly_lib, -) +# This import automatically registers `torch.ops.silly.attention` +from ..silly_attention import get_global_counter, reset_global_counter @support_torch_compile @@ -59,8 +33,7 @@ class SillyModel(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: """ Overall effect: - x += 1 - x[0] += 2 + x = 3 * x + 19 global_counter += 2 """ x = x + 1 @@ -78,6 +51,7 @@ class SillyModel(nn.Module): @pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() def test_simple_piecewise_compile(use_inductor): assert VLLM_USE_V1 @@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor): model(torch.randn(1).cuda()) input = torch.zeros(2).cuda() - global global_counter - global_counter = 0 + reset_global_counter() with set_forward_context( None, vllm_config=vllm_config, 
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, batch_descriptor=BatchDescriptor(num_tokens=2, )): output = model(input) - assert global_counter == 2 - assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + assert get_global_counter() == 2 + assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index bcfd0d834c5db..cba7517647e51 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -14,38 +14,15 @@ from typing import Any, Optional import pytest import torch from torch import nn -from torch.library import Library from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.forward_context import BatchDescriptor, set_forward_context -from vllm.utils import direct_register_custom_op -# create a library to hold the custom op -silly_lib = Library("silly", "FRAGMENT") # noqa - - -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - out.copy_(q) - out += k - out += v - - -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - return - - -direct_register_custom_op( - op_name="attention", - op_func=silly_attention, - mutates_args=["out"], - fake_impl=silly_attention_fake, - target_lib=silly_lib, -) +# This import automatically registers `torch.ops.silly.attention` +from .. import silly_attention # noqa: F401 @dataclass diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py new file mode 100644 index 0000000000000..13eb0bf4b1fa1 --- /dev/null +++ b/tests/compile/silly_attention.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared PyTorch custom silly attention for compilation tests. +Centralizes custom operation definitions to avoid duplicate registrations. +""" + +import torch +from torch.library import Library + +from vllm.utils import direct_register_custom_op + +# Shared library for all compilation test operations +# Using "silly" namespace to match existing test expectations +# import this file will automatically register +# torch ops for testing (like silly.attention) +silly_lib = Library("silly", "FRAGMENT") + +# Global counter that counts the number of times attention is invoked +_global_counter = 0 + + +def get_global_counter(): + """Get the current global counter value""" + return _global_counter + + +def reset_global_counter(): + """Reset the global counter to 0""" + global _global_counter + _global_counter = 0 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + """ + Unified attention implementation that depends on + all inputs and affects the output. + Always increments a global counter that tests can use or ignore. 
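+
+    A typical test pattern built on this op (sketch)::
+
+        reset_global_counter()
+        model(x)  # each silly.attention call bumps the counter
+        assert get_global_counter() == expected_num_attention_calls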
+ """ + global _global_counter + + # Always increment the global counter + _global_counter += 1 + + # Unified implementation that depends on all inputs + out.copy_(q + k + v) + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + """Fake implementation for testing""" + return + + +# Register the unified attention operation +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 51f8ddd566d56..d73586d53ff3e 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from torch import nn -from torch.library import Library from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import (ignore_torch_compile, @@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile, from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.forward_context import BatchDescriptor, set_forward_context -from vllm.utils import direct_register_custom_op -# create a library to hold the custom op -silly_lib = Library("silly", "FRAGMENT") # noqa +# This import automatically registers `torch.ops.silly.attention` +from . import silly_attention # noqa: F401 BATCH_SIZE = 32 MLP_SIZE = 128 -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - out.copy_(q) - out += k - out += v - - -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: - return - - -direct_register_custom_op( - op_name="attention", - op_func=silly_attention, - mutates_args=["out"], - fake_impl=silly_attention_fake, - target_lib=silly_lib, -) - - @torch.inference_mode def run_model(vllm_config: VllmConfig, model: nn.Module, cudagraph_runtime_mode: CUDAGraphMode): @@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator(): run_model(vllm_config, mod_C, cudagraph_runtime_mode) -# Only enable torch.compile if +# Only enable torch.compile if # vllm_config.cache_config.kv_sharing_fast_prefill=True @support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. kv_sharing_fast_prefill) @@ -173,7 +150,7 @@ class B(nn.Module): return x -# Only enable torch.compile if +# Only enable torch.compile if # vllm_config.cache_config.kv_sharing_fast_prefill=False @support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
cache_config.kv_sharing_fast_prefill) From 15de5ff9ea30950212b724818c716084528a2f7b Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:59:34 -0400 Subject: [PATCH 083/584] [Feature] Disallow FlashMLA on Blackwell (#24521) Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/backends/flashmla.py | 11 +++++++++++ vllm/v1/attention/backends/mla/flashmla.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index f23c096952ce0..411eb5413f53c 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -17,6 +17,7 @@ from vllm.attention.backends.mla.common import (MLACommonBackend, from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, get_mla_metadata, is_flashmla_supported) +from vllm.platforms.cuda import CudaPlatform class FlashMLABackend(MLACommonBackend): @@ -181,6 +182,16 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" + # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs + # context: + # https://github.com/deepseek-ai/FlashMLA/issues/83 + # https://github.com/vllm-project/vllm/issues/24513 + if CudaPlatform.has_device_capability(100): + raise NotImplementedError( + "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " + "Please use CUTLASS_MLA or TRITON_MLA instead. " + "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 2f13f19218d9c..549af1a062252 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -12,6 +12,7 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, is_flashmla_supported) from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.platforms.cuda import CudaPlatform from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonDecodeMetadata, MLACommonImpl, @@ -158,6 +159,16 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" + # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs + # context: + # https://github.com/deepseek-ai/FlashMLA/issues/83 + # https://github.com/vllm-project/vllm/issues/24513 + if CudaPlatform.has_device_capability(100): + raise NotImplementedError( + "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " + "Please use CUTLASS_MLA or TRITON_MLA instead. 
" + "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( From b5fb3005a8624af171f12aaa6fb97557d91cbcd3 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Wed, 10 Sep 2025 04:46:35 +0800 Subject: [PATCH 084/584] [Log] Use a relative path in debug-level logs to distinguish files with identical names (#23846) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- vllm/logger.py | 2 +- vllm/logging_utils/formatter.py | 65 ++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 8f06eb03c7f93..a453aa308aaf2 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -22,7 +22,7 @@ VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX _FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " - "[%(filename)s:%(lineno)d] %(message)s") + "[%(fileinfo)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" DEFAULT_LOGGING_CONFIG = { diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 0affef10078dc..004b79f3ea6e2 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -2,16 +2,77 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging +from pathlib import Path + +from vllm import envs class NewLineFormatter(logging.Formatter): """Adds logging prefix to newlines to align multi-line messages.""" def __init__(self, fmt, datefmt=None, style="%"): - logging.Formatter.__init__(self, fmt, datefmt, style) + super().__init__(fmt, datefmt, style) + + self.use_relpath = envs.VLLM_LOGGING_LEVEL == "DEBUG" + if self.use_relpath: + self.root_dir = Path(__file__).resolve().parent.parent.parent def format(self, record): - msg = logging.Formatter.format(self, record) + + def shrink_path(relpath: Path) -> str: + """ + Shortens a file path for logging display: + - Removes leading 'vllm' folder if present. + - If path starts with 'v1', + keeps the first two and last two levels, + collapsing the middle as '...'. + - Otherwise, keeps the first and last two levels, + collapsing the middle as '...'. + - If the path is short, returns it as-is. + - Examples: + vllm/model_executor/layers/quantization/utils/fp8_utils.py -> + model_executor/.../quantization/utils/fp8_utils.py + vllm/model_executor/layers/quantization/awq.py -> + model_executor/layers/quantization/awq.py + vllm/v1/attention/backends/mla/common.py -> + v1/attention/backends/mla/common.py + + Args: + relpath (Path): The relative path to be shortened. + Returns: + str: The shortened path string for display. 
+ """ + parts = list(relpath.parts) + new_parts = [] + if parts and parts[0] == "vllm": + parts = parts[1:] + if parts and parts[0] == "v1": + new_parts += parts[:2] + parts = parts[2:] + elif parts: + new_parts += parts[:1] + parts = parts[1:] + if len(parts) > 2: + new_parts += ["..."] + parts[-2:] + else: + new_parts += parts + return "/".join(new_parts) + + if self.use_relpath: + abs_path = getattr(record, "pathname", None) + if abs_path: + try: + relpath = Path(abs_path).resolve().relative_to( + self.root_dir) + except Exception: + relpath = Path(record.filename) + else: + relpath = Path(record.filename) + record.fileinfo = shrink_path(relpath) + else: + record.fileinfo = record.filename + + msg = super().format(record) if record.message != "": parts = msg.split(record.message) msg = msg.replace("\n", "\r\n" + parts[0]) From 0dc9cbb527c9445e85ccd1bd4374adfe71405b9d Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:15:41 -0400 Subject: [PATCH 085/584] [Benchmark] Update bench doc with mtbench, blazedit, spec bench (#24450) Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- benchmarks/README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/benchmarks/README.md b/benchmarks/README.md index 957c2f988051d..ee172642033de 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -95,6 +95,24 @@ become available. <td style="text-align: center;">✅</td> <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td> </tr> + <tr> + <td><strong>HuggingFace-MTBench</strong></td> + <td style="text-align: center;">✅</td> + <td style="text-align: center;">✅</td> + <td><code>philschmid/mt-bench</code></td> + </tr> + <tr> + <td><strong>HuggingFace-Blazedit</strong></td> + <td style="text-align: center;">✅</td> + <td style="text-align: center;">✅</td> + <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td> + </tr> + <tr> + <td><strong>Spec Bench</strong></td> + <td style="text-align: center;">✅</td> + <td style="text-align: center;">✅</td> + <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td> + </tr> <tr> <td><strong>Custom</strong></td> <td style="text-align: center;">✅</td> @@ -239,6 +257,43 @@ vllm bench serve \ --num-prompts 2048 ``` +### Spec Bench Benchmark with Speculative Decoding + +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +[SpecBench dataset](https://github.com/hemingkx/Spec-Bench) + +Run all categories: + +``` bash +# Download the dataset using: +# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl + +vllm bench serve \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name spec_bench \ + --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \ + --num-prompts -1 +``` + +Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`. 
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+  --model meta-llama/Meta-Llama-3-8B-Instruct \
+  --dataset-name spec_bench \
+  --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+  --num-prompts -1 \
+  --spec-bench-category "summarization"
+```
+
 ### Other HuggingFaceDataset Examples
 
 ```bash
@@ -295,6 +350,18 @@ vllm bench serve \
   --num-prompts 80
 ```
 
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+  --model Qwen/QwQ-32B \
+  --dataset-name hf \
+  --dataset-path vdaita/edit_5k_char \
+  --num-prompts 90 \
+  --blazedit-min-distance 0.01 \
+  --blazedit-max-distance 0.99
+```
+
 ### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling

From fb1a8f932a6dabf19f890dd905ebe217539fd147 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Tue, 9 Sep 2025 18:00:17 -0400
Subject: [PATCH 086/584] [Benchmark] Add option to skip oversampling in benchmark (#24457)

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 vllm/benchmarks/datasets.py | 71 ++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 13 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 784536054a19f..32820b026b6f6 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -198,8 +198,9 @@ class BenchmarkDataset(ABC):
     @abstractmethod
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               request_id_prefix: str = "") -> list[SampleRequest]:
+               num_requests: int,
+               request_id_prefix: str = "",
+               no_oversample: bool = False) -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
@@ -224,6 +225,7 @@ class BenchmarkDataset(ABC):
         requests: list[SampleRequest],
         num_requests: int,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
     ) -> None:
         """
         Oversamples the list of requests if its size is less than the desired
@@ -236,6 +238,11 @@ class BenchmarkDataset(ABC):
             request_id_prefix (str) The prefix of the request ids.
         """
+        if no_oversample:
+            logger.info("Skipping oversampling. 
" \ + "Total samples: %d.", len(requests)) + return + if len(requests) < num_requests: random.seed(self.random_seed) additional = deepcopy( @@ -405,6 +412,7 @@ class RandomDataset(BenchmarkDataset): tokenizer: PreTrainedTokenizerBase, num_requests: int, request_id_prefix: str = "", + no_oversample: bool = False, prefix_len: int = DEFAULT_PREFIX_LEN, range_ratio: float = DEFAULT_RANGE_RATIO, input_len: int = DEFAULT_INPUT_LEN, @@ -832,6 +840,7 @@ class RandomMultiModalDataset(RandomDataset): tokenizer: PreTrainedTokenizerBase, num_requests: int, request_id_prefix: str = "", + no_oversample: bool = False, prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, input_len: int = RandomDataset.DEFAULT_INPUT_LEN, @@ -959,6 +968,7 @@ class ShareGPTDataset(BenchmarkDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: samples: list = [] @@ -1002,7 +1012,10 @@ class ShareGPTDataset(BenchmarkDataset): request_id=request_id_prefix + str(ind), )) ind += 1 - self.maybe_oversample_requests(samples, num_requests, request_id_prefix) + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, + no_oversample) return samples @@ -1036,6 +1049,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser): help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.", ) + parser.add_argument( + "--no-oversample", + action="store_true", + help="Do not oversample if the dataset has " \ + "fewer samples than num-prompts.", + ) # group for dataset specific arguments custom_group = parser.add_argument_group("custom dataset options") @@ -1322,6 +1341,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: output_len=args.custom_output_len, skip_chat_template=args.custom_skip_chat_template, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ) elif args.dataset_name == "sonnet": @@ -1336,6 +1356,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: tokenizer=tokenizer, return_prompt_formatted=False, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ) else: assert tokenizer.chat_template or tokenizer.default_chat_template, ( @@ -1348,6 +1369,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: tokenizer=tokenizer, return_prompt_formatted=True, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ) elif args.dataset_name == "hf": @@ -1443,6 +1465,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: tokenizer=tokenizer, output_len=args.hf_output_len, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, **hf_kwargs ) @@ -1456,6 +1479,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: tokenizer=tokenizer, output_len=args.spec_bench_output_len, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ), "sharegpt": lambda: ShareGPTDataset( random_seed=args.seed, dataset_path=args.dataset_path @@ -1464,6 +1488,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: num_requests=args.num_prompts, output_len=args.sharegpt_output_len, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ), "burstgpt": lambda: BurstGPTDataset( random_seed=args.seed, dataset_path=args.dataset_path @@ -1471,6 +1496,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: tokenizer=tokenizer, num_requests=args.num_prompts, 
request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ), "random": lambda: RandomDataset( random_seed=args.seed, dataset_path=args.dataset_path @@ -1483,6 +1509,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: range_ratio=args.random_range_ratio, request_id_prefix=args.request_id_prefix, batchsize=args.random_batch_size, + no_oversample=args.no_oversample, ), "random-mm": lambda: RandomMultiModalDataset( @@ -1499,6 +1526,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio, bucket_config=args.random_mm_bucket_config, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ), "prefix_repetition": lambda: PrefixRepetitionRandomDataset( @@ -1511,6 +1539,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: num_prefixes=args.prefix_repetition_num_prefixes, output_len=args.prefix_repetition_output_len, request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, ), } @@ -1592,6 +1621,7 @@ class CustomDataset(BenchmarkDataset): enable_multimodal_chat: bool = False, skip_chat_template: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: # load all data if needed @@ -1628,7 +1658,7 @@ class CustomDataset(BenchmarkDataset): request_id=request_id_prefix + str(i), )) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -1719,6 +1749,7 @@ class SonnetDataset(BenchmarkDataset): output_len: int = DEFAULT_OUTPUT_LEN, return_prompt_formatted: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: # Calculate average token length for a poem line. 
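For reference, the padding that ``no_oversample`` bypasses can be sketched as follows (a simplified illustration rather than the exact vLLM code; the use of ``random.choices`` is an assumption):

```python
import random
from copy import deepcopy

def maybe_oversample(requests, num_requests, seed, no_oversample=False):
    # With no_oversample set, a short dataset is returned as-is instead
    # of being padded up to num_requests by resampling existing requests.
    if no_oversample or len(requests) >= num_requests:
        return
    random.seed(seed)
    extra = random.choices(requests, k=num_requests - len(requests))
    requests.extend(deepcopy(extra))
```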
@@ -1814,6 +1845,7 @@ class BurstGPTDataset(BenchmarkDataset): max_loras: Optional[int] = None, lora_path: Optional[str] = None, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list[SampleRequest]: samples = [] @@ -1893,6 +1925,7 @@ class ConversationDataset(HuggingFaceDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs) -> list: # Filter examples with at least 2 conversations filtered_data = self.data.filter( @@ -1934,7 +1967,7 @@ class ConversationDataset(HuggingFaceDataset): )) ind += 1 self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -1964,6 +1997,7 @@ class VisionArenaDataset(HuggingFaceDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: output_len = (output_len @@ -1993,7 +2027,7 @@ class VisionArenaDataset(HuggingFaceDataset): request_id=request_id_prefix + str(i), )) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2023,6 +2057,7 @@ class InstructCoderDataset(HuggingFaceDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs) -> list: output_len = (output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN) @@ -2054,7 +2089,7 @@ class InstructCoderDataset(HuggingFaceDataset): request_id=request_id_prefix + str(i), )) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2085,6 +2120,7 @@ class MTBenchDataset(HuggingFaceDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: output_len = (output_len @@ -2115,7 +2151,7 @@ class MTBenchDataset(HuggingFaceDataset): request_id=request_id_prefix + str(i), )) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2149,6 +2185,7 @@ class BlazeditDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, request_id_prefix: str = "", + no_oversample: bool = False, min_distance: float = 0.0, max_distance: float = 1.0, **kwargs, @@ -2202,7 +2239,7 @@ Please generate the new code file in the "New file" section below.""" # noqa: E5 request_id=request_id_prefix + str(i), )) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2226,6 +2263,7 @@ class AIMODataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs) -> list: sampled_requests = [] ind = 0 @@ -2258,7 +2296,7 @@ class AIMODataset(HuggingFaceDataset): )) ind += 1 self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2330,6 +2368,7 @@ class NextEditPredictionDataset(HuggingFaceDataset): def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs): formatting_prompt_func = 
self.MAPPING_PROMPT_FUNCS.get(self.hf_name) if formatting_prompt_func is None: @@ -2347,7 +2386,10 @@ class NextEditPredictionDataset(HuggingFaceDataset): )) if len(samples) >= num_requests: break - self.maybe_oversample_requests(samples, num_requests, request_id_prefix) + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, + no_oversample) return samples @@ -2398,6 +2440,7 @@ class ASRDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list: output_len = (output_len @@ -2436,7 +2479,7 @@ class ASRDataset(HuggingFaceDataset): skipped, ) self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2474,6 +2517,7 @@ class MLPerfDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list[SampleRequest]: # Force dynamic output length based on reference completion. @@ -2520,7 +2564,7 @@ class MLPerfDataset(HuggingFaceDataset): ind += 1 self.maybe_oversample_requests(sampled_requests, num_requests, - request_id_prefix) + request_id_prefix, no_oversample) return sampled_requests @@ -2554,6 +2598,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset): num_prefixes: int = DEFAULT_NUM_PREFIXES, output_len: int = DEFAULT_OUTPUT_LEN, request_id_prefix: str = "", + no_oversample: bool = False, **kwargs, ) -> list[SampleRequest]: vocab_size = tokenizer.vocab_size From 73e688cb79786d4403c227b09a39d9a79a201299 Mon Sep 17 00:00:00 2001 From: Charlie Fu <charlifu@amd.com> Date: Tue, 9 Sep 2025 18:27:35 -0500 Subject: [PATCH 087/584] [ROCm][Feature] Enable Pipeline Parallelism with Ray Compiled Graph on ROCm (#24275) Signed-off-by: charlifu <charlifu@amd.com> --- docker/Dockerfile.rocm | 1 + requirements/rocm.txt | 2 +- vllm/utils/__init__.py | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index ff8ad1607e34a..063fc49693288 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -104,6 +104,7 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time diff --git a/requirements/rocm.txt b/requirements/rocm.txt index c3bb65b70a0b8..8e39951210714 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' boto3 botocore datasets -ray>=2.10.0,<2.45.0 +ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. peft pytest-asyncio tensorizer==2.10.1 diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 49c706bc37a84..e166e64e11016 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -78,6 +78,7 @@ if TYPE_CHECKING: from argparse import Namespace from vllm.config import ModelConfig, VllmConfig + from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -1472,7 +1473,8 @@ def current_stream() -> torch.cuda.Stream: # is hurting performance. 
Therefore creating a dedicated stream # per process if current_platform.is_rocm(): - _current_stream_tls.value = torch.cuda.Stream() + # torch.cuda.set_stream here is the alias of _pathed_set_stream + torch.cuda.set_stream(torch.cuda.Stream()) elif current_platform.is_cpu(): _current_stream_tls.value = _StreamPlaceholder() else: @@ -2278,7 +2280,8 @@ def weak_ref_tensor(tensor: Any) -> Any: def weak_ref_tensors( - tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]] + tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor], + IntermediateTensors] ) -> Union[torch.Tensor, list[Any], tuple[Any], Any]: """ Convenience function to create weak references to tensors, @@ -2290,6 +2293,15 @@ def weak_ref_tensors( return [weak_ref_tensor(t) for t in tensors] if isinstance(tensors, tuple): return tuple(weak_ref_tensor(t) for t in tensors) + + # For IntermediateTensors used in pipeline parallelism + from vllm.sequence import IntermediateTensors + if isinstance(tensors, IntermediateTensors): + ret = IntermediateTensors({ + key: weak_ref_tensor(val) + for key, val in tensors.tensors.items() + }) + return ret raise ValueError("Invalid type for tensors") From 561f38dc3cfc54cca7710b033a86a315deb24111 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith <tyler@neuralmagic.com> Date: Tue, 9 Sep 2025 20:32:36 -0400 Subject: [PATCH 088/584] [Bugfix] Improve EPLB config validation error message (#24524) Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> --- vllm/config/parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 3a74b5fb7e64f..2f8ad5c6b6b04 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -368,8 +368,10 @@ class ParallelConfig: else: if self.eplb_config.num_redundant_experts != 0: raise ValueError( - "num_redundant_experts should be used with EPLB." - f"{self.eplb_config.num_redundant_experts}.") + "num_redundant_experts is set to " + f"{self.eplb_config.num_redundant_experts} but EPLB is not " + "enabled. Either enable EPLB or unset " + "num_redundant_experts.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. From b23fb78623f0019854af7a820d398cba9a316677 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Tue, 9 Sep 2025 20:53:53 -0400 Subject: [PATCH 089/584] [Bugfix] Fix for 24530. Fix naive all2all shared expert overlap. 
(#24538) --- vllm/model_executor/layers/fused_moe/layer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2f88a63665c52..551f284a36095 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1755,9 +1755,6 @@ class FusedMoE(CustomOp): self.dp_size > 1 and not self.moe_parallel_config.use_deepep_ht_kernels and not self.moe_config.use_flashinfer_cutlass_kernels) - if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch( - hidden_states, router_logits) # If there are shared experts but we are not using a modular kernel, the # shared experts must be called here @@ -1768,6 +1765,10 @@ class FusedMoE(CustomOp): else: shared_output = None + if do_naive_dispatch_combine: + hidden_states, router_logits = get_ep_group().dispatch( + hidden_states, router_logits) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1800,8 +1801,9 @@ class FusedMoE(CustomOp): final_hidden_states, ) - def reduce_output(states: torch.Tensor) -> torch.Tensor: - if do_naive_dispatch_combine: + def reduce_output(states: torch.Tensor, + do_combine: bool = True) -> torch.Tensor: + if do_naive_dispatch_combine and do_combine: states = get_ep_group().combine(states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): @@ -1810,10 +1812,11 @@ class FusedMoE(CustomOp): return states if self.shared_experts is None: + assert not isinstance(final_hidden_states, tuple) return reduce_output(final_hidden_states) else: return ( - reduce_output(final_hidden_states[0]), + reduce_output(final_hidden_states[0], do_combine=False), reduce_output(final_hidden_states[1]), ) From dc625ea6b8f8072afd3e0923f7ef5a39be87fc96 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 9 Sep 2025 20:01:06 -0700 Subject: [PATCH 090/584] [Perf] Convert np array to torch tensor to index into block table for attn chunking (#24474) Signed-off-by: Yong Hoon Shin <yhshin@meta.com> --- vllm/v1/attention/backends/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index b286a4ba9fe54..8e3d530fc1f98 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -542,7 +542,14 @@ def make_local_attention_virtual_batches( 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) - block_table_local = block_table[batch_indices, block_indices]\ + + # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance + # regression when using numpy arrays (batch and block indices) to index into + # torch tensor (block_table). As a workaround, convert numpy arrays to torch + # tensor first, which recovers perf. 
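+    # (Illustrative form of the workaround, with hypothetical names: for a
+    # numpy index array `idx` into a torch tensor `t`, prefer
+    # `t[torch.from_numpy(idx)]` over `t[idx]`.)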
+ batch_indices_torch = torch.from_numpy(batch_indices) + block_indices_torch = torch.from_numpy(block_indices) + block_table_local = block_table[batch_indices_torch, block_indices_torch]\ .view(virtual_batches, -1) query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) From 41f160b9741f265564c665b4a008573d9e7afc38 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 9 Sep 2025 20:30:32 -0700 Subject: [PATCH 091/584] Add @heheda12345 to CODEOWNERS of KVCacheManager related code (#24546) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .github/CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b6b3e184bff25..f627ff46447e1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -30,6 +30,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/triton_attn.py @tdoublep +/vllm/v1/core @heheda12345 +/vllm/v1/kv_cache_interface.py @heheda12345 # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -46,6 +48,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm +/tests/v1/core @heheda12345 /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep From 7e7db04310922f45524d4bfbe72f1dc7205ae7d6 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Tue, 9 Sep 2025 20:33:10 -0700 Subject: [PATCH 092/584] [CI] Retry flaky fp8 cutlass mla tests (#24536) Signed-off-by: Nick Hill <nhill@redhat.com> --- tests/kernels/test_cutlass_mla_decode.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 820dac0e6cec9..5078bd730a1a3 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -49,7 +49,13 @@ CUTLASS_MLA_UNSUPPORTED_REASON = \ @pytest.mark.parametrize("block_size", [64]) @pytest.mark.parametrize("causal", [True]) @pytest.mark.parametrize("varlen", [False, True]) -@pytest.mark.parametrize("torch_dtype", [torch.bfloat16, torch.float8_e4m3fn]) +@pytest.mark.parametrize( + "torch_dtype", + [ + torch.bfloat16, + # fp8 can have occasional precision-related failures. 
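+        # flaky(reruns=2) retries a failed case up to twice before
+        # reporting failure; the marker is provided by the
+        # pytest-rerunfailures plugin, assumed present in the test env.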
+ pytest.param(torch.float8_e4m3fn, marks=pytest.mark.flaky(reruns=2)) + ]) @torch.inference_mode() def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen, torch_dtype): From 3c2156b3af0e1fd8060bdfe5980d680ebc550a43 Mon Sep 17 00:00:00 2001 From: Ignacio Sica <mignacio.sica@gmail.com> Date: Wed, 10 Sep 2025 00:50:21 -0300 Subject: [PATCH 093/584] [Hardware][Apple-CPU] Enable native bfloat16 on Apple Silicon (M2 and later) (#24129) Signed-off-by: ignaciosica <mignacio.sica@gmail.com> --- vllm/platforms/cpu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 12d5e0bf08652..31d626a9e9667 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -75,12 +75,12 @@ class CpuPlatform(Platform): def supported_dtypes(self) -> list[torch.dtype]: if self.get_cpu_architecture() == CpuArchEnum.POWERPC: return [torch.bfloat16, torch.float32] - elif sys.platform.startswith( - "darwin") and self.get_cpu_architecture() == CpuArchEnum.ARM: - # TODO: change this condition to check if the platform support bf16 - # instead of checking the OS. For instance M2 shall supports bf16 - # already. But we need to modify `cpu_extension.cmake` to activate - # the feature in the build. + elif (self.get_cpu_architecture() == CpuArchEnum.ARM + and sys.platform.startswith("darwin")): + if (subprocess.check_output( + ["sysctl -n hw.optional.arm.FEAT_BF16"], + shell=True).strip() == b"1"): + return [torch.bfloat16, torch.float16, torch.float32] return [torch.float16, torch.float32] # x86/aarch64 CPU has supported both bf16 and fp16 natively. return [torch.bfloat16, torch.float16, torch.float32] From f88e84016fdd3f1ab081c18357096576b1f8fb71 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Tue, 9 Sep 2025 21:07:13 -0700 Subject: [PATCH 094/584] [BugFix] Fix async core engine client finalizer (#24540) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/v1/engine/core_client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 65f7abc97110c..605bedaf10e66 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -347,8 +347,9 @@ class BackgroundResources: if isinstance(self.output_socket, zmq.asyncio.Socket): # Async case. - loop = self.output_socket._get_loop() - asyncio.get_running_loop() + loop = self.output_queue_task._loop \ + if self.output_queue_task else None + sockets = (self.output_socket, self.input_socket, self.first_req_send_socket, self.first_req_rcv_socket, self.stats_update_socket) @@ -359,11 +360,12 @@ class BackgroundResources: close_sockets(sockets) for task in tasks: if task is not None and not task.done(): - task.cancel() + with contextlib.suppress(Exception): + task.cancel() if in_loop(loop): close_sockets_and_tasks() - elif not loop.is_closed(): + elif loop and not loop.is_closed(): loop.call_soon_threadsafe(close_sockets_and_tasks) else: # Loop has been closed, try to clean up directly. 
From 83dd28aae4c3cd8442f5be42e0d77263b68c4236 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Tue, 9 Sep 2025 21:07:33 -0700 Subject: [PATCH 095/584] [CI] Adjust threshold for flaky ngram spec decoding test (#24528) Signed-off-by: Nick Hill <nhill@redhat.com> --- tests/v1/e2e/test_spec_decode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index cd1d34fc6c3ec..6848f204358cd 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -117,9 +117,9 @@ def test_ngram_correctness( print(f"ref_output: {ref_output.outputs[0].text}") print(f"spec_output: {spec_output.outputs[0].text}") - # Heuristic: expect at least 70% of the prompts to match exactly + # Heuristic: expect at least 68% of the prompts to match exactly # Upon failure, inspect the outputs to check for inaccuracy. - assert matches > int(0.7 * len(ref_outputs)) + assert matches >= int(0.68 * len(ref_outputs)) del spec_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() From b4a01aaf95f54bba90cb0b072e9254ddb998af8f Mon Sep 17 00:00:00 2001 From: Yihua Cheng <yihua98@uchicago.edu> Date: Tue, 9 Sep 2025 21:23:37 -0700 Subject: [PATCH 096/584] [KV Connector] More async support for `get_num_new_matched_tokens` (#23620) Signed-off-by: ApostaC <yihua98@uchicago.edu> --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 9 ++++++--- .../kv_transfer/kv_connector/v1/lmcache_connector.py | 2 +- .../kv_transfer/kv_connector/v1/multi_connector.py | 6 +++++- .../kv_transfer/kv_connector/v1/nixl_connector.py | 2 +- .../kv_connector/v1/shared_storage_connector.py | 4 ++-- vllm/v1/core/sched/scheduler.py | 8 ++++++++ 6 files changed, 23 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index f3f493144d283..cd4561154b78b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -243,7 +243,7 @@ class KVConnectorBase_V1(ABC): self, request: "Request", num_computed_tokens: int, - ) -> tuple[int, bool]: + ) -> tuple[Optional[int], bool]: """ Get number of new tokens that can be loaded from the external KV cache beyond the num_computed_tokens. @@ -255,8 +255,11 @@ class KVConnectorBase_V1(ABC): Returns: A tuple with the following elements: - - The number of tokens that can be loaded from the - external KV cache beyond what is already computed. + - An optional number of tokens that can be loaded from the + external KV cache beyond what is already computed. + If None, it means that the connector needs more time to + determine the number of matched tokens, and the scheduler + should query for this request again later. - `True` if external KV cache tokens will be loaded asynchronously (between scheduler steps). Must be 'False' if the first element is 0. 
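A connector can use the new ``Optional`` contract to overlap a slow external lookup with scheduling. A hypothetical sketch (``_lookups`` and ``_start_lookup`` are assumed helpers, not vLLM API; other abstract methods omitted):

```python
class AsyncLookupConnector(KVConnectorBase_V1):
    def get_num_new_matched_tokens(self, request, num_computed_tokens):
        lookup = self._lookups.get(request.request_id)
        if lookup is None:
            # Start the lookup and ask the scheduler to query again later.
            self._lookups[request.request_id] = self._start_lookup(request)
            return None, False
        if not lookup.done():
            return None, False  # still pending
        num_external_tokens = lookup.result()
        # Any hit beyond the computed prefix is loaded asynchronously.
        return num_external_tokens, num_external_tokens > 0
```

The scheduler-side handling of the ``None`` case appears in the scheduler.py hunk further below: such a request is popped from the waiting queue and requeued so it is simply re-evaluated on a later scheduling step.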
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index e838ac2499c04..c99f538ee4185 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -110,7 +110,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self, request: "Request", num_computed_tokens: int, - ) -> tuple[int, bool]: + ) -> tuple[Optional[int], bool]: """ Get number of new tokens that can be loaded from the external KV cache beyond the num_computed_tokens. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index edbff4e4340f4..200b4bf874d89 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -143,11 +143,15 @@ class MultiConnector(KVConnectorBase_V1): self, request: "Request", num_computed_tokens: int, - ) -> tuple[int, bool]: + ) -> tuple[Optional[int], bool]: to_return = (0, False) for i, c in enumerate(self._connectors): toks, load_async = c.get_num_new_matched_tokens( request, num_computed_tokens) + # If there is a connector still looking up the matches, + # we return None to indicate that we are not done yet. + if toks is None: + return (None, False) # The first connector that has new matched tokens will be assigned # to this request. if to_return[0] == 0 and toks > 0: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 20d1e31a7106b..17f5be76ce400 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -162,7 +162,7 @@ class NixlConnector(KVConnectorBase_V1): def get_num_new_matched_tokens( self, request: "Request", - num_computed_tokens: int) -> tuple[int, bool]: + num_computed_tokens: int) -> tuple[Optional[int], bool]: assert self.connector_scheduler is not None return self.connector_scheduler.get_num_new_matched_tokens( request, num_computed_tokens) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index fd79387269d56..9f4da613f8b62 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -3,7 +3,7 @@ import hashlib import os from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import safetensors import torch @@ -238,7 +238,7 @@ class SharedStorageConnector(KVConnectorBase_V1): self, request: "Request", num_computed_tokens: int, - ) -> tuple[int, bool]: + ) -> tuple[Optional[int], bool]: """ Get number of new tokens that can be loaded from the external KV cache beyond the num_computed_tokens. 
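# --- Hedged sketch (not part of the patch) --------------------------------
# The contract this patch introduces, before the scheduler-side hunk below:
# a connector whose external KV lookup is still in flight may return
# (None, False), and the scheduler re-queues the request instead of
# scheduling it. The class and its storage here are hypothetical.
from typing import Optional

class AsyncLookupConnector:
    """Toy connector that reports None until its lookup completes."""

    def __init__(self) -> None:
        self._results: dict[str, int] = {}

    def get_num_new_matched_tokens(
            self, request_id: str,
            num_computed_tokens: int) -> tuple[Optional[int], bool]:
        if request_id not in self._results:
            self._start_lookup(request_id, num_computed_tokens)
            return (None, False)  # not ready yet; scheduler should retry
        matched = self._results[request_id]
        return (matched, matched > 0)  # load_async is False when 0 matched

    def _start_lookup(self, request_id: str, num_computed_tokens: int):
        # Placeholder: a real connector would kick off an async query of
        # external KV storage and record the match count on completion.
        self._results[request_id] = 0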
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2d40e96632c95..ed7c16dc520fe 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -387,6 +387,14 @@ class Scheduler(SchedulerInterface): self.connector.get_num_new_matched_tokens( request, num_new_local_computed_tokens)) + if num_external_computed_tokens is None: + # The request cannot be scheduled because + # the KVConnector couldn't determine + # the number of matched tokens. + self.waiting.pop_request() + skipped_waiting_requests.prepend_request(request) + continue + # Total computed tokens (local + external). num_computed_tokens = (num_new_local_computed_tokens + num_external_computed_tokens) From 309d7aa401703a051301dbad471a6543021a760f Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Wed, 10 Sep 2025 12:24:11 +0800 Subject: [PATCH 097/584] [P/D] MultiConnector supports shutdown (#24425) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- .../kv_transfer/kv_connector/v1/multi_connector.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 200b4bf874d89..616d158d67670 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -88,6 +88,18 @@ class MultiConnector(KVConnectorBase_V1): for c in self._connectors: c.clear_connector_metadata() + def shutdown(self): + exception: Optional[Exception] = None + for c in self._connectors: + try: + c.shutdown() + except Exception as e: + logger.exception("Exception during connector %s shutdown.", + c.__class__.__name__) + exception = e + if exception: + raise exception + # ============================== # Worker-side methods # ============================== From 53b42f4102dadd77e16a439b282addab19e2a625 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Tue, 9 Sep 2025 21:24:23 -0700 Subject: [PATCH 098/584] [BugFix][Spec Decode] Fix out-of-range index triggered by eagle3; re-enable test for LlamaForCausalLMEagle3 (#24392) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> --- tests/models/registry.py | 9 ++-- tests/v1/e2e/test_spec_decode.py | 55 ++++++++++------------ vllm/config/__init__.py | 9 +++- vllm/model_executor/models/llama.py | 17 ++++++- vllm/model_executor/models/llama_eagle3.py | 4 ++ vllm/model_executor/models/registry.py | 3 +- vllm/transformers_utils/configs/eagle.py | 2 + 7 files changed, 58 insertions(+), 41 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 755a37b109d76..a5e83bc11f144 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -602,11 +602,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), - # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 - # "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 - # trust_remote_code=True, - # speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501 - # tokenizer="Qwen/Qwen3-8B"), + "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + trust_remote_code=True, + speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + tokenizer="Qwen/Qwen3-8B"), "EagleLlama4ForCausalLM": 
_HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 6848f204358cd..469464f777fb2 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -125,37 +125,30 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize( - ["model_setup", "mm_enabled"], - [ - # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 - # (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), - (("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - (("eagle", "eagle618/deepseek-v3-random", - "eagle618/eagle-deepseek-v3-random", 1), False), - ], - ids=[ - # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 - # "qwen3_eagle3", - "llama3_eagle", - "llama3_eagle3", - "llama4_eagle", - "llama4_eagle_mm", - "deepseek_eagle" - ]) +@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ + (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), +], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", + "llama4_eagle", "llama4_eagle_mm", + "deepseek_eagle" + ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4f4673ac6e67a..651cd7339101b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2191,9 +2191,14 @@ class SpeculativeConfig: # Automatically detect the method if self.method in ('eagle', 'eagle3'): pass - elif "eagle-" in self.draft_model_config.model.lower() or \ - "eagle3-" in self.draft_model_config.model.lower(): + # examples: + # yuhuili/EAGLE-LLaMA3-Instruct-8B + # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B + # AngelSlim/Qwen3-8B_eagle3 + elif "eagle-" in self.draft_model_config.model.lower(): self.method = "eagle" + elif "eagle3" in self.draft_model_config.model.lower(): + self.method = "eagle3" elif self.draft_model_config.hf_config.model_type == "medusa": self.method = "medusa" elif (self.draft_model_config.hf_config.model_type == diff --git a/vllm/model_executor/models/llama.py 
b/vllm/model_executor/models/llama.py index a22bde194f5de..96530562b072c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -171,7 +171,22 @@ class LlamaAttention(nn.Module): sliding_window = None if layer_types := getattr(config, "layer_types", None): - is_sliding = layer_types[layer_idx] == "sliding_attention" + # Fix for Eagle3 compatibility: + # for draft models, subtract target layer count + # to get draft-relative layer index starting from 0 + if hasattr(config, 'target_layer_count'): + # This is a draft model, + # adjust layer_idx to be relative to draft layers + effective_layer_idx = layer_idx - config.target_layer_count + else: + # This is a target model, use layer_idx directly + effective_layer_idx = layer_idx + assert effective_layer_idx < len(layer_types), \ + f"effective_layer_idx: {effective_layer_idx} \ + is out of bounds for layer_types: {layer_types}" + + is_sliding = layer_types[ + effective_layer_idx] == "sliding_attention" if is_sliding: sliding_window = config.sliding_window diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 572930c39a846..bceb6cc42768e 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -199,6 +199,10 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): speculative_config.draft_model_config.hf_config target_layer_num = vllm_config.model_config.get_num_layers( vllm_config.parallel_config) + + # Store target layer count in draft config for + # proper layer_types indexing in draft models + self.config.target_layer_count = target_layer_num self.model = LlamaModel(vllm_config=vllm_config, prefix="model", start_layer_id=target_layer_num) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 43075956b450b..ed7797ce9d4f1 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -277,8 +277,7 @@ _SPECULATIVE_DECODING_MODELS = { "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), - # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 - # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 6aabf9e5262e6..444ed70de3d0c 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -46,6 +46,7 @@ class EAGLEConfig(PretrainedConfig): # Eagle model name should follow naming convention of # LlamaForCausalLM -> EagleLlamaForCausalLM # LlamaForCausalLM -> Eagle3LlamaForCausalLM + # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3 if method == "eagle": assert self.model is not None, \ "model should not be None when method is eagle" @@ -53,6 +54,7 @@ class EAGLEConfig(PretrainedConfig): f"Eagle{arch}" if not arch.startswith("Eagle") \ else arch for arch in self.model.architectures ] + elif method == "eagle3": assert self.model is not None, \ "model should not be None when method is eagle3" From 
0efdb5c3bad240121d91083189565df0b82502c2 Mon Sep 17 00:00:00 2001 From: Wei <wwei6@meta.com> Date: Tue, 9 Sep 2025 21:27:53 -0700 Subject: [PATCH 099/584] [gpt-oss] Cache permute indices for faster MXFP4 MoE layer loading (#24154) Signed-off-by: Wei Wei <wwei6@meta.com> --- tests/kernels/moe/test_mxfp4_moe.py | 101 +++++++++++++++--- .../layers/quantization/mxfp4.py | 78 ++++++++++---- 2 files changed, 145 insertions(+), 34 deletions(-) diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index c29bed3dd6b32..882b034e2f230 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -24,6 +24,8 @@ if TRTLLM_GEN_MXFP4_AVAILABLE: next_positive_power_of_2, reorder_rows_for_gated_act_gemm, shuffle_matrix_a, shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe) + from flashinfer.fp4_quantization import nvfp4_block_scale_interleave + from flashinfer.fused_moe.core import _maybe_get_cached_w2_permute_indices @dataclass @@ -204,6 +206,7 @@ def tg_mxfp4_moe( alpha, beta, limit, + transpose_optimized: bool = False, ) -> torch.Tensor: sf_block_size = 32 assert (w13_weight.dim() == 3 and w13_weight.shape[0] == num_experts @@ -267,22 +270,85 @@ def tg_mxfp4_moe( gemm1_bias_shuffled = [] gemm2_bias_shuffled = [] epilogue_tile_m = 128 # FIXME: this depends on the kernel internals - for i in range(num_experts): - gemm1_weights_shuffled.append( - shuffle_matrix_a(w13_weight[i].view(torch.uint8), epilogue_tile_m)) - gemm1_scales_shuffled.append( - shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8), - epilogue_tile_m)) + _cache_permute_indices: dict[torch.Size, torch.Tensor] = {} + if transpose_optimized: + for i in range(num_experts): + # w13 weight shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w13_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm1_weights_shuffled.append(w13_weight[i].view( + torch.uint8)[permute_indices.to( + w13_weight.device)].contiguous()) + # w13 scale shuffling + permute_sf_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm1_scales_shuffled.append( + nvfp4_block_scale_interleave(w13_weight_scale[i].view( + torch.uint8)[permute_sf_indices.to( + w13_weight_scale.device)].contiguous())) + # w13 bias shuffling + permute_bias_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm1_bias_shuffled.append(w13_bias[i].clone().reshape( + -1, 1)[permute_bias_indices.to(w13_bias.device)].contiguous()) + # w2 weight shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w2_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm2_weights_shuffled.append(w2_weight[i].view( + torch.uint8)[permute_indices.to( + w2_weight.device)].contiguous()) + # w2 scale shuffling + permute_sf_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm2_scales_shuffled.append( + nvfp4_block_scale_interleave(w2_weight_scale[i].view( + torch.uint8)[permute_sf_indices.to( + w2_weight_scale.device)].contiguous())) + # w2 bias shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + _cache_permute_indices, + w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm2_bias_shuffled.append(w2_bias[i].clone().reshape( + -1, 
1)[permute_indices.to(w2_bias.device)].contiguous()) - gemm2_weights_shuffled.append( - shuffle_matrix_a(w2_weight[i].view(torch.uint8), epilogue_tile_m)) - gemm2_scales_shuffled.append( - shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8), - epilogue_tile_m)) - gemm1_bias_shuffled.append( - shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m)) - gemm2_bias_shuffled.append( - shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m)) + else: + for i in range(num_experts): + gemm1_weights_shuffled.append( + shuffle_matrix_a(w13_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_scales_shuffled.append( + shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + + gemm2_weights_shuffled.append( + shuffle_matrix_a(w2_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_scales_shuffled.append( + shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_bias_shuffled.append( + shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m)) + gemm2_bias_shuffled.append( + shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m)) w13_weight = torch.stack(gemm1_weights_shuffled) w13_weight_scale = torch.stack(gemm1_scales_shuffled).reshape( @@ -356,6 +422,7 @@ def check_accuracy(a, b, atol, rtol, percent): @pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), (1.702, 1.0, 7.0)]) @pytest.mark.parametrize("act_type", ['mxfp8', 'bf16']) +@pytest.mark.parametrize("transpose_optimized", [False, True]) @pytest.mark.skipif( not TRTLLM_GEN_MXFP4_AVAILABLE, reason="nvidia gpu and compute capability sm100 is required for this test") @@ -369,6 +436,7 @@ def test_trtllm_gen_mxfp4_fused_moe( beta: float, limit: Optional[float], act_type: str, + transpose_optimized: bool, ): seed = 42 torch.manual_seed(seed) @@ -470,6 +538,7 @@ def test_trtllm_gen_mxfp4_fused_moe( act_type, alpha=alpha, beta=beta, - limit=limit) + limit=limit, + transpose_optimized=transpose_optimized) # relatively loose check since the mxfp4 quantization is less accurate check_accuracy(ref_result, tg_result, atol=0, rtol=0.3, percent=0.8) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 889c15df3c878..74922756afe56 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -122,6 +122,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): "MXFP4 MoE is enabled on Blackwell but FlashInfer " "is not available. This may result in degraded performance. 
" "Please `pip install vllm[flashinfer]` for best results.") + self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {} def _should_use_marlin(self): if envs.VLLM_MXFP4_USE_MARLIN is not None: @@ -266,7 +267,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if self.use_marlin: prepare_moe_fp4_layer_for_marlin(layer) elif should_use_flashinfer_mxfp4(): - from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a + from flashinfer.fp4_quantization import ( + nvfp4_block_scale_interleave) + from flashinfer.fused_moe.core import ( + _maybe_get_cached_w2_permute_indices) layer.gemm1_alpha = Parameter(torch.tensor( [1.702] * self.num_experts, dtype=torch.float32).cuda(), requires_grad=False) @@ -343,25 +347,63 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): gemm2_bias_shuffled = [] epilogue_tile_m = 128 # FIXME: this depends on the kernel internals for i in range(self.num_experts): - gemm1_weights_mxfp4_shuffled.append( - shuffle_matrix_a(w13_weight[i].view(torch.uint8), - epilogue_tile_m)) + # w13 weight shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w13_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm1_weights_mxfp4_shuffled.append(w13_weight[i].view( + torch.uint8)[permute_indices.to( + w13_weight.device)].contiguous()) + # w13 scale shuffling + permute_sf_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) gemm1_scales_mxfp4_shuffled.append( - shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8), - epilogue_tile_m)) - gemm1_bias_shuffled.append( - shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1), - epilogue_tile_m)) - - gemm2_weights_mxfp4_shuffled.append( - shuffle_matrix_a(w2_weight[i].view(torch.uint8), - epilogue_tile_m)) + nvfp4_block_scale_interleave(w13_weight_scale[i].view( + torch.uint8)[permute_sf_indices.to( + w13_weight_scale.device)].contiguous())) + # w13 bias shuffling + permute_bias_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm1_bias_shuffled.append(w13_bias[i].clone().reshape( + -1, + 1)[permute_bias_indices.to(w13_bias.device)].contiguous()) + # w2 weight shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w2_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm2_weights_mxfp4_shuffled.append(w2_weight[i].view( + torch.uint8)[permute_indices.to( + w2_weight.device)].contiguous()) + # w2 scale shuffling + permute_sf_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) gemm2_scales_mxfp4_shuffled.append( - shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8), - epilogue_tile_m)) - gemm2_bias_shuffled.append( - shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1), - epilogue_tile_m)) + nvfp4_block_scale_interleave(w2_weight_scale[i].view( + torch.uint8)[permute_sf_indices.to( + w2_weight_scale.device)].contiguous())) + # w2 bias shuffling + permute_indices = _maybe_get_cached_w2_permute_indices( + self._cache_permute_indices, + w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm2_bias_shuffled.append(w2_bias[i].clone().reshape( + -1, 1)[permute_indices.to(w2_bias.device)].contiguous()) w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled) w13_weight_scale = torch.stack( From 
009d689b0c30bc16301e39ca258219ecfbd77f20 Mon Sep 17 00:00:00 2001 From: Chenheli Hua <huachenheli@outlook.com> Date: Tue, 9 Sep 2025 21:36:09 -0700 Subject: [PATCH 100/584] [Core] Simplify and unify mm uuid handling & auto-generated mm hash overrides processing. (#24271) Signed-off-by: Chenheli Hua <huachenheli@outlook.com> --- .../test_processor_multi_modal_uuids.py | 12 +-- vllm/inputs/preprocess.py | 88 ++++++++----------- vllm/model_executor/models/deepseek_vl2.py | 9 +- vllm/model_executor/models/h2ovl.py | 8 +- vllm/model_executor/models/llava.py | 7 +- vllm/model_executor/models/mllama.py | 6 +- vllm/model_executor/models/paligemma.py | 7 +- vllm/model_executor/models/pixtral.py | 6 +- vllm/model_executor/models/terratorch.py | 6 +- vllm/model_executor/models/transformers.py | 10 +-- vllm/model_executor/models/voxtral.py | 7 +- vllm/multimodal/processing.py | 50 +++++------ vllm/v1/engine/processor.py | 21 +++-- 13 files changed, 110 insertions(+), 127 deletions(-) diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index 970a59eca8ece..955c74d262a09 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -152,8 +152,8 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( *, tokenization_kwargs=None, lora_request=None, - mm_hash_overrides=None): - captured["mm_hash_overrides"] = mm_hash_overrides + mm_uuids=None): + captured["mm_uuids"] = mm_uuids # Minimal processed inputs for decoder-only flow return {"type": "token", "prompt_token_ids": [1]} @@ -180,7 +180,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( params=SamplingParams(), ) - assert captured["mm_hash_overrides"] == mm_uuids + assert captured["mm_uuids"] == mm_uuids def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): @@ -196,8 +196,8 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): *, tokenization_kwargs=None, lora_request=None, - mm_hash_overrides=None): - captured["mm_hash_overrides"] = mm_hash_overrides + mm_uuids=None): + captured["mm_uuids"] = mm_uuids return {"type": "token", "prompt_token_ids": [1]} monkeypatch.setattr(processor.input_preprocessor, @@ -223,7 +223,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ) # Expect request-id-based overrides are passed through - assert captured["mm_hash_overrides"] == { + assert captured["mm_uuids"] == { "image": [f"{request_id}-image-0", f"{request_id}-image-1"], "video": [f"{request_id}-video-0"], } diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index ec82be831e0de..22287aa6f41e0 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -258,8 +258,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -281,7 +280,7 @@ class InputPreprocessor: mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = mm_input["mm_hashes"] @@ -302,8 +301,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: 
Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Async version of @@ -325,7 +323,7 @@ class InputPreprocessor: mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = mm_input["mm_hashes"] @@ -390,8 +388,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = self._truncate_inputs( parsed_content["prompt_token_ids"], tokenization_kwargs) @@ -404,7 +401,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: inputs = token_inputs(prompt_token_ids=prompt_token_ids) @@ -420,8 +417,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = self._truncate_inputs( parsed_content["prompt_token_ids"], tokenization_kwargs) @@ -434,7 +430,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: inputs = token_inputs(prompt_token_ids=prompt_token_ids, ) @@ -450,8 +446,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -463,7 +458,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: prompt_token_ids = self._tokenize_prompt( @@ -487,8 +482,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -500,7 +494,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: prompt_token_ids = await self._tokenize_prompt_async( @@ -524,8 +518,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. 
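# --- Hedged sketch (not part of the patch) --------------------------------
# The rename running through this file is mechanical: mm_hash_overrides
# becomes mm_uuids, typed as MultiModalUUIDDict instead of the looser
# union. A sketch of the shape that dictionary carries (None entries mean
# "compute a content hash for this item"; the real alias lives in
# vllm/multimodal/inputs.py and may differ):
from typing import Optional, Union

MultiModalUUIDDict = dict[str, Union[str, list[Optional[str]]]]

mm_uuids: MultiModalUUIDDict = {
    "image": ["user-img-1", None],  # second image falls back to hashing
    "video": "user-vid-1",          # a bare string is promoted to a list
}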
@@ -547,21 +540,21 @@ class InputPreprocessor: return self._process_tokens( parsed["content"], lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "text": return self._process_text( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "str": return self._process_text( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) assert_never(parsed) @@ -572,8 +565,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: """ Async version of @@ -587,21 +579,21 @@ class InputPreprocessor: return await self._process_tokens_async( parsed["content"], lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "text": return await self._process_text_async( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "str": return await self._process_text_async( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) assert_never(parsed) @@ -712,8 +704,7 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -755,7 +746,7 @@ class InputPreprocessor: encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None @@ -771,7 +762,7 @@ class InputPreprocessor: inputs = self._prompt_to_llm_inputs( prompt, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -788,8 +779,7 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> EncoderDecoderInputs: """ Async version of @@ -802,7 +792,7 @@ class InputPreprocessor: encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if (decoder_input := prompt["decoder_prompt"]) is None: @@ -812,7 +802,7 @@ class InputPreprocessor: decoder_task = self._prompt_to_llm_inputs_async( decoder_input, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) encoder_inputs, decoder_inputs = await asyncio.gather( @@ -828,7 +818,7 @@ class InputPreprocessor: inputs = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, - 
mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -856,8 +846,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -878,7 +867,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -889,8 +878,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: """ Async version of @@ -900,7 +888,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -911,8 +899,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.model_config.is_encoder_decoder: @@ -921,7 +908,7 @@ class InputPreprocessor: return self._process_encoder_decoder_prompt( prompt, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -933,7 +920,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) async def preprocess_async( @@ -942,8 +929,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: """ Async version of @@ -955,7 +941,7 @@ class InputPreprocessor: return await self._process_encoder_decoder_prompt_async( prompt, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -967,7 +953,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) def clear_cache(self) -> None: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 5eab02b17151c..d7ae8206baca5 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,8 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems, MultiModalUUIDDict, + NestedTensors) from vllm.multimodal.parse import 
(ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -290,7 +291,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is @@ -302,7 +303,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return super()._cached_apply_hf_processor( @@ -310,7 +311,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 306775af68065..b42df3ad86508 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargsItems +from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalProcessingInfo, @@ -479,7 +479,7 @@ class H2OVLMultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is @@ -491,7 +491,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return super()._cached_apply_hf_processor( @@ -499,7 +499,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8a847a6180f3a..d692b2783048f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -24,7 +24,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargsItems) + MultiModalInputs, MultiModalKwargsItems, + MultiModalUUIDDict) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import 
(BaseMultiModalProcessor, @@ -795,7 +796,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -810,7 +811,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index f441287a4d089..68aa16f8b9ecf 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -57,7 +57,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, - MultiModalKwargsItems) + MultiModalKwargsItems, MultiModalUUIDDict) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseProcessingInfo, @@ -184,13 +184,13 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalEncDecInputs: mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) image_token_id = self.info.get_hf_config().image_token_index # Check that the number of image tokens in the decoder prompt matches diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index b74a09ee92c33..d6eec77ebcee5 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -12,7 +12,8 @@ from vllm.logger import init_logger from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargsItems) + MultiModalInputs, MultiModalKwargsItems, + MultiModalUUIDDict) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -203,13 +204,13 @@ class PaliGemmaMultiModalProcessor( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) prompt_token_ids = mm_inputs["prompt_token_ids"] tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index e7f5799a80067..142d3251bc67a 100644 --- a/vllm/model_executor/models/pixtral.py +++ 
b/vllm/model_executor/models/pixtral.py @@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - NestedTensors) + MultiModalUUIDDict, NestedTensors) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -316,14 +316,14 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 453da1a51d987..b9dfa8e9b6f51 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -36,7 +36,7 @@ from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems, - PlaceholderRange) + MultiModalUUIDDict, PlaceholderRange) from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -164,7 +164,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: if "image" in mm_data: image_data = mm_data["image"] @@ -177,7 +177,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5ad0482330ecd..a386f47e1929f 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -44,7 +44,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, PlaceholderRange) + MultiModalInputs, MultiModalUUIDDict, + PlaceholderRange) from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) @@ -347,7 +348,7 @@ class 
MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -415,9 +416,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): num_image_patches), ) # Use overrides if provided; fallback to data-dependent hashing. - mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else - self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs)) + mm_hashes = (mm_uuids if mm_uuids is not None else self._hash_mm_items( + mm_items, hf_processor_mm_kwargs, tokenization_kwargs)) return MultiModalInputs( type="multimodal", diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index f3731b389cfe0..2831c4df78ba7 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -31,7 +31,8 @@ from vllm.model_executor.models.whisper import WhisperEncoder from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems, MultiModalUUIDDict, + NestedTensors) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -290,14 +291,14 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0531b7bd9f0a7..e5db356b635f3 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1022,13 +1022,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) def _get_data_parser(self) -> MultiModalDataParser: """ @@ -1364,8 +1363,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1). 
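# --- Hedged sketch (not part of the patch) --------------------------------
# The hunk below only renames variables inside the hashing loop; the
# decision rule is unchanged. Restated standalone, with hypothetical
# helper names:
import hashlib
from typing import Optional

def hash_item(obj: object) -> str:
    # Stand-in for MultiModalHasher.hash_kwargs in the real code.
    return hashlib.sha256(repr(obj).encode()).hexdigest()

def resolve_ids(items: list, uuids: Optional[list[Optional[str]]],
                has_processor_kwargs: bool) -> list[str]:
    resolved = []
    for i, item in enumerate(items):
        item_uuid = uuids[i] if uuids else None
        if item_uuid is not None and not has_processor_kwargs:
            resolved.append(item_uuid)  # trust the caller-supplied UUID
        else:
            # Processor/tokenization kwargs can change the processed
            # inputs, so hash even when a UUID exists -- but hash the
            # short UUID string instead of the raw item when available.
            seed = item_uuid if item_uuid is not None else item
            resolved.append(hash_item(seed))
    return resolved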
@@ -1376,30 +1374,30 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): model_id = self.info.model_id hashes: MultiModalHashes = {} - mm_hash_overrides = mm_hash_overrides or {} + mm_uuids = mm_uuids or {} for modality, items in mm_items.items(): - if modality in mm_hash_overrides: - mm_hashes = mm_hash_overrides[modality] - if isinstance(mm_hashes, str): - mm_hashes = [mm_hashes] + if modality in mm_uuids: + mm_uuids_per_modality = mm_uuids[modality] + if isinstance(mm_uuids_per_modality, str): + mm_uuids_per_modality = [mm_uuids_per_modality] # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] for i, item in enumerate(items): - mm_hash = mm_hashes[i] + item_uuid = mm_uuids_per_modality[i] - # NOTE: Even if a mm_hash is provided, we still compute a + # NOTE: Even if a item_uuid is provided, we still compute a # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` # are provided. This is because the processed multimodal # inputs can be different depending on the processor kwargs. - if mm_hash is None or \ + if item_uuid is None or \ hf_processor_mm_kwargs or \ tokenization_kwargs: # NOTE: use provided hash string to hash with kwargs # if available for better performance. - item = mm_hash if mm_hash is not None else item + item = item_uuid if item_uuid is not None else item computed.append( MultiModalHasher.hash_kwargs( model_id=model_id, @@ -1407,7 +1405,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **hf_processor_mm_kwargs, **tokenization_kwargs)) else: - computed.append(mm_hash) + computed.append(item_uuid) hashes[modality] = computed else: hashes[modality] = [ @@ -1514,8 +1512,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1539,7 +1536,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1562,8 +1559,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1578,13 +1574,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_missing_data_items = self._get_cache_missing_items( cache=cache, @@ -1785,8 +1781,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to 
be used in vLLM. @@ -1815,7 +1810,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1901,8 +1896,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1917,7 +1911,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index baade243140d5..8ce070e4d6fba 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -12,7 +12,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config -from vllm.multimodal.inputs import MultiModalFeatureSpec +from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams @@ -276,11 +276,11 @@ class Processor: # Remember that this backend was set automatically params.guided_decoding.backend_was_auto = True - def _maybe_build_mm_hash_overrides( + def _maybe_build_mm_uuids( self, request_id: str, prompt: PromptType, - ) -> Optional[dict[str, list[str]]]: + ) -> Optional[MultiModalUUIDDict]: """Build per-item multimodal hash overrides when enabled. In this case, multimodal data items are identified by their request id, modality and index rather than their content. @@ -303,13 +303,13 @@ class Processor: if not mm_data: return None - overrides: dict[str, list[str]] = {} + mm_uuids: MultiModalUUIDDict = {} for modality, data in mm_data.items(): n = len(data) if isinstance(data, list) else 1 - overrides[modality] = [ + mm_uuids[modality] = [ f"{request_id}-{modality}-{i}" for i in range(n) ] - return overrides + return mm_uuids def process_inputs( self, @@ -351,16 +351,15 @@ class Processor: if (self.model_config.multimodal_config and self.model_config.multimodal_config.mm_processor_cache_gb == 0 and not self.cache_config.enable_prefix_caching): - mm_hash_overrides = self._maybe_build_mm_hash_overrides( - request_id, prompt) + mm_uuids = self._maybe_build_mm_uuids(request_id, prompt) else: # Otherwise, use user-provided uuids as multimodal hash overrides # if provided. self._validate_multi_modal_uuids(prompt) if isinstance(prompt, dict): - mm_hash_overrides = prompt.get("multi_modal_uuids") + mm_uuids = prompt.get("multi_modal_uuids") else: - mm_hash_overrides = None + mm_uuids = None # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. 
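# --- Hedged sketch (not part of the patch) --------------------------------
# Usage sketch of _maybe_build_mm_uuids: with the processor cache and
# prefix caching both disabled, per-item UUIDs are synthesized from the
# request id, which is what the updated test at the top of this patch
# asserts. The data values here are placeholders.
request_id = "req-123"
mm_data = {"image": ["img0", "img1"], "video": ["vid0"]}

mm_uuids = {
    modality: [
        f"{request_id}-{modality}-{i}"
        for i in range(len(data) if isinstance(data, list) else 1)
    ]
    for modality, data in mm_data.items()
}
assert mm_uuids == {
    "image": ["req-123-image-0", "req-123-image-1"],
    "video": ["req-123-video-0"],
}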
@@ -370,7 +369,7 @@ class Processor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) from vllm.platforms import current_platform current_platform.validate_request( From 4377b1ae3b87aa7544e3707f9917389f0ea21246 Mon Sep 17 00:00:00 2001 From: pwschuurman <psch@google.com> Date: Tue, 9 Sep 2025 21:37:17 -0700 Subject: [PATCH 101/584] [Bugfix] Update Run:AI Model Streamer Loading Integration (#23845) Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai> Signed-off-by: Peter Schuurman <psch@google.com> Co-authored-by: Omer Dayan (SW-GPU) <omer@run.ai> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- setup.py | 6 +- .../test_runai_utils.py | 39 ++++++++ vllm/config/__init__.py | 64 ++++++------ vllm/engine/arg_utils.py | 9 +- .../model_loader/runai_streamer_loader.py | 20 ++-- vllm/transformers_utils/runai_utils.py | 99 +++++++++++++++++++ vllm/transformers_utils/s3_utils.py | 72 -------------- 7 files changed, 187 insertions(+), 122 deletions(-) create mode 100644 tests/runai_model_streamer_test/test_runai_utils.py create mode 100644 vllm/transformers_utils/runai_utils.py diff --git a/setup.py b/setup.py index 4ea0baa0b2204..eb313b7d219c7 100644 --- a/setup.py +++ b/setup.py @@ -656,8 +656,10 @@ setup( "bench": ["pandas", "datasets"], "tensorizer": ["tensorizer==2.10.1"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], - "runai": - ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], + "runai": [ + "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs", + "google-cloud-storage", "runai-model-streamer-s3", "boto3" + ], "audio": ["librosa", "soundfile", "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility diff --git a/tests/runai_model_streamer_test/test_runai_utils.py b/tests/runai_model_streamer_test/test_runai_utils.py new file mode 100644 index 0000000000000..bde77ff665063 --- /dev/null +++ b/tests/runai_model_streamer_test/test_runai_utils.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import glob +import os +import tempfile + +import huggingface_hub.constants + +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf) +from vllm.transformers_utils.runai_utils import (is_runai_obj_uri, + list_safetensors) + + +def test_is_runai_obj_uri(): + assert is_runai_obj_uri("gs://some-gcs-bucket/path") + assert is_runai_obj_uri("s3://some-s3-bucket/path") + assert not is_runai_obj_uri("nfs://some-nfs-path") + + +def test_runai_list_safetensors_local(): + with tempfile.TemporaryDirectory() as tmpdir: + huggingface_hub.constants.HF_HUB_OFFLINE = False + download_weights_from_hf("openai-community/gpt2", + allow_patterns=["*.safetensors", "*.json"], + cache_dir=tmpdir) + safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) + assert len(safetensors) > 0 + parentdir = [ + os.path.dirname(safetensor) for safetensor in safetensors + ][0] + files = list_safetensors(parentdir) + assert len(safetensors) == len(files) + + +if __name__ == "__main__": + test_is_runai_obj_uri() + test_runai_list_safetensors_local() diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 651cd7339101b..a859f44eb40d3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -48,8 +48,9 @@ from vllm.transformers_utils.config import ( is_interleaved, maybe_override_with_speculators_target_model, 
try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) -from vllm.transformers_utils.s3_utils import S3Model -from vllm.transformers_utils.utils import is_s3, maybe_model_redirect +from vllm.transformers_utils.runai_utils import (ObjectStorageModel, + is_runai_obj_uri) +from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, LazyLoader, common_broadcastable_dtype, random_uuid) @@ -556,15 +557,6 @@ class ModelConfig: "affect the random state of the Python process that " "launched vLLM.", self.seed) - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name(self.model, self.served_model_name) @@ -603,7 +595,16 @@ class ModelConfig: f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) - self.maybe_pull_model_tokenizer_for_s3(self.model, self.tokenizer) + self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + + if self.runner != "draft": + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code) if (backend := envs.VLLM_ATTENTION_BACKEND ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: @@ -832,41 +833,42 @@ class ModelConfig: """The architecture vllm actually used.""" return self._architecture - def maybe_pull_model_tokenizer_for_s3(self, model: str, - tokenizer: str) -> None: - """Pull model/tokenizer from S3 to temporary directory when needed. + def maybe_pull_model_tokenizer_for_runai(self, model: str, + tokenizer: str) -> None: + """Pull model/tokenizer from Object Storage to temporary + directory when needed. 
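+
+        Supported object storage schemes are ``s3://`` and ``gs://``.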
Args: model: Model name or path tokenizer: Tokenizer name or path """ - if not (is_s3(model) or is_s3(tokenizer)): + if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): return - if is_s3(model): - s3_model = S3Model() - s3_model.pull_files(model, - allow_pattern=["*.model", "*.py", "*.json"]) + if is_runai_obj_uri(model): + object_storage_model = ObjectStorageModel() + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"]) self.model_weights = model - self.model = s3_model.dir + self.model = object_storage_model.dir # If tokenizer is same as model, download to same directory if model == tokenizer: - s3_model.pull_files(model, - ignore_pattern=[ - "*.pt", "*.safetensors", "*.bin", - "*.tensors" - ]) - self.tokenizer = s3_model.dir + object_storage_model.pull_files(model, + ignore_pattern=[ + "*.pt", "*.safetensors", + "*.bin", "*.tensors" + ]) + self.tokenizer = object_storage_model.dir return # Only download tokenizer if needed and not already handled - if is_s3(tokenizer): - s3_tokenizer = S3Model() - s3_tokenizer.pull_files( + if is_runai_obj_uri(tokenizer): + object_storage_tokenizer = ObjectStorageModel() + object_storage_tokenizer.pull_files( model, ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) - self.tokenizer = s3_tokenizer.dir + self.tokenizer = object_storage_tokenizer.dir def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: if self._model_info.supports_multimodal: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 94c984116131d..a4b5abbb35a3f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1053,9 +1053,10 @@ class EngineArgs: SpeculatorsConfig) if self.speculative_config is None: - hf_config = get_config(self.hf_config_path or self.model, - self.trust_remote_code, self.revision, - self.code_revision, self.config_format) + hf_config = get_config( + self.hf_config_path or target_model_config.model, + self.trust_remote_code, self.revision, self.code_revision, + self.config_format) # if loading a SpeculatorsConfig, load the speculative_config # details from the config directly @@ -1065,7 +1066,7 @@ class EngineArgs: self.speculative_config = {} self.speculative_config[ "num_speculative_tokens"] = hf_config.num_lookahead_tokens - self.speculative_config["model"] = self.model + self.speculative_config["model"] = target_model_config.model self.speculative_config["method"] = hf_config.method else: return None diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 83e0f386c1082..19e73efc91077 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 -import glob import os from collections.abc import Generator from typing import Optional @@ -15,8 +14,8 @@ from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, runai_safetensors_weights_iterator) -from vllm.transformers_utils.s3_utils import glob as s3_glob -from vllm.transformers_utils.utils import is_s3 +from vllm.transformers_utils.runai_utils import (is_runai_obj_uri, + list_safetensors) class RunaiModelStreamerLoader(BaseModelLoader): @@ -53,27 +52,22 @@ class 
RunaiModelStreamerLoader(BaseModelLoader):
         If the model is not local, it will be downloaded."""
 
-        is_s3_path = is_s3(model_name_or_path)
+        is_object_storage_path = is_runai_obj_uri(model_name_or_path)
         is_local = os.path.isdir(model_name_or_path)
         safetensors_pattern = "*.safetensors"
         index_file = SAFE_WEIGHTS_INDEX_NAME
 
-        hf_folder = (model_name_or_path if
-                     (is_local or is_s3_path) else download_weights_from_hf(
+        hf_folder = (model_name_or_path if (is_local or is_object_storage_path)
+                     else download_weights_from_hf(
                          model_name_or_path,
                          self.load_config.download_dir,
                          [safetensors_pattern],
                          revision,
                          ignore_patterns=self.load_config.ignore_patterns,
                      ))
-        if is_s3_path:
-            hf_weights_files = s3_glob(path=hf_folder,
-                                       allow_pattern=[safetensors_pattern])
-        else:
-            hf_weights_files = glob.glob(
-                os.path.join(hf_folder, safetensors_pattern))
+        hf_weights_files = list_safetensors(path=hf_folder)
 
-        if not is_local and not is_s3_path:
+        if not is_local and not is_object_storage_path:
             download_safetensors_index_file_from_hf(
                 model_name_or_path, index_file, self.load_config.download_dir,
                 revision)
diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py
new file mode 100644
index 0000000000000..357c180ed35f2
--- /dev/null
+++ b/vllm/transformers_utils/runai_utils.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import shutil
+import signal
+import tempfile
+from typing import Optional
+
+from vllm.logger import init_logger
+from vllm.utils import PlaceholderModule
+
+logger = init_logger(__name__)
+
+SUPPORTED_SCHEMES = ['s3://', 'gs://']
+
+try:
+    from runai_model_streamer import list_safetensors as runai_list_safetensors
+    from runai_model_streamer import pull_files as runai_pull_files
+except (ImportError, OSError):
+    # see https://github.com/run-ai/runai-model-streamer/issues/26
+    # OSError will be raised on arm64 platform
+    runai_model_streamer = PlaceholderModule(
+        "runai_model_streamer")  # type: ignore[assignment]
+    runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
+    runai_list_safetensors = runai_model_streamer.placeholder_attr(
+        "list_safetensors")
+
+
+def list_safetensors(path: str = "") -> list[str]:
+    """
+    List the full names of the safetensors files under an object path.
+
+    Args:
+        path: The object storage path to list from.
+
+    Returns:
+        list[str]: List of full object storage paths of the safetensors
+        files found under the given path.
+    """
+    return runai_list_safetensors(path)
+
+
+def is_runai_obj_uri(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith(tuple(SUPPORTED_SCHEMES))
+
+
+class ObjectStorageModel:
+    """
+    A class representing an ObjectStorage model mirrored into a
+    temporary directory.
+
+    Attributes:
+        dir: The created temporary directory.
+
+    Methods:
+        pull_files(): Pull model from object storage to the temporary
+        directory. 
+ """ + + def __init__(self) -> None: + for sig in (signal.SIGINT, signal.SIGTERM): + existing_handler = signal.getsignal(sig) + signal.signal(sig, self._close_by_signal(existing_handler)) + + self.dir = tempfile.mkdtemp() + + def __del__(self): + self._close() + + def _close(self) -> None: + if os.path.exists(self.dir): + shutil.rmtree(self.dir) + + def _close_by_signal(self, existing_handler=None): + + def new_handler(signum, frame): + self._close() + if existing_handler: + existing_handler(signum, frame) + + return new_handler + + def pull_files(self, + model_path: str = "", + allow_pattern: Optional[list[str]] = None, + ignore_pattern: Optional[list[str]] = None) -> None: + """ + Pull files from object storage into the temporary directory. + + Args: + model_path: The object storage path of the model. + allow_pattern: A list of patterns of which files to pull. + ignore_pattern: A list of patterns of which files not to pull. + + """ + if not model_path.endswith("/"): + model_path = model_path + "/" + runai_pull_files(model_path, self.dir, allow_pattern, ignore_pattern) diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index f95aae7815e0b..62c87c167e682 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,11 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -import os -import shutil -import signal -import tempfile -from pathlib import Path from typing import Optional from vllm.utils import PlaceholderModule @@ -93,70 +88,3 @@ def list_files( paths = _filter_ignore(paths, ignore_pattern) return bucket_name, prefix, paths - - -class S3Model: - """ - A class representing a S3 model mirrored into a temporary directory. - - Attributes: - s3: S3 client. - dir: The temporary created directory. - - Methods: - pull_files(): Pull model from S3 to the temporary directory. - """ - - def __init__(self) -> None: - self.s3 = boto3.client('s3') - for sig in (signal.SIGINT, signal.SIGTERM): - existing_handler = signal.getsignal(sig) - signal.signal(sig, self._close_by_signal(existing_handler)) - - self.dir = tempfile.mkdtemp() - - def __del__(self): - self._close() - - def _close(self) -> None: - if os.path.exists(self.dir): - shutil.rmtree(self.dir) - - def _close_by_signal(self, existing_handler=None): - - def new_handler(signum, frame): - self._close() - if existing_handler: - existing_handler(signum, frame) - - return new_handler - - def pull_files(self, - s3_model_path: str = "", - allow_pattern: Optional[list[str]] = None, - ignore_pattern: Optional[list[str]] = None) -> None: - """ - Pull files from S3 storage into the temporary directory. - - Args: - s3_model_path: The S3 path of the model. - allow_pattern: A list of patterns of which files to pull. - ignore_pattern: A list of patterns of which files not to pull. 
- - """ - if not s3_model_path.endswith("/"): - s3_model_path = s3_model_path + "/" - - bucket_name, base_dir, files = list_files(self.s3, s3_model_path, - allow_pattern, - ignore_pattern) - if len(files) == 0: - return - - for file in files: - destination_file = os.path.join( - self.dir, - file.removeprefix(base_dir).lstrip("/")) - local_dir = Path(destination_file).parent - os.makedirs(local_dir, exist_ok=True) - self.s3.download_file(bucket_name, file, destination_file) From e40827280b225bf0a0797cc9842fc3cdfea8ebdf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 10 Sep 2025 05:40:45 +0100 Subject: [PATCH 102/584] [Docs] Enable relative links in examples to function when rendered in the docs (#24041) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_examples.py | 43 +++++++++++++++++++++----- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index ac2101daac2ef..0cbaebb598a34 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -114,6 +114,33 @@ class Example: return match.group('title') return fix_case(self.path.stem.replace("_", " ").title()) + def fix_relative_links(self, content: str) -> str: + """ + Fix relative links in markdown content by converting them to gh-file + format. + + Args: + content (str): The markdown content to process + + Returns: + str: Content with relative links converted to gh-file format + """ + # Regex to match markdown links [text](relative_path) + # This matches links that don't start with http, https, ftp, or # + link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)' + + def replace_link(match): + link_text = match.group(1) + relative_path = match.group(2) + + # Make relative to repo root + gh_file = (self.main_file.parent / relative_path).resolve() + gh_file = gh_file.relative_to(ROOT_DIR) + + return f'[{link_text}](gh-file:{gh_file})' + + return re.sub(link_pattern, replace_link, content) + def generate(self) -> str: content = f"# {self.title}\n\n" content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" @@ -121,14 +148,16 @@ class Example: # Use long code fence to avoid issues with # included files containing code fences too code_fence = "``````" - # Skip the title from md snippets as it's been included above - start_line = 2 + if self.is_code: - content += f"{code_fence}{self.main_file.suffix[1:]}\n" - start_line = 1 - content += f'--8<-- "{self.main_file}:{start_line}"\n' - if self.is_code: - content += f"{code_fence}\n" + content += (f"{code_fence}{self.main_file.suffix[1:]}\n" + f'--8<-- "{self.main_file}"\n' + f"{code_fence}\n") + else: + with open(self.main_file) as f: + # Skip the title from md snippets as it's been included above + main_content = f.readlines()[1:] + content += self.fix_relative_links("".join(main_content)) content += "\n" if not self.other_files: From 91130ae376ef083a3eedc39cb9b427d085bd2f34 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Tue, 9 Sep 2025 23:24:20 -0700 Subject: [PATCH 103/584] [docs] promo pytorch conf and ray summit (#24562) Signed-off-by: simon-mo <simon.mo@hey.com> --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 4e03df758c261..b4a3583c214cf 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,9 @@ Easy, fast, and cheap LLM serving for everyone | <a 
href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | </p> +--- +Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year! + --- *Latest News* 🔥 From feaf202e93a18228cd34ae2e6698aece83a15e87 Mon Sep 17 00:00:00 2001 From: Remy <comsky03@gmail.com> Date: Wed, 10 Sep 2025 15:24:42 +0900 Subject: [PATCH 104/584] [Bugfix] Guard `_may_reorder_batch` for encoder-only models on CPU (#24319) (#24348) Signed-off-by: Remy <eunhwan.shin@dtonic.io> Co-authored-by: Li, Jiang <jiang1.li@intel.com> --- .../models/language/pooling/test_embedding.py | 10 ++++++++-- vllm/config/__init__.py | 3 ++- vllm/v1/worker/cpu_model_runner.py | 20 +++++++++++++++---- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 0733ac85c11fc..41574b844a668 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -27,11 +27,17 @@ from ...utils import check_embeddings_close pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]), # [Encoder-only] - pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]), + pytest.param( + "BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), # [Cross-Encoder] - pytest.param("sentence-transformers/stsb-roberta-base-v2"), + pytest.param( + "sentence-transformers/stsb-roberta-base-v2", + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), ], ) def test_models( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index a859f44eb40d3..7422527a6854b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3665,7 +3665,8 @@ class VllmConfig: # logger should only print warning message for hybrid models. As we # can't know whether the model is hybrid or not now, so we don't log # warning message here and will log it later. - if not (current_platform.is_cuda() or current_platform.is_rocm()): + if not (current_platform.is_cuda() or current_platform.is_rocm() + or current_platform.is_cpu()): # Hybrid KV cache manager is not supported on non-GPU platforms. self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_transfer_config is not None: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index feb49978d7518..d5ec19b86b061 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner): raise ValueError("Multiple KVCacheGroups is not" "currently supported with CPU model runner.") - assert type(self.attn_groups[0] - [0].metadata_builder) is TorchSDPAMetadataBuilderV1 + # Guard against encoder-only / pooling models where `attn_groups` + # may be empty or lack the expected metadata_builder. 
+ # Without this check, accessing `attn_groups[0][0]` would trigger + # an AssertionError on CPU backend. + if not hasattr(self, "attn_groups") or not self.attn_groups: + return + if not self.attn_groups[0]: + return - self.attn_groups[0][0].metadata_builder.reorder_batch( - self.input_batch, scheduler_output) + mb = getattr(self.attn_groups[0][0], "metadata_builder", None) + if not isinstance(mb, TorchSDPAMetadataBuilderV1): + # Encoder-only / rerank models do not benefit from reordering, + # so we safely skip here. + return + + # Safe path for decoder/attention-heavy models + mb.reorder_batch(self.input_batch, scheduler_output) def _postprocess_tensors(self) -> None: # Note: replace device tensors with cpu tensors From 77f62613f9eb0963dffca1f58d22f718505e80c7 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Wed, 10 Sep 2025 01:44:47 -0700 Subject: [PATCH 105/584] Consolidate rendering parameters into RenderConfig dataclass (#24543) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- tests/entrypoints/test_renderer.py | 70 +++++++----- .../openai/serving_classification.py | 10 +- vllm/entrypoints/openai/serving_completion.py | 25 +++-- vllm/entrypoints/openai/serving_embedding.py | 24 ++-- vllm/entrypoints/openai/serving_engine.py | 16 ++- vllm/entrypoints/openai/serving_pooling.py | 13 ++- .../openai/serving_tokenization.py | 11 +- vllm/entrypoints/renderer.py | 106 +++++++++--------- 8 files changed, 167 insertions(+), 108 deletions(-) diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py index d3caee4b8cec9..1f55b1fba613b 100644 --- a/tests/entrypoints/test_renderer.py +++ b/tests/entrypoints/test_renderer.py @@ -10,7 +10,7 @@ import pybase64 import pytest import torch -from vllm.entrypoints.renderer import CompletionRenderer +from vllm.entrypoints.renderer import CompletionRenderer, RenderConfig from vllm.inputs.data import is_embeds_prompt @@ -56,8 +56,8 @@ class TestRenderPrompt: @pytest.mark.asyncio async def test_token_input(self, renderer): tokens = [101, 7592, 2088] - results = await renderer.render_prompt(prompt_or_prompts=tokens, - max_length=100) + results = await renderer.render_prompt( + prompt_or_prompts=tokens, config=RenderConfig(max_length=100)) assert len(results) == 1 assert results[0]["prompt_token_ids"] == tokens @@ -65,8 +65,8 @@ class TestRenderPrompt: @pytest.mark.asyncio async def test_token_list_input(self, renderer): token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]] - results = await renderer.render_prompt(prompt_or_prompts=token_lists, - max_length=100) + results = await renderer.render_prompt( + prompt_or_prompts=token_lists, config=RenderConfig(max_length=100)) assert len(results) == 3 assert results[0]["prompt_token_ids"] == [101, 7592, 2088] @@ -80,8 +80,9 @@ class TestRenderPrompt: renderer.async_tokenizer_pool[ renderer.tokenizer] = mock_async_tokenizer - results = await renderer.render_prompt(prompt_or_prompts="Hello world", - max_length=100) + results = await renderer.render_prompt( + prompt_or_prompts="Hello world", + config=RenderConfig(max_length=100)) assert len(results) == 1 assert results[0]["prompt_token_ids"] == [101, 7592, 2088] @@ -96,7 +97,8 @@ class TestRenderPrompt: text_list_input = ["Hello world", "How are you?", "Good morning"] results = await renderer.render_prompt( - prompt_or_prompts=text_list_input, max_length=100) + prompt_or_prompts=text_list_input, + config=RenderConfig(max_length=100)) assert len(results) == 3 for result in results: @@ -110,8 +112,9 @@ 
class TestRenderPrompt: renderer.async_tokenizer_pool[ renderer.tokenizer] = mock_async_tokenizer - results = await renderer.render_prompt(prompt_or_prompts="Hello world", - max_length=100) + results = await renderer.render_prompt( + prompt_or_prompts="Hello world", + config=RenderConfig(max_length=100)) assert len(results) == 1 call_args = mock_async_tokenizer.call_args @@ -126,8 +129,9 @@ class TestRenderPrompt: renderer.tokenizer] = mock_async_tokenizer results = await renderer.render_prompt(prompt_or_prompts="Hello world", - max_length=100, - truncate_prompt_tokens=50) + config=RenderConfig( + max_length=100, + truncate_prompt_tokens=50)) assert len(results) == 1 call_args = mock_async_tokenizer.call_args @@ -143,8 +147,9 @@ class TestRenderPrompt: renderer.tokenizer] = mock_async_tokenizer results = await renderer.render_prompt(prompt_or_prompts="Hello world", - max_length=200, - truncate_prompt_tokens=-1) + config=RenderConfig( + max_length=200, + truncate_prompt_tokens=-1)) assert len(results) == 1 call_args = mock_async_tokenizer.call_args @@ -157,8 +162,9 @@ class TestRenderPrompt: long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] # 10 tokens results = await renderer.render_prompt(prompt_or_prompts=long_tokens, - max_length=100, - truncate_prompt_tokens=5) + config=RenderConfig( + max_length=100, + truncate_prompt_tokens=5)) assert len(results) == 1 # Should keep the last 5 tokens: [105, 106, 107, 108, 109] @@ -170,7 +176,7 @@ class TestRenderPrompt: with pytest.raises(ValueError, match="maximum context length"): await renderer.render_prompt(prompt_or_prompts=long_tokens, - max_length=100) + config=RenderConfig(max_length=100)) @pytest.mark.asyncio async def test_no_tokenizer_for_text(self, mock_model_config): @@ -181,7 +187,8 @@ class TestRenderPrompt: with pytest.raises(ValueError, match="No tokenizer available"): await renderer_no_tokenizer.render_prompt( - prompt_or_prompts="Hello world", max_length=100) + prompt_or_prompts="Hello world", + config=RenderConfig(max_length=100)) @pytest.mark.asyncio async def test_token_input_with_needs_detokenization( @@ -196,7 +203,7 @@ class TestRenderPrompt: tokens = [1, 2, 3, 4] results = await renderer.render_prompt( prompt_or_prompts=tokens, - needs_detokenization=True, + config=RenderConfig(needs_detokenization=True), ) assert len(results) == 1 @@ -221,7 +228,9 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(test_tensor) results = await renderer.render_prompt_and_embeds( - prompt_embeds=embed_bytes, cache_salt="test_salt") + prompt_embeds=embed_bytes, + config=RenderConfig(cache_salt="test_salt"), + ) assert len(results) == 1 assert is_embeds_prompt(results[0]) @@ -240,7 +249,9 @@ class TestRenderEmbedPrompt: ] results = await renderer.render_prompt_and_embeds( - prompt_embeds=embed_bytes_list) + prompt_embeds=embed_bytes_list, + config=RenderConfig(), + ) assert len(results) == 2 for i, result in enumerate(results): @@ -254,7 +265,9 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(test_tensor) results = await renderer.render_prompt_and_embeds( - prompt_embeds=embed_bytes, truncate_prompt_tokens=10) + prompt_embeds=embed_bytes, + config=RenderConfig(truncate_prompt_tokens=10), + ) assert len(results) == 1 # Should keep last 10 tokens @@ -271,7 +284,9 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(test_tensor) results = await renderer.render_prompt_and_embeds( - prompt_embeds=embed_bytes) + prompt_embeds=embed_bytes, + config=RenderConfig(), 
+ ) assert len(results) == 1 assert results[0]["prompt_embeds"].dtype == dtype @@ -283,7 +298,9 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(test_tensor) results = await renderer.render_prompt_and_embeds( - prompt_embeds=embed_bytes) + prompt_embeds=embed_bytes, + config=RenderConfig(), + ) assert len(results) == 1 # Should be squeezed to 2D @@ -303,7 +320,10 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(test_tensor) results = await renderer.render_prompt_and_embeds( - prompt_or_prompts="Hello world", prompt_embeds=embed_bytes) + prompt_or_prompts="Hello world", + prompt_embeds=embed_bytes, + config=RenderConfig(), + ) assert len(results) == 2 # First should be embed prompt diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 98b7a206fa0cb..ef730442691cf 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -20,6 +20,7 @@ from vllm.entrypoints.openai.serving_engine import (ClassificationServeContext, OpenAIServing, ServeContext) from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig from vllm.logger import init_logger from vllm.outputs import ClassificationOutput, PoolingRequestOutput from vllm.pooling_params import PoolingParams @@ -57,8 +58,7 @@ class ClassificationMixin(OpenAIServing): renderer = self._get_renderer(ctx.tokenizer) ctx.engine_prompts = await renderer.render_prompt( prompt_or_prompts=ctx.request.input, - max_length=self.max_model_len, - truncate_prompt_tokens=ctx.request.truncate_prompt_tokens) + config=self._build_render_config(ctx.request)) return None @@ -114,6 +114,12 @@ class ClassificationMixin(OpenAIServing): usage=usage, ) + def _build_render_config(self, + request: ClassificationRequest) -> RenderConfig: + return RenderConfig( + max_length=self.max_model_len, + truncate_prompt_tokens=request.truncate_prompt_tokens) + class ServingClassification(ClassificationMixin): request_id_prefix = "classify" diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index e3e945e22cd4e..10b1c54fe6a98 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -30,6 +30,7 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing, clamp_prompt_logprobs) # yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import get_max_tokens from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt, is_tokens_prompt) @@ -129,18 +130,11 @@ class OpenAIServingCompletion(OpenAIServing): tokenizer = await self.engine_client.get_tokenizer(lora_request ) renderer = self._get_renderer(tokenizer) - max_input_tokens_len = self.max_model_len - (request.max_tokens - or 0) engine_prompts = await renderer.render_prompt_and_embeds( prompt_or_prompts=request.prompt, prompt_embeds=request.prompt_embeds, - max_length=max_input_tokens_len, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - cache_salt=request.cache_salt, - needs_detokenization=bool(request.echo - and not request.return_token_ids), + config=self._build_render_config(request), ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") @@ -677,3 +671,18 @@ class 
OpenAIServingCompletion(OpenAIServing): tokens=out_tokens, top_logprobs=out_top_logprobs, ) + + def _build_render_config( + self, + request: CompletionRequest, + max_input_length: Optional[int] = None, + ) -> RenderConfig: + max_input_tokens_len = self.max_model_len - (request.max_tokens or 0) + return RenderConfig( + max_length=max_input_tokens_len, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + cache_salt=request.cache_salt, + needs_detokenization=bool(request.echo + and not request.return_token_ids), + ) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 8acd4a1e01fb1..476c21ad6a302 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -28,6 +28,7 @@ from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, TextTokensPrompt) # yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, @@ -97,23 +98,28 @@ class EmbeddingMixin(OpenAIServing): add_special_tokens=ctx.request.add_special_tokens, ) else: - # Set max_length based on chunked processing capability - if self._should_use_chunked_processing(ctx.request): - max_length = None - else: - max_length = self.max_embed_len or self.max_model_len - ctx.engine_prompts = await renderer.render_prompt( prompt_or_prompts=ctx.request.input, - max_length=max_length, - truncate_prompt_tokens=ctx.request.truncate_prompt_tokens, - add_special_tokens=ctx.request.add_special_tokens, + config=self._build_render_config(ctx.request), ) return None except (ValueError, TypeError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + def _build_render_config( + self, request: EmbeddingCompletionRequest) -> RenderConfig: + # Set max_length based on chunked processing capability + if self._should_use_chunked_processing(request): + max_length = None + else: + max_length = self.max_embed_len or self.max_model_len + + return RenderConfig( + max_length=max_length, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens) + @override def _build_response( self, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 23d9b706ee628..708b1a5facf49 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -58,7 +58,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, TranslationRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser -from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer +from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer, + RenderConfig) # yapf: enable from vllm.inputs.data import PromptType from vllm.inputs.data import TokensPrompt as EngineTokensPrompt @@ -248,6 +249,19 @@ class OpenAIServing: tokenizer=tokenizer, async_tokenizer_pool=self._async_tokenizer_pool) + def _build_render_config( + self, + request: Any, + ) -> RenderConfig: + """ + Build and return a `RenderConfig` for an endpoint. 
+ + Used by the renderer to control how prompts are prepared + (e.g., tokenization and length handling). Endpoints should + implement this with logic appropriate to their request type. + """ + raise NotImplementedError + def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: """ Return (and cache) an `AsyncMicrobatchTokenizer` bound to the diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index c08c0743ffca6..d70e1a808aba6 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -28,6 +28,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput @@ -149,10 +150,7 @@ class OpenAIServingPooling(OpenAIServing): elif isinstance(request, PoolingCompletionRequest): engine_prompts = await renderer.render_prompt( prompt_or_prompts=request.input, - max_length=self.max_model_len, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - cache_salt=getattr(request, 'cache_salt', None), + config=self._build_render_config(request), ) else: raise ValueError( @@ -270,3 +268,10 @@ class OpenAIServingPooling(OpenAIServing): data=items, usage=usage, ) + + def _build_render_config( + self, request: PoolingCompletionRequest) -> RenderConfig: + return RenderConfig( + max_length=self.max_model_len, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 70cb6c21b2213..1efd9678571c4 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -22,6 +22,7 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -72,7 +73,7 @@ class OpenAIServingTokenization(OpenAIServing): [tool.model_dump() for tool in request.tools]) ( _, - request_prompts, + _, engine_prompts, ) = await self._preprocess_chat( request, @@ -90,15 +91,14 @@ class OpenAIServingTokenization(OpenAIServing): else: engine_prompts = await renderer.render_prompt( prompt_or_prompts=request.prompt, - add_special_tokens=request.add_special_tokens, - cache_salt=getattr(request, 'cache_salt', None), + config=self._build_render_config(request), ) except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") input_ids: list[int] = [] - for i, engine_prompt in enumerate(engine_prompts): + for engine_prompt in engine_prompts: self._log_inputs(request_id, engine_prompt, params=None, @@ -157,6 +157,9 @@ class OpenAIServingTokenization(OpenAIServing): return self.create_error_response( f"Failed to get tokenizer info: {str(e)}") + def _build_render_config(self, request: TokenizeRequest) -> RenderConfig: + return 
RenderConfig(add_special_tokens=request.add_special_tokens) + @dataclass class TokenizerInfo: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 441b89ec4803e..68d33ada40975 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -4,6 +4,7 @@ import asyncio import io from abc import ABC, abstractmethod +from dataclasses import dataclass from typing import Annotated, Optional, Union import pybase64 @@ -18,6 +19,29 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import AsyncMicrobatchTokenizer +@dataclass(frozen=True) +class RenderConfig: + """Configuration to control how prompts are prepared.""" + + max_length: Optional[int] = None + """Maximum allowable total input token length. If provided, + token inputs longer than this raise ``ValueError``.""" + + truncate_prompt_tokens: Optional[int] = None + """Number of tokens to keep. ``None`` means no truncation. + ``0`` yields an empty list (and skips embeds). + ``-1`` maps to ``model_config.max_model_len``.""" + + add_special_tokens: Optional[bool] = True + """Whether to add model-specific special tokens during tokenization.""" + + cache_salt: Optional[str] = None + """String to disambiguate prefix cache entries.""" + + needs_detokenization: Optional[bool] = False + """If True, detokenize IDs back to text for inclusion in outputs.""" + + class BaseRenderer(ABC): """ Base class for unified input processing and rendering. @@ -48,12 +72,9 @@ class BaseRenderer(ABC): @abstractmethod async def render_prompt( self, + *, prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]], - max_length: Optional[int] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, - add_special_tokens: Optional[bool] = True, - cache_salt: Optional[str] = None, - needs_detokenization: Optional[bool] = False, + config: "RenderConfig", ) -> list[EngineTokensPrompt]: """ Convert text or token inputs into engine-ready TokensPrompt objects. @@ -68,16 +89,8 @@ class BaseRenderer(ABC): - ``list[str]``: Batch of text prompts. - ``list[int]``: Single pre-tokenized sequence. - ``list[list[int]]``: Batch of pre-tokenized sequences. - max_length: Maximum allowable total input token length. If provided, - token inputs longer than this raise ``ValueError``. - truncate_prompt_tokens: Number of tokens to keep. ``None`` means no - truncation. ``0`` yields an empty list (and skips embeds). - ``-1`` maps to ``model_config.max_model_len``. - add_special_tokens: Whether to add model-specific special tokens - during text tokenization. - cache_salt: Optional string to disambiguate prefix cache entries. - needs_detokenization: If True and ``prompt_or_prompts`` is token - input, detokenize IDs back to text for inclusion in outputs. + config: Render configuration controlling how prompts are prepared + (e.g., tokenization and length handling). Returns: list[EngineTokensPrompt]: Engine-ready token prompts. 
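To make the new call shape concrete, here is a minimal usage sketch; the parameter values are illustrative, and `renderer` is assumed to be a `CompletionRenderer` constructed as in the tests above:

```python
from vllm.entrypoints.renderer import RenderConfig


async def render_example(renderer):
    # A single frozen config now carries what used to be five separate
    # keyword arguments threaded through every render call.
    config = RenderConfig(
        max_length=4096,             # reject token inputs longer than this
        truncate_prompt_tokens=512,  # keep only the last 512 tokens
        add_special_tokens=True,
        cache_salt="tenant-a",       # disambiguates prefix-cache entries
    )
    prompts = await renderer.render_prompt(
        prompt_or_prompts="Hello world",
        config=config,
    )
    return prompts[0]["prompt_token_ids"]
```

Because the dataclass is frozen, a config can be built once per request and shared safely across the tokenize/detokenize tasks spawned inside the renderer.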
@@ -90,18 +103,15 @@ class BaseRenderer(ABC): @abstractmethod async def render_prompt_and_embeds( self, + *, prompt_or_prompts: Optional[Union[str, list[str], list[int], list[list[int]]]] = None, prompt_embeds: Optional[Union[bytes, list[bytes]]] = None, - max_length: Optional[int] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, - add_special_tokens: Optional[bool] = True, - cache_salt: Optional[str] = None, - needs_detokenization: Optional[bool] = False, + config: "RenderConfig", ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: """ Convert text/token and/or base64-encoded embeddings inputs into - engine-ready prompt objects. + engine-ready prompt objects using a unified RenderConfig. At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be provided and non-empty. If both are omitted or empty (e.g., empty @@ -111,15 +121,8 @@ class BaseRenderer(ABC): prompt_or_prompts: Text or token inputs to include. prompt_embeds: Base64-encoded bytes (or list thereof) containing a torch-saved tensor to be used as prompt embeddings. - max_length: Maximum allowable total input token length. If provided, - inputs longer than this raise ``ValueError``. - truncate_prompt_tokens: Number of tokens/rows to keep from the end - of the sequence. ``-1`` maps to ``model_config.max_model_len``. - add_special_tokens: Whether to add model-specific special tokens - during text tokenization. - cache_salt: Optional string to disambiguate prefix cache entries. - needs_detokenization: If True and ``prompt_or_prompts`` is token - input, detokenize IDs back to text for inclusion in outputs. + config: Render configuration controlling how prompts are prepared + (e.g., tokenization and length handling). Returns: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: @@ -184,12 +187,9 @@ class CompletionRenderer(BaseRenderer): async def render_prompt( self, + *, prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]], - max_length: Optional[int] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, - add_special_tokens: Optional[bool] = True, - cache_salt: Optional[str] = None, - needs_detokenization: Optional[bool] = False, + config: "RenderConfig", ) -> list[EngineTokensPrompt]: """Implementation of prompt rendering for completion-style requests. @@ -197,7 +197,7 @@ class CompletionRenderer(BaseRenderer): for detailed parameter documentation. """ truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens( - truncate_prompt_tokens, max_length) + config.truncate_prompt_tokens, config.max_length) if truncate_prompt_tokens == 0: return [] @@ -211,16 +211,19 @@ class CompletionRenderer(BaseRenderer): detokenize_task = asyncio.create_task( # Note: detokenization is needed when echo is enabled, # where the input token IDs are decoded back to text. 
- self._maybe_detokenize(prompt_input["content"], max_length, - truncate_prompt_tokens, cache_salt, - needs_detokenization)) + self._maybe_detokenize(prompt_input["content"], + config.max_length, + truncate_prompt_tokens, + config.cache_salt, + config.needs_detokenization)) tasks.append(detokenize_task) else: # Text input tokenize_task = asyncio.create_task( - self._tokenize(prompt_input["content"], max_length, - truncate_prompt_tokens, add_special_tokens, - cache_salt)) + self._tokenize(prompt_input["content"], config.max_length, + truncate_prompt_tokens, + config.add_special_tokens, + config.cache_salt)) tasks.append(tokenize_task) # Wait for all text tokenization to finish @@ -232,21 +235,18 @@ class CompletionRenderer(BaseRenderer): async def render_prompt_and_embeds( self, + *, prompt_or_prompts: Optional[Union[str, list[str], list[int], list[list[int]]]] = None, prompt_embeds: Optional[Union[bytes, list[bytes]]] = None, - max_length: Optional[int] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, - add_special_tokens: Optional[bool] = True, - cache_salt: Optional[str] = None, - needs_detokenization: Optional[bool] = False, + config: "RenderConfig", ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]: """ Render text/token prompts and/or precomputed embedding prompts. At least one of `prompt_or_prompts` or `prompt_embeds` must be provided. """ truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens( - truncate_prompt_tokens, max_length) + config.truncate_prompt_tokens, config.max_length) if truncate_prompt_tokens == 0: return [] @@ -255,17 +255,13 @@ class CompletionRenderer(BaseRenderer): if prompt_embeds is not None: rendered.extend( self.load_prompt_embeds(prompt_embeds, truncate_prompt_tokens, - cache_salt)) + config.cache_salt)) if prompt_or_prompts is None or prompt_or_prompts == "": return rendered token_prompts = await self.render_prompt( prompt_or_prompts=prompt_or_prompts, - max_length=max_length, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=add_special_tokens, - cache_salt=cache_salt, - needs_detokenization=needs_detokenization, + config=config, ) rendered.extend(token_prompts) @@ -394,4 +390,4 @@ class CompletionRenderer(BaseRenderer): tokens_prompt["cache_salt"] = cache_salt if prompt is not None: tokens_prompt["prompt"] = prompt - return tokens_prompt \ No newline at end of file + return tokens_prompt From 267c80d31f6b77092a5d5903da64556ac15c4d4d Mon Sep 17 00:00:00 2001 From: li-jinpeng <71747297+li-jinpeng@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:45:44 +0800 Subject: [PATCH 106/584] [Model] Limit CPU threads for image transformations in InternVL to reduce cpu contention. 
(#24519) Signed-off-by: li-jinpeng <3332126450@qq.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/internvl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b09ed7bbe72a3..9565628b198e2 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -7,6 +7,7 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +import os from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence from typing import Annotated, Any, Literal, Optional, TypeVar, Union @@ -37,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import set_default_torch_num_threads from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -115,13 +117,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - return T.Compose([ + transform = T.Compose([ T.Lambda(lambda img: convert_image_mode(img, 'RGB')), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD) ]) + # Image transformation operations (which include tensor computations + # on the CPU) can occupy a substantial number of CPU cores, introducing + # overhead due to CPU contention. This issue becomes particularly + # noticeable when deploying multiple vLLM instances on a single machine. + # Therefore, it is necessary to limit the number of threads allocated to + # image transformation tasks. 
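+    # For example, with OMP_NUM_THREADS=4 the tensor ops inside each
+    # transform call run on at most four threads, overriding torch's
+    # global thread setting for the duration of the call; the fallback
+    # here is a single thread.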
+ num_threads = int(os.environ.get("OMP_NUM_THREADS", "1")) + + def apply(img): + with set_default_torch_num_threads(num_threads): + return transform(img) + + return apply # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B From 0ae43dbf8cb28a299ae724fc742b0c5bcddea868 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Date: Wed, 10 Sep 2025 05:19:26 -0400 Subject: [PATCH 107/584] [Attention] add DCP support for FLASH_ATTN_MLA backend (#24453) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> --- .../v1/attention/backends/mla/flashattn_mla.py | 18 ++++++++++++++++-- vllm/v1/worker/gpu_model_runner.py | 3 +++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 12f206637d7c6..472095e13615b 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -11,6 +11,7 @@ from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, from vllm.attention.utils.fa_utils import (flash_attn_supports_mla, get_flash_attn_version) from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonDecodeMetadata, @@ -98,6 +99,11 @@ class FlashAttnMLAMetadataBuilder( # pre-allocated during capture. self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + # TODO(lucas): Until we add support for the DCP custom masking we need + # to restrict decodes to q_len == 1 when DCP is enabled. + self.__class__.reorder_batch_threshold = 1 \ + if get_dcp_group().world_size > 1 else self.reorder_batch_threshold + def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): if self.fa_aot_schedule: @@ -172,6 +178,7 @@ class FlashAttnMLAMetadataBuilder( class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): + can_return_lse_for_decode: bool = True def __init__( self, @@ -239,7 +246,7 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): # to prevent invalid grid configuration during graph capture. 
max_seqlen_q = max(attn_metadata.decode.max_query_len, 1)
 
-        o = flash_attn_varlen_func(
+        attn_out = flash_attn_varlen_func(
             q=q_pe,
             k=k_pe_cache.unsqueeze(-2),  # Add head dim of 1
             v=kv_c_cache.unsqueeze(-2),  # Add head dim of 1
@@ -251,9 +258,16 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
             block_table=attn_metadata.decode.block_table,
             softmax_scale=self.scale,
             causal=True,
+            return_softmax_lse=self.need_to_return_lse_for_decode,
             fa_version=3,  # only version 3 is supported
             scheduler_metadata=attn_metadata.decode.scheduler_metadata,
             num_splits=attn_metadata.decode.max_num_splits,
         )
 
-        return self._v_up_proj(o)
+        if self.need_to_return_lse_for_decode:
+            o, lse = attn_out
+            # FA returns LSE in shape [ H, B ] but DCP wants [ B, H ]
+            return o, lse.transpose(0, 1)  # [ H, B ] -> [ B, H ]
+        else:
+            o = attn_out
+
+        return o, None
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 897c3a6213207..944793cad94f4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -440,6 +440,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             return
 
         if self.reorder_batch_threshold is not None:
+            # NOTE(lucas): currently no backend supports the custom masking
+            # required for DCP with q_len > 1, so we assert here. Remove this
+            # assert once custom mask support is added to FA3.
             if self.dcp_world_size > 1:
                 assert self.reorder_batch_threshold == 1, \
                     "DCP not support reorder_batch_threshold > 1 now."

From 7c195d43da241d1ae07e73062c6fe593be3e4aac Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Wed, 10 Sep 2025 21:08:03 +0800
Subject: [PATCH 108/584] [ROCm][Bugfix] Fix Aiter RMSNorm (#23412)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .../model_executor/test_enabled_custom_ops.py | 37 ++++----
 vllm/model_executor/layers/layernorm.py       | 89 +++++++++++++++----
 vllm/platforms/rocm.py                        | 18 +++-
 3 files changed, 108 insertions(+), 36 deletions(-)

diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 140f00294765d..86139d598582d 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -13,13 +13,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
                                                             vllm_topk_softmax)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled)
-from vllm.model_executor.layers.layernorm import (
-    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
-    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
+from vllm.model_executor.layers.layernorm import (RMSNorm,
+                                                  dispatch_rocm_rmsnorm_func,
+                                                  fused_add_rms_norm, rms_norm)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 
+RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+
 
 # Registered subclass for test
 @CustomOp.register("relu3")
@@ -149,24 +151,27 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
 
 
 @pytest.mark.parametrize("add_residual", [True, False])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
 @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
 @pytest.mark.skipif(not current_platform.is_rocm(),
                     reason="AITER is a feature exclusive for ROCm")
-def 
test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, - use_rocm_aiter_norm: str, monkeypatch): +def test_rms_norm_dispatch(add_residual: bool, dtype: torch.dtype, + use_rocm_aiter: str, use_rocm_aiter_norm: str, + monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm) - rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) + rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype) - if not add_residual: - if current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_norm): - assert rms_norm_func == rocm_aiter_rms_norm - else: - assert rms_norm_func == rms_norm - elif current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_norm): - assert rms_norm_func == rocm_aiter_fused_add_rms_norm - else: + should_use_rocm_aiter = current_platform.is_rocm() and int(use_rocm_aiter) \ + and int(use_rocm_aiter_norm) and dtype in RMS_NORM_SUPPORTED_DTYPES + + if add_residual and should_use_rocm_aiter: + assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add + elif should_use_rocm_aiter: + assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm + elif add_residual: assert rms_norm_func == fused_add_rms_norm + else: + assert rms_norm_func == rms_norm diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index a5fc1db2dc10f..0488eab1e03fb 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,11 +9,11 @@ import torch.nn as nn import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op def is_rocm_aiter_rmsnorm_enabled() -> bool: - return current_platform.is_rocm() \ - and envs.VLLM_ROCM_USE_AITER_RMSNORM \ + return envs.VLLM_ROCM_USE_AITER_RMSNORM \ and envs.VLLM_ROCM_USE_AITER @@ -43,8 +43,8 @@ def fused_add_rms_norm( return x, residual -def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor, - variance_epsilon: float) -> torch.Tensor: +def rocm_aiter_rms_norm_impl(x: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: import aiter as rocm_aiter if x.dim() > 2: x_original_shape = x.shape @@ -55,7 +55,7 @@ def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor, return rocm_aiter.rms_norm(x, weight, variance_epsilon) -def rocm_aiter_fused_add_rms_norm( +def rocm_aiter_rmsnorm2d_fwd_with_add_impl( x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: @@ -74,14 +74,48 @@ def rocm_aiter_fused_add_rms_norm( return output, residual_out -def dispatch_cuda_rmsnorm_func(add_residual: bool): - if add_residual: - if is_rocm_aiter_rmsnorm_enabled(): - return rocm_aiter_fused_add_rms_norm - return fused_add_rms_norm +def rocm_aiter_rms_norm_fake(x: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + return torch.empty_like(x) - if is_rocm_aiter_rmsnorm_enabled(): - return rocm_aiter_rms_norm + +def rocm_aiter_rmsnorm2d_fwd_with_add_fake( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty_like(x), torch.empty_like(residual) + + +if current_platform.is_rocm(): + direct_register_custom_op( + op_name="rocm_aiter_rms_norm", + op_func=rocm_aiter_rms_norm_impl, + mutates_args=[], + fake_impl=rocm_aiter_rms_norm_fake, + 
dispatch_key=current_platform.dispatch_key,
+    )
+
+    direct_register_custom_op(
+        op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
+        op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+
+
+def dispatch_rocm_rmsnorm_func(with_fused_add: bool, dtype: torch.dtype):
+    use_aiter = is_rocm_aiter_rmsnorm_enabled() and dtype in [
+        torch.float16, torch.bfloat16
+    ]
+
+    if use_aiter and with_fused_add:
+        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+    if use_aiter:
+        return torch.ops.vllm.rocm_aiter_rms_norm
+
+    # fall back to CUDA implementation
+    if with_fused_add:
+        return fused_add_rms_norm
     return rms_norm
@@ -114,6 +148,13 @@ class RMSNorm(CustomOp):
         self.weight = torch.ones(hidden_size)
         if self.has_weight:
             self.weight = nn.Parameter(self.weight)
+        weight_dtype = self.weight.data.dtype
+
+        if current_platform.is_rocm():
+            self.rocm_norm_func = dispatch_rocm_rmsnorm_func(
+                with_fused_add=False, dtype=weight_dtype)
+            self.rocm_norm_func_with_add = dispatch_rocm_rmsnorm_func(
+                with_fused_add=True, dtype=weight_dtype)

     def forward_native(
         self,
@@ -162,13 +203,27 @@ class RMSNorm(CustomOp):
             return self.forward_native(x, residual)

         add_residual = residual is not None
-        norm_func = dispatch_cuda_rmsnorm_func(add_residual)
-
         if add_residual:
-            return norm_func(x, residual, self.weight.data,
-                             self.variance_epsilon)
+            return fused_add_rms_norm(x, residual, self.weight.data,
+                                      self.variance_epsilon)
         else:
-            return norm_func(x, self.weight.data, self.variance_epsilon)
+            return rms_norm(x, self.weight.data, self.variance_epsilon)
+
+    def forward_hip(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        if self.variance_size_override is not None:
+            return self.forward_native(x, residual)
+
+        add_residual = residual is not None
+        if add_residual:
+            return self.rocm_norm_func_with_add(x, residual, self.weight.data,
+                                                self.variance_epsilon)
+        else:
+            return self.rocm_norm_func(x, self.weight.data,
+                                       self.variance_epsilon)

     def forward_xpu(
         self,
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index c6d14aa87c7f2..cf7b87cf030a9 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -322,23 +322,35 @@ class RocmPlatform(Platform):

     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        from vllm.config.compilation import CUDAGraphMode
+
         cache_config = vllm_config.cache_config
+        compilation_config = vllm_config.compilation_config
+        parallel_config = vllm_config.parallel_config
+        is_eager_execution = compilation_config == CUDAGraphMode.NONE
+
+        use_v1 = envs.VLLM_USE_V1
+        use_aiter_rms_norm = envs.VLLM_ROCM_USE_AITER and \
+            envs.VLLM_ROCM_USE_AITER_RMSNORM
+
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
-        parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             if vllm_config.speculative_config:
-                if not envs.VLLM_USE_V1:
+                if not use_v1:
                     raise NotImplementedError(
                         "Speculative decoding is not supported on vLLM V0.")
                 parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
             else:
-                if envs.VLLM_USE_V1:
+                if use_v1:
                     parallel_config.worker_cls = \
                         "vllm.v1.worker.gpu_worker.Worker"
                 else:
                     parallel_config.worker_cls = "vllm.worker.worker.Worker"

+        # Aiter rms norm performs best when CUDA Graph capture is enabled.
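+        # Appending "+rms_norm" forces the RMSNorm CustomOp path, so that
+        # forward_hip (and with it the registered AITER custom ops) is used
+        # instead of the torch.compile'd native implementation.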
+        if use_v1 and use_aiter_rms_norm and not is_eager_execution:
+            compilation_config.custom_ops.append("+rms_norm")

     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:

From 9dbefd88e9bbd3ffa12494424db3ee79aebdc9e7 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 10 Sep 2025 14:08:21 +0100
Subject: [PATCH 109/584] [Docs] Improve organisation of API Reference nav (#24569)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/.nav.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/.nav.yml b/docs/.nav.yml
index 8a21dc9f1d703..327755582a821 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -48,7 +48,7 @@ nav:
   - Design Documents: design
   - API Reference:
     - api/README.md
-    - api/vllm/*
+    - api/vllm
   - CLI Reference: cli
   - Community:
     - community/*

From 8b83b937397ca921ca59eb14515da0ebf0d73b6d Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 10 Sep 2025 09:09:49 -0400
Subject: [PATCH 110/584] [Docs] Document the extra memory footprint overhead when using EPLB (#24537)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 docs/serving/expert_parallel_deployment.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index 7bf87b151e6af..f8701870864dc 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -156,6 +156,13 @@ vllm serve Qwen/Qwen3-30B-A3B \
 - **Default**: Each EP rank has `NUM_TOTAL_EXPERTS ÷ NUM_EP_RANKS` experts
 - **With redundancy**: Each EP rank has `(NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS` experts

+### Memory Footprint Overhead
+
+EPLB uses redundant experts, which need to fit in GPU memory. This means that EPLB may not be a good fit for memory-constrained environments or when KV cache space is at a premium.
+
+This overhead equals `NUM_MOE_LAYERS * BYTES_PER_EXPERT * (NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS`.
+For DeepSeekV3, this is approximately `2.4 GB` for one redundant expert per rank.
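A quick sanity check of that `2.4 GB` figure, as a minimal sketch: the DeepSeek-V3 numbers below (58 MoE layers; FP8 experts with hidden size 7168, MoE intermediate size 2048, and three projection matrices per expert) are illustrative assumptions, not values taken from this patch.

```python
# Back-of-the-envelope estimate of EPLB's per-rank memory overhead,
# using assumed DeepSeek-V3 shapes (not authoritative).
num_moe_layers = 58                  # layers with routed experts
bytes_per_expert = 3 * 7168 * 2048   # gate/up/down projections, FP8 (1 byte)
num_ep_ranks = 32
num_redundant_experts = 32           # one redundant expert per rank

# Extra expert weights each rank must hold because of redundancy.
overhead = num_moe_layers * bytes_per_expert * num_redundant_experts / num_ep_ranks
print(f"~{overhead / 1024**3:.1f} GiB per rank")  # ~2.4 GiB
```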
+ ### Example Command Single node deployment with EPLB enabled: From 72d30108a0fe06a1ba14d8645dda830a9ab01791 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:10:06 +0300 Subject: [PATCH 111/584] Support for NemotronH Nano VLM (#23644) Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com> --- tests/models/registry.py | 3 + vllm/config/__init__.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 1395 +++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 1400 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/nano_nemotron_vl.py diff --git a/tests/models/registry.py b/tests/models/registry.py index a5e83bc11f144..c80f045d98743 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -515,6 +515,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), + "NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy", + is_available_online=False, + trust_remote_code=True), "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, max_transformers_version="4.53", transformers_version_reason="HF model is not compatible", # noqa: E501 diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 7422527a6854b..4bab06a98cb21 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1552,7 +1552,7 @@ class ModelConfig: for bc in block_configs[start:end]) else: # Hybrid model Jamba - layers_block_type_value = getattr(self.hf_config, + layers_block_type_value = getattr(self.hf_text_config, "layers_block_type", None) if layers_block_type_value is not None: if hasattr(self.hf_text_config, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py new file mode 100644 index 0000000000000..21765a483b8e0 --- /dev/null +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -0,0 +1,1395 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# -------------------------------------------------------- +# Adapted from +# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py +# under Apache-2.0 License +# LICENSE is in root directory. 
+# -------------------------------------------------------- + +import copy +import warnings +from abc import ABC, abstractmethod +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union + +import numpy.typing as npt +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import (AutoModel, BatchEncoding, BatchFeature, + PretrainedConfig, TensorType) + +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import ReLUSquaredActivation +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, + MultiModalEmbeddings, + SupportsMultiModal) +from vllm.model_executor.models.internvl import (calculate_internvl_targets, + get_internvl_target_ratios) +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM +from vllm.model_executor.models.utils import (flatten_bn, + init_vllm_registered_model, + maybe_prefix, + merge_multimodal_embeddings) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs, MultiModalKwargsItems, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +# Configure PIL to handle large images without warnings +# This prevents DecompressionBombWarning for legitimate large images +Image.MAX_IMAGE_PIXELS = None # Disable the limit entirely +# Alternative: Set a specific higher limit +# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels + +IMG_START = "<img>" +IMG_END = "</img>" +IMG_CONTEXT = "<image>" + +# Profiling +MAX_FRAMES = 16 + + +class NanoNemotronVLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values_flat: torch.Tensor + """ + Shape: + `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + +class NanoNemotronVLImageEmbeddinInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_images, total_image_feature_size, hidden_size)` + or a list of tensors of shape `(total_image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +NanoNemotronVLImageInputs = Union[NanoNemotronVLImagePixelInputs, + NanoNemotronVLImageEmbeddinInputs] + + +class NanoNemotronVLVideoPixelInputs(TensorSchema): + """ + Dimensions: + - bvf: Batch size * number of videos * num_frames + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each video frame + - w: Width of each video frame + """ + type: Literal["pixel_values_videos"] + pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")] + num_patches: Annotated[torch.Tensor, TensorShape("bn")] + + +class NanoNemotronVLVideoEmbeddingInputs(TensorSchema): + """ + Dimensions: + - n: Number of videos + - f: Total video feature size + - h: Hidden size (must match the hidden size of language model backbone) + """ + type: Literal["video_embeds"] + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("n", "f", "h")] + + +NanoNemotronVLVideoInputs = Union[NanoNemotronVLVideoPixelInputs, + NanoNemotronVLVideoEmbeddingInputs] + + +def input_conditioner(x, norm_mean, norm_std): + y = (x - norm_mean) / norm_std + return y + + +def dynamic_preprocess(image, + *, + image_size=512, + max_num_tiles=12, + use_thumbnail=True, + idx=0): + orig_width, orig_height = image.size + + target_ratios = get_internvl_target_ratios(1, max_num_tiles) + + blocks, target_width, target_height = calculate_internvl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + + processed_images = [ + img.convert("RGB") if img.mode != "RGB" else img + for img in processed_images + ] + processed_images = [ + T.Resize((image_size, image_size), + interpolation=T.InterpolationMode.BICUBIC)(img) + for img in processed_images + ] + processed_images = [T.ToTensor()(img) for img in processed_images] + return processed_images + + +def image_to_pixel_values( + image: Image.Image, + *, + input_size: int, + max_num: int, + use_thumbnail: bool, + idx: int, +) -> torch.Tensor: + images = dynamic_preprocess( + image, + image_size=input_size, + max_num_tiles=max_num, + use_thumbnail=use_thumbnail, + idx=idx, + ) + + pixel_values = torch.stack(images) + return pixel_values + + +def video_to_pixel_values( + video: npt.NDArray, + *, + input_size: int, + max_num_tiles: int = 1, + use_thumbnail: bool, +) -> torch.Tensor: + # Convert each frame to a single resized tile tensor consistent + # with image path + frames_tensors: list[torch.Tensor] = [] + for frame in video: + pil_frame = dynamic_preprocess( + Image.fromarray(frame, mode="RGB"), + image_size=input_size, + max_num_tiles=max_num_tiles, + use_thumbnail=use_thumbnail, + idx=0, + ) + # dynamic_preprocess returns tensors already; take the single tile + assert len(pil_frame) >= 1 + frames_tensors.append(pil_frame[0]) + + return torch.stack(frames_tensors) + + +class BaseNanoNemotronVLProcessor(ABC): + """ 
+ This model doesn't define its own HF processor, + so we implement our own one here. + + The code to insert image tokens is based on: + https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252 + """ + + def __init__(self, config: PretrainedConfig, tokenizer: AnyTokenizer, + *args, **kwargs) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + image_size: int = config.force_image_size + patch_size: int = config.patch_size + + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.image_size = image_size + self.use_thumbnail: bool = config.use_thumbnail + self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1) + self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1) + + @property + @abstractmethod + def image_token_id(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_repl( + self, + feature_size: int, + num_patches: Optional[int], + ) -> PromptUpdateDetails[str]: + raise NotImplementedError + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + max_num_tiles: int, + ) -> int: + target_ratios = get_internvl_target_ratios(1, max_num_tiles) + + num_patches, _, _ = calculate_internvl_targets( + orig_width=image_width, + orig_height=image_height, + target_ratios=target_ratios, + image_size=self.image_size, + use_thumbnail=self.use_thumbnail, + ) + + return num_patches * self.num_image_token + + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + max_num_tiles: int, + ) -> list[torch.Tensor]: + return [ + image_to_pixel_values( + image, + input_size=self.image_size, + max_num=max_num_tiles, + use_thumbnail=self.use_thumbnail, + idx=idx, + ) for idx, image in enumerate(images) + ] + + def _preprocess_image( + self, + text: list[str], + images: list[Image.Image], + max_num_tiles: int, + ) -> tuple[list[str], dict[str, torch.Tensor]]: + if len(images) == 0: + image_inputs = {} + else: + pixel_values_lst = self._images_to_pixel_values_lst( + images, max_num_tiles) + image_inputs: dict[str, NestedTensors] = { + "pixel_values_flat": + input_conditioner(torch.cat(pixel_values_lst), self.norm_mean, + self.norm_std), + "image_num_patches": + torch.tensor([len(item) for item in pixel_values_lst]), + } + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + image_repl = self.get_image_repl(feature_size, num_patches) + text = [t.replace('<image>', image_repl.full, 1) for t in text] + return text, image_inputs + + def _make_batch_input(self, + input_item: Optional[Union[Any, list[Any]]] = None): + if input_item is None: + input_item = [] + if not isinstance(input_item, list): + input_item = [input_item] + return input_item + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + max_num_tiles: Optional[int] = None, + ) -> Mapping[str, NestedTensors]: + # Use default if not provided + if max_num_tiles is None: + max_num_tiles = 12 + + text, images = [self._make_batch_input(x) for x in (text, images)] + + text, image_inputs = self._preprocess_image( + text=text, + images=images, + max_num_tiles=max_num_tiles, + ) + + text_inputs = self.tokenizer(text, add_special_tokens=False) + + return { + **BatchEncoding(text_inputs, tensor_type=return_tensors), + **image_inputs, + } + + +class 
NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): + """ + HF Processor with extended video processing logic. + Code for video processing is adapted from video example: + https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + video_token: Optional[str] = None, + ) -> None: + super().__init__( + config=config, + tokenizer=tokenizer, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + # add extra video token for video processing + self.video_token = video_token + + @property + def supports_video(self) -> bool: + return self.video_token_id is not None + + @property + def video_token_id(self) -> Optional[int]: + if self.video_token is None: + return None + return self.tokenizer.get_vocab().get(self.video_token, None) + + @property + def image_token_id(self) -> int: + return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT) + + def _videos_to_pixel_values_lst( + self, + videos: list[npt.NDArray], + max_num_tiles: int, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + + return [ + video_to_pixel_values( + video, + input_size=self.image_size, + max_num_tiles=max_num_tiles, + use_thumbnail=self.use_thumbnail, + ) for video in videos + ] + + def _preprocess_video( + self, + text: list[str], + videos: list[npt.NDArray], + max_num_tiles: int, + dynamic_image_size: Optional[bool] = None, + ): + if len(videos) == 0 or not self.supports_video: + video_inputs = {} + else: + pixel_values_lst_video = self._videos_to_pixel_values_lst( + videos, + max_num_tiles=max_num_tiles, + dynamic_image_size=dynamic_image_size, + ) + + video_inputs: dict[str, NestedTensors] = { + "pixel_values_flat_video": + input_conditioner(torch.cat(pixel_values_lst_video), + self.norm_mean, self.norm_std), + "video_num_patches": + torch.tensor([len(item) for item in pixel_values_lst_video]), + } + + for pixel_values in pixel_values_lst_video: + num_patches = pixel_values.shape[0] + + video_repl = self.get_video_repl(self.num_image_token, + num_patches, self.video_token) + text = [t.replace('<video>', video_repl.full, 1) for t in text] + return text, video_inputs + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + max_num_tiles: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> Mapping[str, NestedTensors]: + # Use default if not provided + if max_num_tiles is None: + max_num_tiles = 12 + + text, images, videos = [ + self._make_batch_input(x) for x in (text, images, videos) + ] + + text, image_inputs = self._preprocess_image( + text=text, + images=images, + max_num_tiles=max_num_tiles, + ) + + text, video_inputs = self._preprocess_video( + text=text, + videos=videos, + max_num_tiles=max_num_tiles, + dynamic_image_size=dynamic_image_size, + ) + + text_inputs = self.tokenizer(text, add_special_tokens=False) + + return BatchFeature({ + **BatchEncoding(text_inputs, tensor_type=return_tensors), + **image_inputs, + **video_inputs, + }) + + def get_image_repl( + self, + feature_size: int, + num_patches: Optional[int], + ) -> PromptUpdateDetails[str]: + repl_features = 
IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + + def get_video_repl( + self, + feature_size: int, + num_patches: Optional[int] = None, + video_context_token: str = IMG_CONTEXT, + ) -> PromptUpdateDetails[str]: + repl_features = video_context_token * self.num_image_token + repl_features_with_sep = IMG_START + repl_features + IMG_END + # num_patches is equal to num_frames + repl_full = ''.join([ + f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) + ]) + + return PromptUpdateDetails.select_text(repl_full, video_context_token) + + +class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternVL-style models.""" + + @abstractmethod + def get_hf_processor( + self, + **kwargs: object, + ) -> BaseNanoNemotronVLProcessor: + raise NotImplementedError + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + max_num_tiles: int, + processor: Optional[BaseNanoNemotronVLProcessor], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + max_num_tiles=max_num_tiles, + ) + + def get_image_size_with_most_features(self, + max_num_tiles: int) -> ImageSize: + processor = self.get_hf_processor() + + base_size = processor.image_size + target_ratios = get_internvl_target_ratios(1, max_num_tiles) + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_size * wr, base_size * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + max_num_tiles=max_num_tiles, + processor=processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + + def get_max_image_tokens(self) -> int: + processor = self.get_hf_processor() + # Use default max_num_tiles for max tokens calculation + max_num_tiles = 12 + target_width, target_height = self.get_image_size_with_most_features( + max_num_tiles) + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + max_num_tiles=max_num_tiles, + processor=processor, + ) + + +_I = TypeVar("_I", bound=BaseNanoNemotronVLProcessingInfo) + + +class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo): + """ ProcessingInfo extended for video processing""" + + @property + def supports_video(self): + return self.get_hf_processor().supports_video + + def get_supported_mm_limits(self): + video_limit = {"video": None} if self.supports_video else {} + return {**super().get_supported_mm_limits(), **video_limit} + + def get_video_token(self) -> Optional[str]: + return IMG_CONTEXT + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + processor = self.get_hf_processor() # we get the CustomProcessor here + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = (seq_len - + max_image_tokens) // processor.num_image_token + max_frames_per_video = 
max_total_frames // max(max_videos, 1) + + max_frames_per_video = min(max_frames_per_video, MAX_FRAMES) + return max(max_frames_per_video, 1) + + def get_hf_processor(self, **kwargs: object) -> NanoNemotronVLProcessor: + return self.ctx.init_processor( + NanoNemotronVLProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + video_token=self.get_video_token(), + **kwargs, + ) + + +class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]): + """Basic image-only MultiModalProcessor for InternVL-style models.""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + image_token_id = hf_processor.image_token_id + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = torch.tensor(image_token_id) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) + + return dict( + pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_custom(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + # Extract max_num_tiles from kwargs, default to 12 + max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + max_num_tiles=max_num_tiles, + processor=hf_processor, + ) + + num_patches = None + local_image_num_patches = image_num_patches + if isinstance(local_image_num_patches, torch.Tensor): + local_image_num_patches = local_image_num_patches.tolist() + if isinstance( + local_image_num_patches, + (list, tuple)) and item_idx < len(local_image_num_patches): + num_patches = int(local_image_num_patches[item_idx]) + + return hf_processor.get_image_repl(feature_size, num_patches) + + return [ + PromptReplacement( + modality="image", + 
target="<image>", + replacement=get_replacement_custom, + ) + ] + + +class NanoNemotronVLMultiModalProcessor( + NanoNemotronBaseVLMultiModalProcessor[NanoNemotronVLProcessingInfo]): + """MultiModalProcessor extended for video support""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs, tok_kwargs) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + if self.info.supports_video and ( + video_token_id := hf_processor.video_token_id) is not None: + processed_outputs["video_token_id"] = torch.tensor(video_token_id) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_fields = super()._get_mm_fields_config(hf_inputs, + hf_processor_mm_kwargs) + if self.info.supports_video: + video_num_patches = hf_inputs.get("video_num_patches", + torch.empty(0)) + num_videos = len(video_num_patches) + video_fields = dict( + pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_patches), + video_num_patches=MultiModalFieldConfig.batched("video"), + video_token_id=MultiModalFieldConfig.shared( + "video", num_videos)) + else: + video_fields = {} + + return image_fields | video_fields + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + prompt_repl = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) + + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] + assert isinstance(video_num_patches, torch.Tensor) + video_num_patches = video_num_patches.tolist() + else: + video_num_patches = [] + + def get_video_replacement_internvl(item_idx: int): + feature_size = hf_processor.num_image_token + num_patches = video_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return hf_processor.get_video_repl( + feature_size, + num_patches, + video_context_token=hf_processor.video_token) + + if self.info.supports_video: + prompt_repl = [ + *prompt_repl, + PromptReplacement( + modality="video", + target="<video>", + replacement=get_video_replacement_internvl, + ) + ] + + return prompt_repl + + +class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + """Basic image-only DummyInputsBuilder for InternVL-style models.""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + return "<image>" * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + # Use default max_num_tiles for dummy data generation + max_num_tiles = 12 + target_width, target_height = ( + self.info.get_image_size_with_most_features(max_num_tiles)) + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + +class NanoNemotronVLDummyInputsBuilder( + 
NanoNemotronVLDummyInputsBuilder[NanoNemotronVLProcessingInfo]): + """DummyInputsBuilder extended for video support""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_videos = mm_counts.get("video", 0) + + return super().get_dummy_text(mm_counts) + "<video>" * num_videos + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + dummy_image = super().get_dummy_mm_data(seq_len=seq_len, + mm_counts=mm_counts) + if self.info.supports_video: + config = self.info.get_hf_config() + image_size: int = config.force_image_size + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + num_videos = mm_counts.get("video", 0) + dummy_video = { + "video": + self._get_dummy_videos(width=image_size, + height=image_size, + num_frames=target_num_frames, + num_videos=num_videos) + } + else: + dummy_video = {} + return {**dummy_image, **dummy_video} + + +@MULTIMODAL_REGISTRY.register_processor( + NanoNemotronVLMultiModalProcessor, + info=NanoNemotronVLProcessingInfo, + dummy_inputs=NanoNemotronVLDummyInputsBuilder, +) +class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, + SupportsMultiModal): + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<image>" + if modality.startswith("video"): + return "<video>" + return None + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + image_size = config.force_image_size + patch_size = config.patch_size + self.patch_size = patch_size + self.template = config.template + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + self.image_tag_type = config.image_tag_type + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + self.vision_model = AutoModel.from_config(config.vision_config, + trust_remote_code=True) + self.vision_model.model._initialize_weights = ( + self.vision_model.model._init_weights) + # Move input normalization to processor to mirror original HF + # implementation where normalization is done in fp32 + self.vision_model.radio_model.make_preprocessor_external() + self.vision_model = self.vision_model.to( + self.language_model.config.torch_dtype) + + self.drop_vision_class_token = True + + # Construct the vision projection. 
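+        # The pixel shuffle below folds a (1/downsample_ratio) x
+        # (1/downsample_ratio) patch neighborhood into the channel dimension,
+        # so the projector input is vit_hidden_size * (1/downsample_ratio)**2
+        # channels per token (4x for downsample_ratio=0.5) before being
+        # projected down to the language model hidden size.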
+ vit_hidden_size = config.vit_hidden_size + vision_projection_hidden_size = config.projector_hidden_size + llm_hidden_size = config.text_config.hidden_size + + self.mlp1 = nn.Sequential( + RMSNorm(hidden_size=vit_hidden_size * + int(1 / self.downsample_ratio)**2, + eps=1e-5), + nn.Linear( + vit_hidden_size * int(1 / self.downsample_ratio)**2, + vision_projection_hidden_size, + bias=False, + ), + ReLUSquaredActivation(), + nn.Linear(vision_projection_hidden_size, + llm_hidden_size, + bias=False), + ) + self.mlp1 = self.mlp1.to(self.language_model.config.torch_dtype) + + self.img_context_token_id = None + self.video_context_token_id = None + self.config = config + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view( + n, + w, + int(h * scale_factor), + int(c / scale_factor), + ) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> + # N, H * scale, W * scale, C // (scale ** 2) + x = x.view( + n, + int(h * scale_factor), + int(w * scale_factor), + int(c / (scale_factor * scale_factor)), + ) + if self.ps_version == "v1": + warnings.warn( + "In ps_version 'v1', the height and width have not " + "been swapped back, which results in a transposed image.", + stacklevel=2, + ) + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + vit_embeds = self.vision_model(pixel_values).features + vit_embeds = vit_embeds.to(dtype=torch.bfloat16) + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[NanoNemotronVLImageInputs]: + pixel_values_flat = kwargs.pop("pixel_values_flat", None) + image_num_patches = kwargs.pop("image_num_patches", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values_flat is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return NanoNemotronVLImageEmbeddinInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) + + image_token_id = kwargs["image_token_id"] + assert isinstance(image_token_id, torch.Tensor) + self.img_context_token_id = image_token_id.flatten().unique().item() + + if pixel_values_flat is not None: + if not isinstance(pixel_values_flat, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values_flat)}") + + if not isinstance(image_num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of image_num_patches. 
" + f"Got type: {type(image_num_patches)}") + + pixel_values_flat = flatten_bn(pixel_values_flat, concat=True) + image_num_patches = flatten_bn(image_num_patches, concat=True) + + return NanoNemotronVLImagePixelInputs( + type="pixel_values", + pixel_values_flat=pixel_values_flat, + num_patches=image_num_patches, + ) + + raise AssertionError("This line should be unreachable.") + + def _process_image_input( + self, image_input: NanoNemotronVLImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + + image_embeds = self.extract_feature(image_input["pixel_values_flat"]) + num_patches = image_input["num_patches"] + + # Only one image in the current batch + if len(num_patches) == 1: + return (image_embeds.view(-1, + self.config.text_config.hidden_size), ) + + # NOTE: Image embeddings are split into separate tensors for each image + # by the size of each embedding. + feature_size = image_embeds.shape[1] + image_embeds = image_embeds.view(-1, + self.config.text_config.hidden_size) + image_feature_sizes = [ + num_patches * feature_size for num_patches in num_patches + ] + return image_embeds.split(image_feature_sizes) + + def _parse_and_validate_video_input( + self, + **kwargs: object) -> Optional[NanoNemotronVLVideoPixelInputs]: + pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None) + video_num_patches = kwargs.pop("video_num_patches", None) + video_embeds = kwargs.pop("video_embeds", None) + + if pixel_values_flat_video is None and video_embeds is None: + return None + + if video_embeds is not None: + return NanoNemotronVLVideoEmbeddingInputs( + type="video_embeds", + data=flatten_bn(video_embeds), + ) + + video_token_id = kwargs["video_token_id"] + assert isinstance(video_token_id, torch.Tensor) + self.video_context_token_id = video_token_id.flatten().unique().item() + + if pixel_values_flat_video is not None: + if not isinstance(pixel_values_flat_video, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values_flat_video)}") + + if not isinstance(video_num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of image_num_patches. " + f"Got type: {type(video_num_patches)}") + + pixel_values_flat_video = flatten_bn(pixel_values_flat_video, + concat=True) + video_num_patches = flatten_bn(video_num_patches, concat=True) + expected_h = expected_w = self.config.force_image_size + resolve_bindings = {"h": expected_h, "w": expected_w} + + return NanoNemotronVLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_flat=pixel_values_flat_video, + num_patches=video_num_patches, + resolve_bindings=resolve_bindings, + ) + + raise AssertionError("This line should be unreachable.") + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+        for input_key in kwargs:
+            if input_key in ("pixel_values_flat",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_flat_video",
+                             ) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+        # Validate the multimodal input keyword arguments
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if modalities is None:
+            return []
+
+        # The resulting multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_image_input(video_input)
+                multimodal_embeddings += video_embeddings
+
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+        if (multimodal_embeddings is not None
+                and len(multimodal_embeddings) != 0):
+            context_token_ids = [
+                token_id for token_id in (self.img_context_token_id,
+                                          self.video_context_token_id)
+                if token_id is not None
+            ]
+            assert len(context_token_ids) >= 1
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                context_token_ids,
+            )
+
+        return inputs_embeds
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            input_ids = None
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + return hidden_states + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="mlp1", + tower_model="vision_model", + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + def is_vision_model_weights(weight: tuple[str, torch.Tensor]): + return weight[0].startswith("vision_model") + + def is_adapter_weights(weight: tuple[str, torch.Tensor]): + return weight[0].startswith("mlp1") + + # Get references to parameters for direct loading + vision_model_dict = dict(self.vision_model.named_parameters()) + vision_model_buffers = dict(self.vision_model.named_buffers()) + adapter_dict = dict(self.mlp1.named_parameters()) + + def llm_weights_generator(): + # Single pass over weights + for name, w in weights: + if is_vision_model_weights((name, w)): + # Load vision encoder weights directly + trimmed_name = ".".join(name.split(".")[1:]) + if "input_conditioner" in trimmed_name: + continue + if trimmed_name in vision_model_buffers: + param = vision_model_buffers[trimmed_name] + else: + param = vision_model_dict[trimmed_name] + with torch.no_grad(): + default_weight_loader(param, w) + elif is_adapter_weights((name, w)): + # Load vision-language adapter weights directly + trimmed_name = ".".join(name.split(".")[1:]) + param = adapter_dict[trimmed_name] + with torch.no_grad(): + default_weight_loader(param, w) + else: + # LLM weights: yield them to be loaded + # by language_model.load_weights + assert name.startswith("language_model") + trimmed_name = ".".join(name.split(".")[1:]) + yield (trimmed_name, w) + + # Now we call the language model load with the generator + self.language_model.load_weights(llm_weights_generator()) + + def print_architecture(self, + detailed: bool = True, + save_to_file: str = None): + """ + Print model architecture with parameter names, shapes, and sizes. 
+ + Args: + detailed: If True, show detailed parameter breakdown + save_to_file: If provided, save output to this file path + """ + import sys + from io import StringIO + + # Capture output if saving to file + original_stdout = sys.stdout + if save_to_file: + sys.stdout = StringIO() + + try: + print("=" * 100) + print("NemotronH_Nano_VL Model Architecture") + print("=" * 100) + + total_params = 0 + param_groups = { + "language_model": [], + "vision_model": [], + "mlp1": [], + "other": [], + } + + for name, param in self.named_parameters(): + param_size = param.numel() + total_params += param_size + + # Group parameters by main component + if name.startswith("language_model"): + param_groups["language_model"].append( + (name, param.shape, param_size, param.dtype)) + elif name.startswith("vision_model"): + param_groups["vision_model"].append( + (name, param.shape, param_size, param.dtype)) + elif name.startswith("mlp1"): + param_groups["mlp1"].append( + (name, param.shape, param_size, param.dtype)) + else: + param_groups["other"].append( + (name, param.shape, param_size, param.dtype)) + + if detailed: + print(f"{name:<70} | Shape: {str(param.shape):<25} | " + f"Size: {param_size:>12,} | Dtype: {param.dtype}") + + print("=" * 100) + print("Summary by Component:") + print("-" * 60) + + for component, params in param_groups.items(): + if params: # Only show components that have parameters + component_total = sum(size for _, _, size, _ in params) + percentage = ((component_total / total_params) * + 100 if total_params > 0 else 0) + print(f"{component:<20} | Parameters: {len(params):>4} | " + f"Total Size: {component_total:>15,} | " + f"{percentage:>6.2f}%") + + print("-" * 60) + print(f"{'Total Parameters':<20} | {total_params:>15,}") + + # Estimate memory usage (assuming bfloat16 = 2 bytes per parameter) + memory_mb = total_params * 2 / (1024**2) + memory_gb = memory_mb / 1024 + print(f"{'Est. Memory (MB)':<20} | {memory_mb:>15.2f}") + print(f"{'Est. Memory (GB)':<20} | {memory_gb:>15.2f}") + print("=" * 100) + + # Save to file if requested + if save_to_file: + output = sys.stdout.getvalue() + sys.stdout = original_stdout + with open(save_to_file, "w") as f: + f.write(output) + print(f"Architecture saved to: {save_to_file}") + print(output) # Also print to console + + finally: + if save_to_file and sys.stdout != original_stdout: + sys.stdout = original_stdout + + def get_model_info(self): + """ + Get basic model information as a dictionary. + """ + total_params = sum(p.numel() for p in self.parameters()) + + component_info = {} + for name, param in self.named_parameters(): + component = name.split(".")[0] + if component not in component_info: + component_info[component] = {"params": 0, "size": 0} + component_info[component]["params"] += 1 + component_info[component]["size"] += param.numel() + + return { + "model_name": "NemotronH_Nano_VL", + "total_parameters": total_params, + "memory_estimate_mb": total_params * 2 / (1024**2), # bfloat16 + "components": component_info, + "config": { + "image_size": getattr(self.config, "force_image_size", None), + "patch_size": getattr(self.config, "patch_size", None), + "num_image_token": self.num_image_token, + "downsample_ratio": self.downsample_ratio, + }, + } + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + return self.language_model.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return (self.language_model.mamba_cache. 
+ get_seqlen_agnostic_capture_inputs(batch_size)) + + @classmethod + def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"): + text_config = vllm_config.model_config.hf_config.text_config + temp_vllm_config = copy.deepcopy(vllm_config) + temp_vllm_config.model_config.hf_config = text_config + return NemotronHForCausalLM.get_mamba_state_shape_from_config( + temp_vllm_config) + + @classmethod + def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"): + text_config = vllm_config.model_config.hf_config.text_config + temp_vllm_config = copy.deepcopy(vllm_config) + temp_vllm_config.model_config.hf_config = text_config + return NemotronHForCausalLM.get_mamba_state_dtype_from_config( + temp_vllm_config) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ed7797ce9d4f1..fbc369c870c57 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -223,6 +223,7 @@ _MULTIMODAL_MODELS = { "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + "NemotronH_Nano_VL": ("nano_nemotron_vl", "NemotronH_Nano_VL"), "InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501 "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), From 6cbd41909edaee82786b5a896ccc2fbccd934aa8 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Wed, 10 Sep 2025 09:10:14 -0400 Subject: [PATCH 112/584] Feature/vit attention unification# 23880 (#23978) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/kernels/attention/test_mha_attn.py | 22 ++++++++++++---- vllm/attention/layer.py | 25 ++++++++++++++----- .../models/idefics2_vision_model.py | 3 +++ vllm/model_executor/models/intern_vit.py | 11 ++++---- vllm/model_executor/models/interns1_vit.py | 17 ++++++------- vllm/model_executor/models/mllama.py | 20 ++++++--------- vllm/model_executor/models/step3_vl.py | 23 ++++++----------- vllm/model_executor/models/vision.py | 2 +- vllm/platforms/interface.py | 1 + 9 files changed, 68 insertions(+), 56 deletions(-) diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 53c37554b15a3..c01ea32994da0 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -23,6 +23,9 @@ def clear_cache(): """Clear lru cache to ensure each test case runs without caching. 
""" _cached_get_attn_backend.cache_clear() + # Clear xformers availability cache + import vllm.attention.layer as layer_module + layer_module.USE_XFORMERS_OPS = None @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) @@ -33,19 +36,28 @@ def test_mha_attn_platform(device: str): torch.set_default_dtype(torch.float16) if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): + with patch("vllm.attention.selector.current_platform", + CpuPlatform()), \ + patch("vllm.platforms.current_platform", CpuPlatform()): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == _Backend.TORCH_SDPA + assert attn.attn_backend == _Backend.TORCH_SDPA_VLLM_V1 elif device == "hip": - with patch("vllm.attention.selector.current_platform", RocmPlatform()): + with patch("vllm.attention.selector.current_platform", + RocmPlatform()), \ + patch("vllm.platforms.current_platform", RocmPlatform()), \ + patch("vllm.attention.layer.current_platform", RocmPlatform()): attn = MultiHeadAttention(16, 64, scale=1) assert attn.attn_backend == _Backend.TORCH_SDPA else: - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with patch("vllm.attention.selector.current_platform", + CudaPlatform()), \ + patch("vllm.platforms.current_platform", CudaPlatform()): attn = MultiHeadAttention(16, 64, scale=1) assert attn.attn_backend == _Backend.XFORMERS - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with patch("vllm.attention.selector.current_platform", + CudaPlatform()), \ + patch("vllm.platforms.current_platform", CudaPlatform()): attn = MultiHeadAttention(16, 72, scale=1) assert attn.attn_backend == _Backend.XFORMERS diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 237802afccde9..be4dc3eb3c0da 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -360,13 +360,13 @@ class MultiHeadAttention(nn.Module): # currently, only torch_sdpa is supported on rocm self.attn_backend = _Backend.TORCH_SDPA else: - if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1, - _Backend.FLEX_ATTENTION): - backend = _Backend.XFORMERS - self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 - } else _Backend.TORCH_SDPA + _Backend.TORCH_SDPA, + _Backend.TORCH_SDPA_VLLM_V1, + _Backend.XFORMERS, + _Backend.PALLAS_VLLM_V1, + _Backend.ROCM_AITER_FA, + } else current_platform.get_vit_attn_backend() if (self.attn_backend == _Backend.XFORMERS and not check_xformers_availability()): @@ -413,6 +413,19 @@ class MultiHeadAttention(nn.Module): from torch_xla.experimental.custom_kernel import flash_attention out = flash_attention(query, key, value, sm_scale=self.scale) out = out.transpose(1, 2) + elif self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + + # ROCm Flash Attention expects (batch, seq, heads, head_dim) + out = flash_attn_varlen_func(query, + key, + value, + softmax_scale=self.scale) + else: + # ViT attention hasn't supported this backend yet + raise NotImplementedError( + f"ViT attention hasn't supported {self.attn_backend} " + f"backend yet.") return out.reshape(bsz, q_len, -1) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 0ca2e9e4bb688..ea5d6f29f6cfd 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -170,6 +170,7 @@ class Idefics2VisionAttention(nn.Module): 
quant_config=quant_config, prefix=f"{prefix}.out_proj", ) + # Use unified MultiHeadAttention with Flash Attention support self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -181,6 +182,8 @@ class Idefics2VisionAttention(nn.Module): hidden_states ) # batch_size, q_len, 3 * num_heads_per_partition * head_dim query_states, key_states, value_states = qkv.chunk(3, dim=-1) + + # Use unified MultiHeadAttention implementation out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 58e8163e0b26e..8e9ab9649bd44 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -255,6 +255,10 @@ class InternSdpaAttention(nn.Module): self.proj = nn.Linear(self.dummy_dim, self.embed_dim) + # Use unified MultiHeadAttention with automatic backend selection + self.attn = MultiHeadAttention(self.num_heads, self.head_dim, + self.scale) + def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape qkv = self.qkv(x) @@ -268,12 +272,9 @@ class InternSdpaAttention(nn.Module): B_, N_, H_, D_ = q.shape q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_) k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_) - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).reshape(B, N, -1) + # Use unified MultiHeadAttention with automatic backend selection + x = self.attn(q, k, v) x = self.proj(x) return x diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index 300ed17ecaabc..eb6b685d03dc5 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -12,10 +12,10 @@ from typing import Optional import torch import torch.nn as nn -import torch.nn.functional as F from transformers import PretrainedConfig from transformers.utils import torch_int +from vllm.attention.layer import MultiHeadAttention from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -206,6 +206,10 @@ class InternSdpaAttention(nn.Module): self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim) + # Use unified MultiHeadAttention with automatic backend selection + self.attn = MultiHeadAttention(self.num_heads, self.head_dim, + self.scale) + def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -213,20 +217,13 @@ class InternSdpaAttention(nn.Module): k = self.k_proj(x) v = self.v_proj(x) - q = q.view(B, N, self.num_heads, self.head_dim) - k = k.view(B, N, self.num_heads, self.head_dim) - v = v.view(B, N, self.num_heads, self.head_dim) - if self.qk_normalization: B_, N_, H_, D_ = q.shape q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_) k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_) - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).reshape(B, N, -1) + # Use unified MultiHeadAttention with automatic backend selection + x = self.attn(q, k, v) x = self.projection_layer(x) return x diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 68aa16f8b9ecf..048894085b360 100644 --- 
a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -35,6 +35,7 @@ from transformers.models.mllama.processing_mllama import ( import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.attention.layer import MultiHeadAttention from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.selector import _Backend from vllm.config import VllmConfig @@ -517,6 +518,10 @@ class MllamaVisionSdpaAttention(nn.Module): prefix=f"{prefix}.o_proj", ) + # Use unified MultiHeadAttention with automatic backend selection + self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim, + 1.0 / math.sqrt(self.head_dim)) + def forward( self, hidden_state: torch.Tensor, @@ -524,21 +529,10 @@ class MllamaVisionSdpaAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_state) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q = q.view(q.shape[0], q.shape[1], self.num_local_heads, - self.head_dim).transpose(1, 2) - k = k.view(k.shape[0], k.shape[1], self.num_local_heads, - self.head_dim).transpose(1, 2) - v = v.view(v.shape[0], v.shape[1], self.num_local_heads, - self.head_dim).transpose(1, 2) - # TODO: remove padding in image encoder - attn_output = F.scaled_dot_product_attention(q, - k, - v, - attn_mask=attention_mask, - dropout_p=0.0) + # Use unified MultiHeadAttention with automatic backend selection + attn_output = self.attn(q, k, v) - attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(attn_output.shape[0], attn_output.shape[1], -1) output, _ = self.o_proj(attn_output) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 17299b64978e3..2ba5f94ea3b88 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -16,6 +16,7 @@ from torchvision import transforms from torchvision.transforms.functional import InterpolationMode from transformers import BatchFeature, PretrainedConfig, TensorType +from vllm.attention.layer import MultiHeadAttention from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -682,9 +683,9 @@ class Step3VisionAttention(nn.Module): prefix=f"{prefix}.out_proj", disable_tp=use_data_parallel) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() + # Use unified MultiHeadAttention with automatic backend selection + self.attn = MultiHeadAttention(self.num_heads, self.head_dim, + self.scale) def forward( self, @@ -696,19 +697,9 @@ class Step3VisionAttention(nn.Module): # get query proj qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - q = q.view(bsz, tgt_len, self.num_heads, self.head_dim) - k = k.view(bsz, tgt_len, self.num_heads, self.head_dim) - v = v.view(bsz, tgt_len, self.num_heads, self.head_dim) - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - attn_output = F.scaled_dot_product_attention(q, - k, - v, - scale=self.scale, - is_causal=False) - attn_output = attn_output.transpose(1, 2).reshape( - bsz, tgt_len, self.num_heads * self.head_dim) + + # Use unified MultiHeadAttention with automatic backend selection + attn_output = self.attn(q, k, v) attn_output, _ = self.out_proj(attn_output) diff --git a/vllm/model_executor/models/vision.py 
b/vllm/model_executor/models/vision.py index de30509b1ccb4..c16aa5ac608f9 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -122,4 +122,4 @@ def resolve_visual_encoder_outputs( uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) if post_layer_norm is not None and uses_last_layer: hs_pool[-1] = post_layer_norm(encoder_outputs) - return torch.cat(hs_pool, dim=-1) + return torch.cat(hs_pool, dim=-1) \ No newline at end of file diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index fdd3764d2c35d..0cea49eece42e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -48,6 +48,7 @@ class _Backend(enum.Enum): ROCM_AITER_MLA_VLLM_V1 = enum.auto() ROCM_AITER_FA = enum.auto() # used for ViT attn backend TORCH_SDPA = enum.auto() + TORCH_SDPA_VLLM_V1 = enum.auto() FLASHINFER = enum.auto() FLASHINFER_VLLM_V1 = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 From 9e3c3a7df286b0dffffe6309ce035d2a74affb6c Mon Sep 17 00:00:00 2001 From: Yash Pratap Singh <yashsingh20001@gmail.com> Date: Wed, 10 Sep 2025 18:42:03 +0530 Subject: [PATCH 113/584] [LoRA]: Add LoRA support to Mistral's Voxtral models (#24517) Signed-off-by: Yash Pratap Singh <yashsingh20001@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/voxtral.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bf4e243203489..ec65bfca10096 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -765,7 +765,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | -| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ | +| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. 
| | | ✅︎ | ### Pooling Models diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 2831c4df78ba7..9451670bc60c2 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -25,6 +25,7 @@ from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import SupportsPP +from vllm.model_executor.models.module_mapping import MultiModelKeys # yapf: disable from vllm.model_executor.models.whisper import WhisperEncoder # yapf: enable @@ -44,8 +45,8 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import (MistralTokenizer, cached_tokenizer_from_config) -from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, - SupportsTranscription) +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsTranscription) from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -313,9 +314,15 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] info=VoxtralProcessingInfo, dummy_inputs=VoxtralDummyInputsBuilder) class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP, SupportsTranscription): + SupportsPP, SupportsLoRA, + SupportsTranscription): supported_languages = ISO639_1_SUPPORTED_LANGS + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) @@ -341,6 +348,14 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model + def get_mm_mapping(self) -> MultiModelKeys: + """Get module prefix for multimodal models to filter LoRA modules.""" + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="audio_language_adapter", + tower_model=["whisper_encoder"], + ) + def forward( self, input_ids: torch.Tensor, From f36355abfd1d87d9e89fcc881fc99f3df72eeb9b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:14:18 +0100 Subject: [PATCH 114/584] Move `LoadConfig` from `config/__init__.py` to `config/load.py` (#24566) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/test_worker.py | 6 +- .../model_loader/test_registry.py | 3 +- .../test_runai_model_streamer_loader.py | 2 +- tests/test_config.py | 5 +- tests/v1/spec_decode/test_eagle.py | 3 +- vllm/config/__init__.py | 89 +-------------- vllm/config/load.py | 104 ++++++++++++++++++ vllm/model_executor/model_loader/__init__.py | 5 +- .../model_loader/base_loader.py | 3 +- .../model_loader/bitsandbytes_loader.py | 3 +- .../model_loader/default_loader.py | 3 +- .../model_loader/dummy_loader.py | 3 +- .../model_loader/gguf_loader.py | 3 +- .../model_loader/runai_streamer_loader.py | 3 +- .../model_loader/sharded_state_loader.py | 3 +- .../model_loader/tensorizer_loader.py | 3 +- .../model_loader/weight_utils.py | 3 +- 17 files changed, 137 insertions(+), 107 deletions(-) create mode 100644 vllm/config/load.py diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index a836ff94ba3ed..02bfe0bf914a9 100644 --- a/tests/lora/test_worker.py +++ 
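The Voxtral change above wires LoRA in through two hooks: packed_modules_mapping tells the adapter loader which checkpoint projections were fused into a single vLLM module, and get_mm_mapping() scopes LoRA to the language model rather than the Whisper encoder or audio adapter. A rough sketch of how such a mapping can be consulted; expand_targets is an illustrative helper, not vLLM's actual resolver:

# Fused vLLM module -> original checkpoint projections it absorbed.
packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}

def expand_targets(target_modules: list[str]) -> list[str]:
    """Map LoRA target names from the HF checkpoint layout onto the
    fused vLLM layout, de-duplicating fused hits."""
    fused: list[str] = []
    for packed, parts in packed_modules_mapping.items():
        if any(p in target_modules for p in parts):
            fused.append(packed)
    # Pass through anything that was never fused (e.g. o_proj).
    flat = {p for parts in packed_modules_mapping.values() for p in parts}
    fused += [t for t in target_modules if t not in flat]
    return fused

print(expand_targets(["q_proj", "v_proj", "o_proj"]))
# -> ['qkv_proj', 'o_proj']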
b/tests/lora/test_worker.py @@ -6,9 +6,9 @@ import random import tempfile from unittest.mock import patch -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config.load import LoadConfig from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker diff --git a/tests/model_executor/model_loader/test_registry.py b/tests/model_executor/model_loader/test_registry.py index 93a3e34835b5a..639ee6db9270f 100644 --- a/tests/model_executor/model_loader/test_registry.py +++ b/tests/model_executor/model_loader/test_registry.py @@ -4,7 +4,8 @@ import pytest from torch import nn -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.model_executor.model_loader import (get_model_loader, register_model_loader) from vllm.model_executor.model_loader.base_loader import BaseModelLoader diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py index 84c615b6b8dbc..22bdb3b44eb03 100644 --- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py +++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import SamplingParams -from vllm.config import LoadConfig +from vllm.config.load import LoadConfig from vllm.model_executor.model_loader import get_model_loader load_format = "runai_streamer" diff --git a/tests/test_config.py b/tests/test_config.py index 957771a4226bc..373fbd267539a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field import pytest from vllm.compilation.backends import VllmBackend -from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig, - get_field, update_config) +from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field, + update_config) +from vllm.config.load import LoadConfig from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 46e3a611c6d26..ddedc61aae296 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -12,9 +12,10 @@ from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata, create_standard_kv_cache_spec, get_attention_backend) -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.config.load import LoadConfig from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.platforms import current_platform from vllm.v1.spec_decode.eagle import EagleProposer diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4bab06a98cb21..c8d531f12a2e4 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -34,6 +34,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) from vllm.config.kv_events import KVEventsConfig from 
vllm.config.kv_transfer import KVTransferConfig +from vllm.config.load import LoadConfig from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -64,8 +65,6 @@ if TYPE_CHECKING: from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) - from vllm.model_executor.model_loader import LoadFormats - from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.sample.logits_processor import LogitsProcessor HfOverrides = Union[dict, Callable[[type], type]] @@ -75,8 +74,6 @@ else: QuantizationConfig = Any QuantizationMethods = Any BaseModelLoader = Any - LoadFormats = Any - TensorizerConfig = Any LogitsProcessor = Any HfOverrides = Union[dict[str, Any], Callable[[type], type]] @@ -1801,90 +1798,6 @@ class ModelConfig: return max_model_len -@config -@dataclass -class LoadConfig: - """Configuration for loading the model weights.""" - - load_format: Union[str, LoadFormats] = "auto" - """The format of the model weights to load:\n - - "auto" will try to load the weights in the safetensors format and fall - back to the pytorch bin format if safetensors format is not available.\n - - "pt" will load the weights in the pytorch bin format.\n - - "safetensors" will load the weights in the safetensors format.\n - - "npcache" will load the weights in pytorch format and store a numpy cache - to speed up the loading.\n - - "dummy" will initialize the weights with random values, which is mainly - for profiling.\n - - "tensorizer" will use CoreWeave's tensorizer library for fast weight - loading. See the Tensorize vLLM Model script in the Examples section for - more information.\n - - "runai_streamer" will load the Safetensors weights using Run:ai Model - Streamer.\n - - "bitsandbytes" will load the weights using bitsandbytes quantization.\n - - "sharded_state" will load weights from pre-sharded checkpoint files, - supporting efficient loading of tensor-parallel models.\n - - "gguf" will load weights from GGUF format files (details specified in - https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n - - "mistral" will load weights from consolidated safetensors files used by - Mistral models. - - Other custom values can be supported via plugins.""" - download_dir: Optional[str] = None - """Directory to download and load the weights, default to the default - cache directory of Hugging Face.""" - model_loader_extra_config: Union[dict, TensorizerConfig] = field( - default_factory=dict) - """Extra config for model loader. This will be passed to the model loader - corresponding to the chosen load_format.""" - device: Optional[str] = None - """Device to which model weights will be loaded, default to - device_config.device""" - ignore_patterns: Optional[Union[list[str], str]] = None - """The list of patterns to ignore when loading the model. Default to - "original/**/*" to avoid repeated loading of llama's checkpoints.""" - use_tqdm_on_load: bool = True - """Whether to enable tqdm for showing progress bar when loading model - weights.""" - pt_load_map_location: Union[str, dict[str, str]] = "cpu" - """ - pt_load_map_location: the map location for loading pytorch checkpoint, to - support loading checkpoints can only be loaded on certain devices like - "cuda", this is equivalent to {"": "cuda"}. 
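The pt_load_map_location value documented above is ultimately handed to torch.load; note that the {"": "cuda"} shorthand is vLLM's convention, while the device-to-device dict is torch.load's native behavior. A self-contained illustration of the torch.load side (the checkpoint file is created on the spot):

import torch

# Write a throwaway checkpoint so the snippet is self-contained.
torch.save({"w": torch.ones(2)}, "ckpt.pt")

# String form: force every tensor onto one device.
state = torch.load("ckpt.pt", map_location="cpu")
print(state["w"].device)  # cpu

# Dict form: remap tensors by saved location, e.g. a checkpoint written
# from GPU 1 loading onto GPU 0 (keys/values are device strings):
#   torch.load("ckpt.pt", map_location={"cuda:1": "cuda:0"})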
Another supported format is - mapping from different devices like from GPU 1 to GPU 0: - {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings - in dictionary needs to be double quoted for json parsing. For more details, - see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - self.load_format = self.load_format.lower() - if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: - logger.info( - "Ignoring the following patterns when downloading weights: %s", - self.ignore_patterns) - else: - self.ignore_patterns = ["original/**/*"] - - Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] diff --git a/vllm/config/load.py b/vllm/config/load.py new file mode 100644 index 0000000000000..e4999e36b49bf --- /dev/null +++ b/vllm/config/load.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Optional, Union + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.model_executor.model_loader import LoadFormats + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +else: + LoadFormats = Any + TensorizerConfig = Any + +logger = init_logger(__name__) + + +@config +@dataclass +class LoadConfig: + """Configuration for loading the model weights.""" + + load_format: Union[str, LoadFormats] = "auto" + """The format of the model weights to load:\n + - "auto" will try to load the weights in the safetensors format and fall + back to the pytorch bin format if safetensors format is not available.\n + - "pt" will load the weights in the pytorch bin format.\n + - "safetensors" will load the weights in the safetensors format.\n + - "npcache" will load the weights in pytorch format and store a numpy cache + to speed up the loading.\n + - "dummy" will initialize the weights with random values, which is mainly + for profiling.\n + - "tensorizer" will use CoreWeave's tensorizer library for fast weight + loading. See the Tensorize vLLM Model script in the Examples section for + more information.\n + - "runai_streamer" will load the Safetensors weights using Run:ai Model + Streamer.\n + - "bitsandbytes" will load the weights using bitsandbytes quantization.\n + - "sharded_state" will load weights from pre-sharded checkpoint files, + supporting efficient loading of tensor-parallel models.\n + - "gguf" will load weights from GGUF format files (details specified in + https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n + - "mistral" will load weights from consolidated safetensors files used by + Mistral models. 
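For reference, picking one of the formats listed above is just a constructor argument, and __post_init__ (shown further down) normalizes it. A small usage sketch, assuming this patch's new vllm.config.load module is importable:

from vllm.config.load import LoadConfig

cfg = LoadConfig(load_format="SAFETENSORS")
assert cfg.load_format == "safetensors"          # lower-cased in __post_init__
assert cfg.ignore_patterns == ["original/**/*"]  # default skip pattern

# Streaming safetensors via Run:ai Model Streamer instead:
cfg = LoadConfig(load_format="runai_streamer", download_dir="/tmp/weights")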
+ - Other custom values can be supported via plugins.""" + download_dir: Optional[str] = None + """Directory to download and load the weights, default to the default + cache directory of Hugging Face.""" + model_loader_extra_config: Union[dict, TensorizerConfig] = field( + default_factory=dict) + """Extra config for model loader. This will be passed to the model loader + corresponding to the chosen load_format.""" + device: Optional[str] = None + """Device to which model weights will be loaded, default to + device_config.device""" + ignore_patterns: Optional[Union[list[str], str]] = None + """The list of patterns to ignore when loading the model. Default to + "original/**/*" to avoid repeated loading of llama's checkpoints.""" + use_tqdm_on_load: bool = True + """Whether to enable tqdm for showing progress bar when loading model + weights.""" + pt_load_map_location: Union[str, dict[str, str]] = "cpu" + """ + pt_load_map_location: the map location for loading pytorch checkpoint, to + support loading checkpoints can only be loaded on certain devices like + "cuda", this is equivalent to {"": "cuda"}. Another supported format is + mapping from different devices like from GPU 1 to GPU 0: + {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings + in dictionary needs to be double quoted for json parsing. For more details, + see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self): + self.load_format = self.load_format.lower() + if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: + logger.info( + "Ignoring the following patterns when downloading weights: %s", + self.ignore_patterns) + else: + self.ignore_patterns = ["original/**/*"] diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 2dada794a8f3e..138a2ff30b622 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -5,7 +5,8 @@ from typing import Literal, Optional from torch import nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig +from vllm.config.load import LoadConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.bitsandbytes_loader import ( @@ -67,7 +68,7 @@ def register_model_loader(load_format: str): load_format (str): The model loader format name. 
Examples: - >>> from vllm.config import LoadConfig + >>> from vllm.config.load import LoadConfig >>> from vllm.model_executor.model_loader import get_model_loader, register_model_loader >>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader >>> diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index 4cf6c7988960d..ab538a3c95620 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -5,7 +5,8 @@ from abc import ABC, abstractmethod import torch import torch.nn as nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig +from vllm.config.load import LoadConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.utils import ( initialize_model, process_weights_after_loading, set_default_torch_dtype) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index c8dd1ec0ec3c6..9c34159f9a269 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -16,7 +16,8 @@ from packaging import version from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) # yapf: enable diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 4badc31753445..f883e1e739102 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -11,7 +11,8 @@ import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index f4a7da5744e04..5b8c6268f64ef 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch.nn as nn -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( initialize_dummy_weights) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 9877cb3b7c06e..aaee8f3f76353 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -9,7 +9,8 @@ import torch.nn as nn from huggingface_hub import hf_hub_download from transformers import AutoModelForCausalLM -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig +from vllm.config.load import LoadConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader from 
vllm.model_executor.model_loader.utils import ( initialize_model, process_weights_after_loading, set_default_torch_dtype) diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 19e73efc91077..dc941401a04e0 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -9,7 +9,8 @@ import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 3edd4ec4007e8..a85ca065d1d27 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -10,7 +10,8 @@ from typing import Any, Optional import torch from torch import nn -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.weight_utils import ( diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index fa01758ab4cee..65ea49c642944 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -8,7 +8,8 @@ from typing import Union import torch from torch import nn -from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.config.load import LoadConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.tensorizer import ( diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a4eda36148d7a..0de8dbbca9c7f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -23,7 +23,8 @@ from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm from vllm import envs -from vllm.config import LoadConfig, ModelConfig +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig from vllm.distributed import get_tensor_model_parallel_rank from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (QuantizationConfig, From 4c04eef706fcd26c8e046cd797fa9b14a5fb361d Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Wed, 10 Sep 2025 06:14:27 -0700 Subject: [PATCH 115/584] [BugFix][Multi Modal] Fix TensorSchema shape mismatch in Molmo (#24559) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> --- vllm/model_executor/models/molmo.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b2fc7be1af224..5d999a02b4e65 100644 --- a/vllm/model_executor/models/molmo.py +++ 
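The Molmo hunk that follows marks nc (crops per image) as a dynamic dimension, so TensorSchema accepts per-image variation along that axis. A toy version of the validation rule being relaxed; this is shape-checking only, not the real TensorSchema internals:

import torch

def check_shapes(tensors: list[torch.Tensor],
                 dynamic_dims: set[int]) -> None:
    # Dims listed in dynamic_dims may vary per item; all others must agree.
    ref = tensors[0].shape
    for t in tensors[1:]:
        for dim, (a, b) in enumerate(zip(ref, t.shape)):
            if dim not in dynamic_dims and a != b:
                raise ValueError(f"dim {dim} mismatch: {a} vs {b}")

# Two images with different crop counts (dim 0 = nc) now pass:
images = [torch.zeros(5, 576, 588), torch.zeros(3, 576, 588)]
check_shapes(images, dynamic_dims={0})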
b/vllm/model_executor/models/molmo.py @@ -76,20 +76,22 @@ class MolmoImageInputs(TensorSchema): """ Dimensions: - bn: Batch size * number of images - - nc: Number of crops + - nc: Number of crops (dynamic) - np: Number of patches + - tp: Token sequence positions - pd: Patch dimension """ images: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "nc", "np", "pd")] + TensorShape("bn", "nc", "np", "pd", dynamic_dims={"nc"})] + # Number of crops may vary per batch and image, so pass it as a list. image_masks: Annotated[Optional[Union[torch.Tensor, list[torch.Tensor]]], - TensorShape("bn", "nc", "np")] + TensorShape("bn", "nc", "np", dynamic_dims={"nc"})] - feat_is_patch: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "nc", "np")] + feat_is_patch: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "nc", "tp", dynamic_dims={"nc"})] # A boolean mask indicating which image features correspond to patch tokens. - num_crops: Annotated[torch.Tensor, TensorShape("bn")] From 0b9a612fa327c4b0f4dedf6dc9b2f8c03eb23e36 Mon Sep 17 00:00:00 2001 From: lacora <hyelacora@gmail.com> Date: Wed, 10 Sep 2025 06:14:55 -0700 Subject: [PATCH 116/584] [BugFix][easy] Fix flaky test test_gpt_oss_multi_turn_chat (#24549) Signed-off-by: lacora2017 <yehu@meta.com> Co-authored-by: lacora2017 <yehu@meta.com> --- tests/entrypoints/openai/test_serving_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 04805dbca74fa..39a18b9daeca1 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -178,7 +178,7 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, }, { "role": "user", - "content": "What is the weather in Dallas, TX?" + "content": "What is the weather in Dallas, TX with celsius?" }, ] From f4f1a8df22424eac6228629e34b0e8f254175551 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 10 Sep 2025 06:15:14 -0700 Subject: [PATCH 117/584] [BugFix] Ensure integrity of reused CPU tensors during async scheduling (#24527) Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: guoze.lin <guozelin@tencent.com> --- vllm/v1/worker/gpu_model_runner.py | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 944793cad94f4..33f4d65a7a115 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -326,6 +326,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mrope_positions = self._make_buffer( (3, self.max_num_tokens + 1), dtype=torch.int64) + # CUDA event to synchronize use of reused CPU tensors between steps + # when async scheduling is enabled. + self.prepare_inputs_event: Optional[torch.cuda.Event] = None + if self.use_async_scheduling: + self.prepare_inputs_event = torch.cuda.Event() + # Start in a completed state. + self.prepare_inputs_event.record(torch.cuda.default_stream()) + # None in the first PP rank. The rest are set after load_model. self.intermediate_tensors: Optional[IntermediateTensors] = None @@ -354,11 +362,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Cudagraph dispatcher for runtime cudagraph dispatching. 
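The model-runner hunk just started (and continued below) guards reused pinned CPU tensors with a CUDA event: record once the buffers' transfer is in flight, synchronize before the next step rewrites them. A minimal sketch of that record/synchronize pattern; the buffer and loop are illustrative:

import torch

if torch.cuda.is_available():
    buf = torch.empty(1024, pin_memory=True)  # reused CPU staging buffer
    ev = torch.cuda.Event()
    ev.record(torch.cuda.default_stream())    # start in a completed state

    for step in range(3):
        ev.synchronize()   # wait until the prior step's copy has finished
        buf.fill_(step)    # only now is it safe to rewrite the CPU tensor
        gpu = buf.to("cuda", non_blocking=True)  # async H2D copy
        ev.record()        # mark the point this step's copy is enqueued
        del gpu            # result unused in this sketch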
self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) - self.mm_budget = (MultiModalBudget( + self.mm_budget = MultiModalBudget( self.model_config, self.scheduler_config, self.mm_registry, - ) if self.supports_mm_inputs else None) + ) if self.supports_mm_inputs else None self.reorder_batch_threshold: Optional[int] = None @@ -991,10 +999,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): builder, ) - attn_metadata_i = (builder.build( + attn_metadata_i = builder.build( common_prefix_len=common_prefix_len, common_attn_metadata=common_attn_metadata, - )) + ) for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -1866,10 +1874,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): "prompt tokens, tokens, please disable it when the requests" " need prompt logprobs") - # Prepare the decoder inputs. - (attn_metadata, logits_indices, spec_decode_metadata, - num_scheduled_tokens_np, spec_decode_common_attn_metadata, - max_query_len) = self._prepare_inputs(scheduler_output) + if self.prepare_inputs_event is not None: + # Ensure prior step has finished with reused CPU tensors. + self.prepare_inputs_event.synchronize() + try: + # Prepare the decoder inputs. + (attn_metadata, logits_indices, spec_decode_metadata, + num_scheduled_tokens_np, spec_decode_common_attn_metadata, + max_query_len) = self._prepare_inputs(scheduler_output) + + finally: + if self.prepare_inputs_event is not None: + self.prepare_inputs_event.record() ( num_scheduled_tokens, From 492196ed0e17c7129ccc3ac4c82eda80bccda82e Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" <yeq@meta.com> Date: Wed, 10 Sep 2025 06:16:07 -0700 Subject: [PATCH 118/584] [CI/Build] split true unit tests to Entrypoints Unit Tests (#24418) Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> --- .buildkite/test-pipeline.yaml | 21 ++++++++++++++----- .../test_api_server_process_manager.py | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8f2f6083b0305..0479c86f7a974 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -102,7 +102,18 @@ steps: commands: - pytest -v -s core -- label: Entrypoints Test (LLM) # 30min +- label: Entrypoints Unit Tests # 5min + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -119,7 +130,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -132,7 +143,7 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai 
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py - label: Distributed Tests (4 GPUs) # 35min @@ -823,7 +834,7 @@ steps: # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - pip install -e ./plugins/prithvi_io_processor_plugin - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y + - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test # other tests continue here: - pytest -v -s plugins_tests/test_scheduler_plugins.py @@ -871,7 +882,7 @@ steps: timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_gpus: 2 optional: true source_file_dependencies: - vllm/ diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e4af60a782651..a993e24ff838a 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -95,7 +95,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): assert not proc.is_alive() -@patch("vllm.entrypoints.cli.serve.run_api_server_worker", +@patch("vllm.entrypoints.cli.serve.run_api_server_worker_proc", mock_run_api_server_worker) def test_wait_for_completion_or_failure(api_server_args): """Test that wait_for_completion_or_failure works with failures.""" From d6069887c6868f5f9683b4eb3e85a123d5124e53 Mon Sep 17 00:00:00 2001 From: Lifans <lifans@meta.com> Date: Wed, 10 Sep 2025 06:16:21 -0700 Subject: [PATCH 119/584] [rocm] enable torchao quantization for rocm (#24400) Signed-off-by: Lifan Shen <lifans@meta.com> --- vllm/platforms/rocm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index cf7b87cf030a9..fd899dcc9a65c 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -171,7 +171,7 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf", - "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4" + "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao" ] @classmethod From bd98842c8a018d02d09f8c4249ea0547f505286d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Wed, 10 Sep 2025 21:16:39 +0800 Subject: [PATCH 120/584] [CI] Add PPL test for generation models (#24485) Signed-off-by: wang.yuqi <noooop@126.com> --- .buildkite/test-pipeline.yaml | 10 ++ .../language/generation_ppl_test/__init__.py | 0 .../language/generation_ppl_test/ppl_utils.py | 131 ++++++++++++++++++ .../generation_ppl_test/test_gemma.py | 18 +++ .../language/generation_ppl_test/test_gpt.py | 14 ++ .../language/generation_ppl_test/test_qwen.py | 21 +++ tests/models/language/pooling/embed_utils.py | 2 +- tests/models/language/pooling/mteb_utils.py | 11 +- tests/models/utils.py | 11 +- 9 files changed, 211 insertions(+), 7 
deletions(-) create mode 100644 tests/models/language/generation_ppl_test/__init__.py create mode 100644 tests/models/language/generation_ppl_test/ppl_utils.py create mode 100644 tests/models/language/generation_ppl_test/test_gemma.py create mode 100644 tests/models/language/generation_ppl_test/test_gpt.py create mode 100644 tests/models/language/generation_ppl_test/test_qwen.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0479c86f7a974..75a9c4a22cb4d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -604,6 +604,16 @@ steps: - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + - label: Language Models Test (Extended Pooling) # 36min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] diff --git a/tests/models/language/generation_ppl_test/__init__.py b/tests/models/language/generation_ppl_test/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py new file mode 100644 index 0000000000000..550e874cf8579 --- /dev/null +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/docs/transformers/perplexity +from typing import Optional, cast + +import pytest +import torch +from datasets import load_dataset + +from tests.models.utils import (GenerateModelInfo, + TokensTextLogprobsPromptLogprobs) +from vllm.logprobs import Logprob + +# See #24485 +PPL_TOL = 0.01 +MAX_LENGTH = 1024 + + +@torch.inference_mode +def wikitext_ppl_test(hf_runner, + vllm_runner, + model_info: GenerateModelInfo, + max_length=MAX_LENGTH, + vllm_extra_kwargs=None, + atol=PPL_TOL): + + # A model family has many models with the same architecture, + # and we don't need to test each one. + if not model_info.enable_test: + pytest.skip("Skipping test.") + + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + # Allow vllm to test using the given dtype, such as float32 + vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype + + # Allow vllm to test using hf_overrides + if model_info.hf_overrides is not None: + vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + + with vllm_runner(model_info.name, + gpu_memory_utilization=0.7, + max_model_len=max_length, + max_num_seqs=1, + enforce_eager=True, + **vllm_extra_kwargs) as vllm_model: + # Use max_num_seqs=1 to avoid OOM, + # and batch different requests together. 
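The rest of wikitext_ppl_test below reduces everything to a single number using the standard sliding-window perplexity recipe from the Transformers docs: sum the negative token log-probs over all chunks, divide by the token count, exponentiate. The core arithmetic in isolation, with made-up per-token log-probs:

import torch

# Per-token log-probs for each chunk, as the runner would return them.
chunk_logprobs = [
    [-2.31, -0.05, -1.10],
    [-0.40, -3.22],
]

nll_sum = torch.tensor(0.0)
n_tokens = 0
for lps in chunk_logprobs:
    # NLL of a chunk is the negated sum of its token log-probs.
    nll_sum += -torch.tensor(lps).sum()
    n_tokens += len(lps)

# Perplexity = exp(mean NLL across all scored tokens).
ppl = float(torch.exp(nll_sum / n_tokens))
print(f"ppl = {ppl:.3f}")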
+ + model_config = vllm_model.llm.llm_engine.model_config + + # Confirm whether vllm is using the correct architecture + if model_info.architecture: + assert (model_info.architecture in model_config.architectures) + + max_length = min(model_config.max_model_len - 1, max_length) + stride = max_length + + tokenizer = vllm_model.llm.get_tokenizer() + tokens = tokenizer.encode("\n\n".join(dataset["text"])) + n_tokens = len(tokens) + + chunks = [] + for begin_loc in range(0, n_tokens, stride): + end_loc = min(begin_loc + max_length, n_tokens) + chunks.append(tokens[begin_loc:end_loc]) + + outputs = vllm_model.generate_greedy_logprobs(prompts=chunks, + max_tokens=1, + num_logprobs=None, + num_prompt_logprobs=0, + use_tqdm=False) + nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu") + n_tokens = 0 + for output in outputs: + output = cast(TokensTextLogprobsPromptLogprobs, output) + token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + + assert token_datas[0] is None + token_log_probs = [] + for token_data in token_datas[1:]: + assert token_data is not None + assert len(token_data) == 1 + token_log_prob = list(token_data.values())[0].logprob + token_log_probs.append(token_log_prob) + + neg_log_likelihood = -torch.tensor( + token_log_probs, dtype=torch.float32, device="cpu").sum() + nll_sum += neg_log_likelihood + n_tokens += len(token_log_probs) + vllm_ppl = float(torch.exp(nll_sum / n_tokens)) + vllm_dtype = model_config.dtype + + # Accelerate ppl test by setting Transformers ppl score to a constant + if model_info.hf_ppl is None: + with hf_runner( + model_info.name, + dtype=model_info.hf_dtype, + ) as hf_model: + nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu") + n_tokens = 0 + for chunk in chunks: + inputs = hf_model.wrap_device( + {"input_ids": torch.tensor([chunk])}) + input_ids = inputs["input_ids"] + outputs = hf_model.model(input_ids, labels=input_ids) + neg_log_likelihood = outputs.loss + + neg_log_likelihood = neg_log_likelihood.to(torch.float32).cpu() + + num_loss_tokens = len(chunk) - 1 + nll_sum += neg_log_likelihood * num_loss_tokens + n_tokens += num_loss_tokens + + hf_ppl = float(torch.exp(nll_sum / n_tokens)) + hf_dtype = next(hf_model.model.parameters()).dtype + else: + hf_ppl = model_info.hf_ppl + hf_dtype = "Constant" + + differ = (vllm_ppl - hf_ppl) / hf_ppl + print("Model:", model_info.name) + print("VLLM:", vllm_dtype, vllm_ppl) + print("Transformers:", hf_dtype, hf_ppl) + print("Difference (%):", differ * 100) + + # PPL the smaller, the better + # We are not concerned that the vllm PPL is less than Transformers, + # so we only perform one-sided testing. 
+ assert differ < atol diff --git a/tests/models/language/generation_ppl_test/test_gemma.py b/tests/models/language/generation_ppl_test/test_gemma.py new file mode 100644 index 0000000000000..5324de143d674 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_gemma.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import wikitext_ppl_test + +MODELS = [ + GenerateModelInfo("google/gemma-2b"), + GenerateModelInfo("google/gemma-2-2b"), + GenerateModelInfo("google/gemma-3-4b-it"), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/generation_ppl_test/test_gpt.py b/tests/models/language/generation_ppl_test/test_gpt.py new file mode 100644 index 0000000000000..f3f9e55a24234 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_gpt.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import wikitext_ppl_test + +MODELS = [GenerateModelInfo("openai-community/gpt2-large")] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/generation_ppl_test/test_qwen.py b/tests/models/language/generation_ppl_test/test_qwen.py new file mode 100644 index 0000000000000..0d3127cbaac47 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_qwen.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import wikitext_ppl_test + +MODELS = [ + GenerateModelInfo("Qwen/Qwen3-0.6B"), + GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"), + # transformers: + # Loading a GPTQ quantized model requires optimum, gptqmodel + # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 8f8393c4e16fc..86751e0a4d5f4 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -59,7 +59,7 @@ def correctness_test_embed_models(hf_runner, with hf_runner( model_info.name, - dtype="float32", + dtype=model_info.hf_dtype, is_sentence_transformer=True, ) as hf_model: diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 7336c30bdda33..56a105e96e5ee 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -213,7 +213,7 @@ def mteb_test_embed_models(hf_runner, if model_info.mteb_score is None: with hf_runner(model_info.name, is_sentence_transformer=True, - dtype="float32") as hf_model: + dtype=model_info.hf_dtype) as hf_model: # e.g. 
setting default parameters for the encode method of hf_runner if hf_model_callback is not None: @@ -278,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages): return main_score -def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None): +def mteb_test_rerank_models_hf(hf_runner, + model_name, + hf_dtype="float32", + hf_model_callback=None): with hf_runner(model_name, is_cross_encoder=True, - dtype="float32") as hf_model: + dtype=hf_dtype) as hf_model: original_predict = hf_model.predict @@ -357,7 +360,7 @@ def mteb_test_rerank_models(hf_runner, # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, hf_model_callback) + hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback) else: st_main_score = model_info.mteb_score st_dtype = "Constant" diff --git a/tests/models/utils.py b/tests/models/utils.py index ab0b27af4d697..44e9bf539bc17 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -347,14 +347,15 @@ class ModelInfo: name: str architecture: str = "" dtype: str = "auto" + hf_dtype: str = "float32" hf_overrides: Optional[dict[str, Any]] = None default_pooling_type: str = "" - mteb_score: Optional[float] = None enable_test: bool = True @dataclass class EmbedModelInfo(ModelInfo): + mteb_score: Optional[float] = None is_matryoshka: bool = False matryoshka_dimensions: Optional[list[int]] = None @@ -371,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo): @dataclass class RerankModelInfo(ModelInfo): - pass + mteb_score: Optional[float] = None @dataclass @@ -384,6 +385,12 @@ class LASTPoolingRerankModelInfo(RerankModelInfo): default_pooling_type: str = "LAST" +@dataclass +class GenerateModelInfo(ModelInfo): + hf_dtype: str = "auto" + hf_ppl: Optional[float] = None + + def dummy_hf_overrides( hf_config: PretrainedConfig, *, From 2f5e5c18de02429263f39fb049d87c7aa61a2b71 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:20:59 +0200 Subject: [PATCH 121/584] [CI/Build] bump timm dependency (#24189) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b78d7d88f1f83..307e9658f7175 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -519,7 +519,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3] ENV VLLM_USAGE_SOURCE production-docker-image From 3144d90217081aa7b4d3367f0614eb13c35ad4ae Mon Sep 17 00:00:00 2001 From: co63oc <co63oc@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:21:23 +0800 Subject: [PATCH 122/584] fix some typos (#24167) Signed-off-by: co63oc <co63oc@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- tests/v1/e2e/test_spec_decode.py | 2 +- vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py | 2 +- vllm/v1/sample/logits_processor/__init__.py | 2 +- 3 files changed, 
3 insertions(+), 3 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 469464f777fb2..0b240b7d434e5 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -162,7 +162,7 @@ def test_eagle_correctness( # TODO: Fix this flaky test pytest.skip( "TREE_ATTN is flaky in the test disable for now until it can be " - "reolved (see https://github.com/vllm-project/vllm/issues/22922)") + "resolved (see https://github.com/vllm-project/vllm/issues/22922)") # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 2055393d7ec71..63af3ff7f4d1f 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -89,7 +89,7 @@ class Internlm2ToolParser(ToolParser): try: parsable_arr = action - # tool calls are generated in an object in inernlm2 + # tool calls are generated in an object in internlm2 # it's not support parallel tool calls try: tool_call_arr: dict = partial_json_parser.loads( diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index a5f1cadd85241..df944873bcaf3 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -195,7 +195,7 @@ class AdapterLogitsProcessor(LogitsProcessor): overridden in general. However, to implement custom constructor behavior - especially any logic which operates on or stores `vllm_config`, `device`, or `is_pin_memory` - `self.__init__(vllm_config, device, is_pin_memory)` - must be overriden and the override must call + must be overridden and the override must call `super().__init__(vllm_config, device, is_pin_memory)` """ From c0bd6a684aee2b47fe52b75ec97379e40ca5d36c Mon Sep 17 00:00:00 2001 From: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com> Date: Wed, 10 Sep 2025 23:22:31 +1000 Subject: [PATCH 123/584] Fix Auto_Round Quatization Loading on SM75 and Lower GPUs (#24217) Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index fb285413ba9ef..1ca92273430dd 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig): } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) if isinstance(layer, (LinearBase, ParallelLMHead)): if use_marlin: From ccee371e86f18643f251450397ca7e46a0356b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Wed, 10 Sep 2025 22:23:28 +0900 Subject: [PATCH 124/584] [Docs] Fix warnings in `mkdocs build` (continued) 
(#24092) Signed-off-by: Zerohertz <ohg3417@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 5 +- .../layers/fused_moe/routing_simulator.py | 8 +- .../layers/quantization/bitblas.py | 10 +- .../layers/quantization/gptq_bitblas.py | 6 +- .../kernels/mixed_precision/__init__.py | 8 +- vllm/model_executor/models/phi4mm_audio.py | 267 +++++++------- vllm/model_executor/models/phi4mm_utils.py | 337 +++++++++--------- vllm/model_executor/models/qwen2_5_vl.py | 16 +- vllm/model_executor/models/zamba2.py | 20 +- 10 files changed, 337 insertions(+), 342 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 551f284a36095..f80ba3a7aa23b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -755,7 +755,7 @@ class FusedMoE(CustomOp): intermediate_size: Intermediate size of the experts params_dtype: Data type for the parameters. reduce_results: Whether to all all_reduce on the output of the layer - renomalize: Whether to renormalize the logits in the fused_moe kernel + renormalize: Whether to renormalize the logits in the fused_moe kernel quant_config: Quantization configure. enable_eplb: Whether to enable expert parallelism load balancer. """ diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index f14f13e2ade9d..13c3ab4f06dd1 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -420,9 +420,8 @@ def shuffle_weights( Args: *tensors: Variable number of torch.Tensor objects. - layout: A pair of integers specifying the - block sizes used to divide the tensors during shuffling. - Default is (16, 16). + layout: A pair of integers specifying the block sizes used to divide + the tensors during shuffling. Default is (16, 16). Returns: A Tuple of shuffled tensors. diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py index c8b107f13cd0d..8758a570b3c63 100644 --- a/vllm/model_executor/layers/fused_moe/routing_simulator.py +++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py @@ -10,7 +10,7 @@ like uniform random routing. """ from abc import ABC, abstractmethod -from typing import Optional +from typing import Any, Optional import torch @@ -50,7 +50,9 @@ class DistributionBasedRouting(RoutingStrategy): distributions for testing different routing patterns. """ - def __init__(self, distribution: str = "uniform", **distribution_params): + def __init__(self, + distribution: str = "uniform", + **distribution_params: Any): """ Initialize distribution-based routing. @@ -244,7 +246,7 @@ class RoutingSimulator: cls._routing_strategies[name] = strategy @classmethod - def get_available_strategies(cls): + def get_available_strategies(cls) -> list[str]: """ Get list of available routing strategy names. 
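For reference, the routing-simulator hunks above document a simple name-to-strategy registry: concrete strategies implement an abstract interface, are stored in a class-level dict by a register call, and are listed through get_available_strategies(). The following minimal, self-contained sketch shows that pattern; the names StrategyRegistry, UniformRandomRouting, and route() are illustrative assumptions of this example, not vLLM's actual implementation.

from abc import ABC, abstractmethod

import torch


class RoutingStrategy(ABC):
    """Interface for expert-routing strategies used in simulation."""

    @abstractmethod
    def route(self, logits: torch.Tensor, top_k: int) -> torch.Tensor:
        """Return indices of the selected experts per token."""


class UniformRandomRouting(RoutingStrategy):
    """Ignores logits and picks experts uniformly at random."""

    def route(self, logits: torch.Tensor, top_k: int) -> torch.Tensor:
        num_tokens, num_experts = logits.shape
        return torch.randint(num_experts, (num_tokens, top_k))


class StrategyRegistry:
    """Name-to-strategy registry mirroring the pattern in the hunk above."""

    _routing_strategies: dict[str, RoutingStrategy] = {}

    @classmethod
    def register_strategy(cls, name: str, strategy: RoutingStrategy) -> None:
        cls._routing_strategies[name] = strategy

    @classmethod
    def get_available_strategies(cls) -> list[str]:
        return list(cls._routing_strategies)


# Usage: register once, then enumerate by name for tests/simulation.
StrategyRegistry.register_strategy("uniform_random", UniformRandomRouting())
assert StrategyRegistry.get_available_strategies() == ["uniform_random"]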
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 39bd34d351f61..d05c0c0d5473c 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -202,7 +202,7 @@ class BitBLASLinearMethod(LinearMethodBase): output_size: int, params_dtype: torch.dtype, **extra_weight_attrs, - ): + ) -> None: """Creates quantized weights for use in linear operations. The function initializes and returns a dictionary containing quantized @@ -211,7 +211,7 @@ class BitBLASLinearMethod(LinearMethodBase): Args: input_size_per_partition: The size of the input partition. - output_size_per_partition: The size of the output partition. + output_partition_sizes: List of output partition sizes. input_size: The total size of the input (unused). output_size: The total size of the output (unused). params_dtype: @@ -222,9 +222,9 @@ class BitBLASLinearMethod(LinearMethodBase): scales ('scales'), and zeros ('zeros'). Raises: - ValueError: If `params_dtype` is not `torch.float16` or if the - input size per partition is not divisible by the group size in - `quant_config`. + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. """ del input_size, output_size # Unused arguments. weight_loader = extra_weight_attrs["weight_loader"] diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index d03074f861848..6462292586482 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -265,9 +265,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase): scales ('scales'), and zeros ('zeros'). Raises: - ValueError: If `params_dtype` is not `torch.float16` or - if the input size per partition is not divisible by the - group size in `quant_config`. + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. """ if params_dtype != torch.float16: raise ValueError("Parameter data type must be torch.float16, " diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 4bcfcd04b3d8b..f10d20999bee3 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -46,11 +46,11 @@ def choose_mp_linear_kernel( performance. Args: - config (MPLinearLayerConfig): Description of the linear layer to be - implemented. + config (MPLinearLayerConfig): Description of the linear layer to be + implemented. compute_capability (Optional[int], optional): The compute capability of - the target device, if None uses `current_platform` to get the compute - capability. Defaults to None. + the target device, if None uses `current_platform` to get + the compute capability. Defaults to None. Raises: ValueError: If no kernel can implement the given config. 
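The corrected BitBLASLinearMethod and GPTQBitBLASLinearMethod docstrings above both describe the same contract: weight creation fails fast with ValueError when params_dtype is not torch.float16 or when the input size per partition is not divisible by the quantization group size. A hedged sketch of that validation follows; the helper name validate_quantized_weight_shapes and the group_size == -1 (channelwise) exemption are assumptions of this example, since the real create_weights() methods perform these checks inline.

import torch


def validate_quantized_weight_shapes(
    input_size_per_partition: int,
    params_dtype: torch.dtype,
    group_size: int,
) -> None:
    """Sketch of the checks the corrected docstrings describe (assumed
    helper; not vLLM's actual code)."""
    if params_dtype != torch.float16:
        raise ValueError("Parameter data type must be torch.float16, "
                         f"got {params_dtype}")
    # Assumption: group_size == -1 denotes channelwise quantization, which
    # has no per-group divisibility requirement.
    if group_size != -1 and input_size_per_partition % group_size != 0:
        raise ValueError(
            f"Input size per partition ({input_size_per_partition}) must "
            f"be divisible by the group size ({group_size})")


validate_quantized_weight_shapes(4096, torch.float16, group_size=128)  # ok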
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index b5e4d727bf210..a2481375f4fb8 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -7,7 +7,7 @@ #!/usr/bin/env python3 import abc import math -from typing import Literal, Optional +from typing import Any, Literal, Optional, Union import numpy as np import torch @@ -131,31 +131,31 @@ class ConformerEncoderLayer(nn.Module): def __init__( self, - d_model=512, - ext_pw_out_channel=0, - depthwise_seperable_out_channel=256, - depthwise_multiplier=1, - n_head=4, - d_ffn=2048, - ext_pw_kernel_size=1, - kernel_size=3, - dropout_rate=0.1, - causal=False, - batch_norm=False, - activation="relu", - chunk_se=0, - chunk_size=18, - conv_activation="relu", - conv_glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - attention_inner_dim=-1, - attention_glu_type="swish", - activation_checkpointing="", - export=False, - use_pt_scaled_dot_product_attention=False, + d_model: int = 512, + ext_pw_out_channel: int = 0, + depthwise_seperable_out_channel: int = 256, + depthwise_multiplier: int = 1, + n_head: int = 4, + d_ffn: int = 2048, + ext_pw_kernel_size: int = 1, + kernel_size: int = 3, + dropout_rate: float = 0.1, + causal: bool = False, + batch_norm: bool = False, + activation: str = "relu", + chunk_se: int = 0, + chunk_size: int = 18, + conv_activation: str = "relu", + conv_glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + attention_inner_dim: int = -1, + attention_glu_type: str = "swish", + activation_checkpointing: str = "", + export: bool = False, + use_pt_scaled_dot_product_attention: bool = False, attn_group_sizes: int = 1, - ): + ) -> None: super().__init__() self.feed_forward_in = FeedForward( @@ -209,24 +209,21 @@ class ConformerEncoderLayer(nn.Module): def forward( self, - x, - pos_k, - pos_v, - mask, + x: torch.Tensor, + pos_k: torch.Tensor, + pos_v: torch.Tensor, + mask: torch.Tensor, relative_attention_bias: Optional[Tensor] = None, - ): + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ConformerEncoder forward. Args: - x: torch.Tensor - input feature of shape (batch, max_time_in, size) - pos_k: torch.Tensor - positional key embedding. - mask: torch.Tensor - mask for x (batch, max_time_in) - relative_attention_bias: Optional[torch.Tensor] - bias added to attention logits w.r.t. relative positions - (1, n_head, time1, time2) + x: input feature of shape (batch, max_time_in, size) + pos_k: positional key embedding. + pos_v: positional value embedding. + mask: mask for x (batch, max_time_in) + relative_attention_bias: bias added to attention logits w.r.t. 
+ relative positions (1, n_head, time1, time2) """ x = x + 0.5 * self.feed_forward_in(x) norm_x = self.layer_norm_att(x) @@ -323,25 +320,25 @@ class TransformerEncoderBase(abc.ABC, nn.Module): def __init__( self, - input_size, - chunk_size, - left_chunk, - attention_dim=256, - attention_heads=4, - input_layer="nemo_conv", - cnn_out=-1, - cnn_layer_norm=False, - time_reduction=4, - dropout_rate=0.0, - padding_idx=-1, - relative_attention_bias_args=None, - positional_dropout_rate=0.0, - nemo_conv_settings=None, + input_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]], + attention_dim: int = 256, + attention_heads: int = 4, + input_layer: str = "nemo_conv", + cnn_out: int = -1, + cnn_layer_norm: bool = False, + time_reduction: int = 4, + dropout_rate: float = 0.0, + padding_idx: int = -1, + relative_attention_bias_args: Optional[dict[str, Any]] = None, + positional_dropout_rate: float = 0.0, + nemo_conv_settings: Optional[dict[str, Any]] = None, conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none", - attention_group_size=1, - encoder_embedding_config=None, - ): + attention_group_size: int = 1, + encoder_embedding_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__() self.input_size = input_size self.input_layer = input_layer @@ -399,7 +396,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module): self.encoder_embedding = MeanVarianceNormLayer( self.encoder_embedding_config["input_size"]) - def compute_lens_change(self, feature_lens): + def compute_lens_change( + self, + feature_lens: Union[int, + torch.Tensor]) -> Union[int, torch.Tensor]: """feature_lens: int return updated feature lens. @@ -433,10 +433,14 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return ceil_func(feature_lens / self.time_reduction) @abc.abstractmethod - def forward(self): + def forward(self) -> Any: """Abstract forward method implementation.""" - def _chunk_size_selection(self, chunk_size=None, left_chunk=None): + def _chunk_size_selection( + self, + chunk_size: Optional[Union[int, list[int]]] = None, + left_chunk: Optional[Union[int, + list[int]]] = None) -> tuple[int, int]: """If chunk size is a list, we will randomly select a chunk size.""" if chunk_size is None: @@ -463,7 +467,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return chunk_size_train_eff, left_chunk_train_eff - def _get_embed_class(self, embed): + def _get_embed_class(self, embed: nn.Module) -> nn.Module: # pylint: disable=protected-access is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper) is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel) @@ -474,13 +478,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module): embed_class = embed.module return embed_class - def _forward_embeddings_core(self, input_tensor, masks): + def _forward_embeddings_core( + self, input_tensor: torch.Tensor, + masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: embed_class = self._get_embed_class(self.embed) assert isinstance(embed_class, NemoConvSubsampling) input_tensor, masks = self.embed(input_tensor, masks) return input_tensor, masks - def _position_embedding(self, input_tensor): + def _position_embedding( + self, input_tensor: torch.Tensor + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: pos_k = None pos_v = None if self.relative_attention_bias_layer is None: @@ -488,7 +496,9 @@ class TransformerEncoderBase(abc.ABC, nn.Module): input_tensor) # default to add abs sinusoid embedding return pos_k, pos_v - def _streaming_mask(self, seq_len, 
batch_size, chunk_size, left_chunk): + def _streaming_mask(self, seq_len: int, batch_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]]) -> torch.Tensor: chunk_size_train_eff, left_chunk_train_eff = \ self._chunk_size_selection(chunk_size, left_chunk) @@ -502,11 +512,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module): [batch_size, -1, -1])) return enc_streaming_mask - def forward_embeddings(self, - xs_pad, - masks, - chunk_size_nc=None, - left_chunk_nc=None): + def forward_embeddings( + self, + xs_pad: torch.Tensor, + masks: torch.Tensor, + chunk_size_nc: Optional[Union[int, list[int]]] = None, + left_chunk_nc: Optional[Union[int, list[int]]] = None + ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, torch.Tensor], + tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, torch.Tensor, + torch.Tensor]]: """Forwarding the inputs through the top embedding layers Args: @@ -569,7 +585,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return input_tensor, pos_k, pos_v, hs_mask, masks return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc - def get_offset(self): + def get_offset(self) -> int: """Returns offset used when retaining inputs for decoding. This is essentially, how many additional frames have to be added to @@ -605,8 +621,6 @@ class ConformerEncoder(TransformerEncoderBase): Some examples for the 2 cases: left_chunk = 6 left_chunk = [12, 9, 6, 3] - left_chunk: int - number of chunks used for masking in streaming mode. num_lang: int This parameter is used to store the number of languages in the lang_dict, only used for multiseed/multilingual models. @@ -751,46 +765,46 @@ class ConformerEncoder(TransformerEncoderBase): def __init__( # pylint: disable-all self, - input_size, - chunk_size, - left_chunk, - num_lang=None, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - input_layer="nemo_conv", - causal=True, - batch_norm=False, - cnn_out=-1, - cnn_layer_norm=False, - ext_pw_out_channel=0, - ext_pw_kernel_size=1, - depthwise_seperable_out_channel=256, - depthwise_multiplier=1, - chunk_se=0, - kernel_size=3, - activation="relu", - conv_activation="relu", - conv_glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - attention_glu_type="swish", - export=False, - extra_layer_output_idx=-1, - extra_multi_layer_output_idxs=[], # noqa - activation_checkpointing="", - relative_attention_bias_args=None, - time_reduction=4, - use_pt_scaled_dot_product_attention=False, - nemo_conv_settings=None, + input_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]], + num_lang: Optional[int] = None, + attention_dim: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + input_layer: str = "nemo_conv", + causal: bool = True, + batch_norm: bool = False, + cnn_out: int = -1, + cnn_layer_norm: bool = False, + ext_pw_out_channel: int = 0, + ext_pw_kernel_size: int = 1, + depthwise_seperable_out_channel: int = 256, + depthwise_multiplier: int = 1, + chunk_se: int = 0, + kernel_size: int = 3, + activation: str = "relu", + conv_activation: str = "relu", + conv_glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + attention_glu_type: str = "swish", + export: bool = False, + extra_layer_output_idx: int = -1, + extra_multi_layer_output_idxs: list[int] = [], # noqa + activation_checkpointing: str = "", + 
relative_attention_bias_args: Optional[dict[str, Any]] = None, + time_reduction: int = 4, + use_pt_scaled_dot_product_attention: bool = False, + nemo_conv_settings: Optional[dict[str, Any]] = None, conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none", - replication_pad_for_subsample_embedding=False, - attention_group_size=1, - encoder_embedding_config=None, - ): + replication_pad_for_subsample_embedding: bool = False, + attention_group_size: int = 1, + encoder_embedding_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__( input_size, chunk_size, @@ -852,11 +866,13 @@ class ConformerEncoder(TransformerEncoderBase): # the device and the needed dtype: self.register_buffer("dev_type", torch.zeros(()), persistent=False) - def init_relative_attention_bias(self, input_tensor): + def init_relative_attention_bias( + self, input_tensor: torch.Tensor) -> Optional[torch.Tensor]: if self.relative_attention_bias_layer: return self.relative_attention_bias_layer(input_tensor) - def calculate_hs_mask(self, xs_pad, device, mask): + def calculate_hs_mask(self, xs_pad: torch.Tensor, device: torch.device, + mask: Optional[torch.Tensor]) -> torch.Tensor: max_audio_length = xs_pad.shape[1] batch_size = xs_pad.shape[0] enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size, @@ -877,7 +893,8 @@ class ConformerEncoder(TransformerEncoderBase): return pad_mask @torch.jit.ignore - def forward(self, xs_pad, masks): + def forward(self, xs_pad: torch.Tensor, + masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Conformer Forward function Args: @@ -997,7 +1014,12 @@ class WindowQformer(nn.Module): if normalize_before else None) self.window_size = window_size - def forward(self, audio_embed, mask, embed_len=None): + def forward( + self, + audio_embed: torch.Tensor, + mask: Optional[torch.Tensor], + embed_len: Optional[int] = None + ) -> tuple[torch.Tensor, Optional[int]]: """forward decoder""" # audio_embed: N x T x D => N x D x T @@ -1042,7 +1064,7 @@ class WindowQformer(nn.Module): class AudioEmbedding(nn.Module): """Image embedding.""" - def __init__(self, config: PretrainedConfig, **kwargs) -> None: + def __init__(self, config: PretrainedConfig, **kwargs: Any) -> None: super().__init__() self.config = config # n_embed or hidden_size for text LM @@ -1148,19 +1170,18 @@ class AudioEmbedding(nn.Module): self.input_embeds = None self.audio_embed_sizes = None - def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None: + def set_audio_embeds(self, input_embeds: torch.Tensor) -> None: self.input_embeds = input_embeds - def set_audio_embed_sizes(self, - audio_embed_sizes: torch.LongTensor) -> None: + def set_audio_embed_sizes(self, audio_embed_sizes: torch.Tensor) -> None: self.audio_embed_sizes = audio_embed_sizes def get_audio_features( self, - input_embeds: torch.FloatTensor, - audio_attention_mask: torch.Tensor = None, + input_embeds: torch.Tensor, + audio_attention_mask: Optional[torch.Tensor] = None, audio_projection_mode: str = "speech", - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ arguments: input_embeds: audio features (B, T, D) B: num audios in a sequence @@ -1214,10 +1235,10 @@ class AudioEmbedding(nn.Module): def forward( self, - audio_features: torch.FloatTensor, - audio_attention_mask: torch.Tensor = None, + audio_features: torch.Tensor, + audio_attention_mask: Optional[torch.Tensor] = None, audio_projection_mode: str = "speech", - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ arguments: audio_features: audio features (T, D) 
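Several of the signatures annotated above accept chunk_size and left_chunk as Union[int, list[int]]: when a list is supplied, _chunk_size_selection picks one entry at random (a training-time augmentation noted in its docstring), and the chosen size drives a chunk-wise streaming attention mask. The simplified sketch below illustrates that behavior; the function names and the uniform chunk layout are assumptions, as the real encoder builds its mask with adaptive_enc_mask over explicit chunk start indices.

import random
from typing import Union

import torch


def select_chunk_size(chunk_size: Union[int, list[int]]) -> int:
    """If a list of chunk sizes is given, pick one at random; otherwise
    use the integer directly."""
    if isinstance(chunk_size, list):
        return random.choice(chunk_size)
    return chunk_size


def chunked_streaming_mask(seq_len: int, chunk_size: Union[int, list[int]],
                           left_chunks: int) -> torch.Tensor:
    """Sketch of a chunk-wise streaming mask: frame i may attend to frames
    in its own chunk and up to `left_chunks` chunks to its left."""
    size = select_chunk_size(chunk_size)
    chunk_idx = torch.arange(seq_len) // size  # chunk id of each frame
    q = chunk_idx.unsqueeze(1)                 # (seq_len, 1) query chunks
    k = chunk_idx.unsqueeze(0)                 # (1, seq_len) key chunks
    return (k <= q) & (k >= q - left_chunks)   # bool (seq_len, seq_len)


mask = chunked_streaming_mask(seq_len=8, chunk_size=[2, 4], left_chunks=1)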
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 59535503822d4..6fbfca619a42f 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -16,13 +16,13 @@ from torch import Tensor, nn class BlockBase(nn.Module): """Block abstract module""" - def __init__(self, input_size, output_size): + def __init__(self, input_size: int, output_size: int) -> None: super().__init__() self.input_size = input_size self.output_size = output_size -def get_activation(name="relu"): +def get_activation(name: str = "relu") -> torch.nn.Module: """Select an activation function by name Args: @@ -43,15 +43,18 @@ def get_activation(name="relu"): return nn.Identity() -def adaptive_enc_mask(x_len, chunk_start_idx, left_window=0, right_window=0): +def adaptive_enc_mask(x_len: int, + chunk_start_idx: list[int], + left_window: int = 0, + right_window: int = 0) -> torch.Tensor: """ The function is very important for Transformer Transducer Streaming mode Args: - xs_len (int): sequence length - chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48]. + x_len: sequence length + chunk_start_idx: first idx of each chunk, such as [0,18,36,48]. It also supports adaptive chunk size [0,10,15,45] - left_window (int): how many left chunks can be seen - right_window (int): how many right chunks can be seen. It is used for + left_window: how many left chunks can be seen + right_window: how many right chunks can be seen. It is used for chunk overlap model. Returns: mask (torch.Tensor): a mask tensor for streaming model @@ -172,13 +175,13 @@ class GLUPointWiseConv(nn.Module): def __init__( self, - input_dim, - output_dim, - kernel_size, - glu_type="sigmoid", - bias_in_glu=True, - causal=False, - ): + input_dim: int, + output_dim: int, + kernel_size: int, + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + causal: bool = False, + ) -> None: super().__init__() self.glu_type = glu_type @@ -216,11 +219,10 @@ class GLUPointWiseConv(nn.Module): self.b1 = nn.Parameter(torch.zeros(1, output_dim, 1)) self.b2 = nn.Parameter(torch.zeros(1, output_dim, 1)) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ Args: - x: torch.Tensor - input tensor + x: input tensor """ # to be consistent with GLULinear, we assume the input always has the # #channel (#dim) in the last dimension of the tensor, so need to @@ -272,12 +274,12 @@ class DepthWiseSeperableConv1d(nn.Module): def __init__( self, - input_dim, - depthwise_seperable_out_channel, - kernel_size, - depthwise_multiplier, - padding=0, - ): + input_dim: int, + depthwise_seperable_out_channel: int, + kernel_size: int, + depthwise_multiplier: int, + padding: int = 0, + ) -> None: super().__init__() self.dw_conv = nn.Conv1d( @@ -301,12 +303,11 @@ class DepthWiseSeperableConv1d(nn.Module): self.pw_conv = nn.Identity() self.depthwise_seperable_out_channel = depthwise_seperable_out_channel - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ Args: - x: torch.Tensor - input tensor + x: input tensor """ x = self.dw_conv(x) if self.depthwise_seperable_out_channel != 0: @@ -375,23 +376,23 @@ class ConvModule(nn.Module): def __init__( self, - input_dim, - ext_pw_out_channel, - depthwise_seperable_out_channel, - ext_pw_kernel_size, - kernel_size, - depthwise_multiplier, - dropout_rate, - causal=False, - batch_norm=False, - chunk_se=0, - chunk_size=18, - activation="relu", - glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - export=False, 
- ): + input_dim: int, + ext_pw_out_channel: int, + depthwise_seperable_out_channel: int, + ext_pw_kernel_size: int, + kernel_size: int, + depthwise_multiplier: int, + dropout_rate: float, + causal: bool = False, + batch_norm: bool = False, + chunk_se: int = 0, + chunk_size: int = 18, + activation: str = "relu", + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + export: bool = False, + ) -> None: super().__init__() self.layer_norm = nn.LayerNorm(input_dim) self.input_dim = input_dim @@ -437,7 +438,7 @@ class ConvModule(nn.Module): self.ln2 = nn.Linear(input_dim * depthwise_multiplier, input_dim) - def _add_ext_pw_layer(self): + def _add_ext_pw_layer(self) -> None: """ This function is an extension of __init__ function and dedicated to the convolution module creation @@ -497,12 +498,11 @@ class ConvModule(nn.Module): self.pw_conv_simplify_w = torch.nn.Parameter(torch.ones(3)) self.pw_conv_simplify_b = torch.nn.Parameter(torch.zeros(3)) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ConvModule Forward. Args: - x: torch.Tensor - input tensor. + x: input tensor. """ x = self.layer_norm(x) @@ -567,21 +567,20 @@ class GLULinear(nn.Module): def __init__( self, - input_dim, - output_dim, - glu_type="sigmoid", - bias_in_glu=True, - ): + input_dim: int, + output_dim: int, + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + ) -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim * 2, bias_in_glu) self.glu_act = GLU(-1, glu_type) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """GLULinear forward Args: - x: torch.Tensor - inpute tensor. + x: input tensor. """ x = self.linear(x) return self.glu_act(x) @@ -609,12 +608,12 @@ class FeedForward(nn.Module): def __init__( self, - d_model, - d_inner, - dropout_rate, - activation="sigmoid", - bias_in_glu=True, - ): + d_model: int, + d_inner: int, + dropout_rate: float, + activation: str = "sigmoid", + bias_in_glu: bool = True, + ) -> None: super().__init__() self.d_model = d_model self.d_inner = d_inner @@ -628,12 +627,11 @@ class FeedForward(nn.Module): nn.Dropout(dropout_rate), ) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """FeedForward forward function. Args: - x: torch.Tensor - input tensor. + x: input tensor. """ out = self.net(self.layer_norm(x)) @@ -642,14 +640,14 @@ class FeedForward(nn.Module): #### positional encoding starts here def _pre_hook( - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, -): + state_dict: dict, + prefix: str, + local_metadata: dict, + strict: bool, + missing_keys: list[str], + unexpected_keys: list[str], + error_msgs: list[str], +) -> None: """Perform pre-hook in load_state_dict for backward compatibility. 
Note: @@ -708,10 +706,10 @@ class T5RelativeAttentionLogitBias(nn.Module): """ def __init__(self, - num_heads, - num_buckets=-1, - max_distance=1000, - symmetric=False): + num_heads: int, + num_buckets: int = -1, + max_distance: int = 1000, + symmetric: bool = False) -> None: super().__init__() self.num_heads = num_heads self.num_buckets = num_buckets @@ -727,7 +725,7 @@ class T5RelativeAttentionLogitBias(nn.Module): self.num_buckets *= 2 self.bias_values = nn.Embedding(self.num_buckets, self.num_heads) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # instantiate bias compatible with shape of x maxpos = x.size(1) context_position = torch.arange(maxpos, @@ -760,7 +758,7 @@ class T5RelativeAttentionLogitBias(nn.Module): return t5_rel_att_bias - def _bucket_relative_position(self, relative_position): + def _bucket_relative_position(self, relative_position: Tensor) -> Tensor: # this is a placeholder (isn't tested, likely buggy) using HuggingFace # implem as a reference this also needs to be extended to support # asymmetric +/- ve positions @@ -810,7 +808,10 @@ class AbsolutePositionalEncoding(nn.Module): """ - def __init__(self, d_model, dropout_rate, max_len=5000): + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int = 5000) -> None: """Construct an PositionalEncoding object.""" super().__init__() self.d_model = d_model @@ -820,11 +821,11 @@ class AbsolutePositionalEncoding(nn.Module): self.extend_pe(torch.tensor(0.0).expand(1, max_len)) self._register_load_state_dict_pre_hook(_pre_hook) - def extend_pe(self, x): + def extend_pe(self, x: torch.Tensor) -> None: """Reset the positional encodings. Args: - x: torch.Tensor + x: input tensor """ if self.pe is not None and self.pe.size(1) >= x.size(1): if self.pe.dtype != x.dtype or self.pe.device != x.device: @@ -840,15 +841,14 @@ class AbsolutePositionalEncoding(nn.Module): pe = pe.unsqueeze(0) self.pe = pe.to(device=x.device, dtype=x.dtype) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor) -> torch.Tensor: """Add positional encoding. Args: - x: torch.Tensor - Input tensor. shape is (batch, time, ...) + x: Input tensor. shape is (batch, time, ...) Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + Encoded tensor. Its shape is (batch, time, ...) """ self.extend_pe(x) @@ -868,7 +868,7 @@ class MeanVarianceNormLayer(nn.Module): layer input size. """ - def __init__(self, input_size): + def __init__(self, input_size: int) -> None: super().__init__() self.input_size = input_size self.global_mean = nn.Parameter(torch.zeros(input_size)) @@ -878,8 +878,7 @@ class MeanVarianceNormLayer(nn.Module): """MeanVarianceNormLayer Forward Args: - input_: torch.Tensor - input tensor. + input_: input tensor. 
""" return (input_ - self.global_mean) * self.global_invstd @@ -949,7 +948,10 @@ class CausalConv1D(nn.Conv1d): dtype=dtype, ) - def update_cache(self, x, cache=None): + def update_cache( + self, + x: Tensor, + cache: Optional[Tensor] = None) -> tuple[Tensor, Optional[Tensor]]: if cache is None: new_x = F.pad(x, pad=(self._left_padding, self._right_padding)) next_cache = cache @@ -963,7 +965,11 @@ class CausalConv1D(nn.Conv1d): next_cache = next_cache[:, :, -cache.size(-1):] return new_x, next_cache - def forward(self, x, cache=None): + def forward( + self, + x: Tensor, + cache: Optional[Tensor] = None + ) -> Union[Tensor, tuple[Tensor, Optional[Tensor]]]: x, cache = self.update_cache(x, cache=cache) x = super().forward(x) if cache is None: @@ -1017,8 +1023,8 @@ class CausalConv2D(nn.Conv2d): def forward( self, - x, - ): + x: Tensor, + ) -> Tensor: x = F.pad( x, pad=(self._left_padding, self._right_padding, 0, 0), @@ -1062,16 +1068,16 @@ class NemoConvSubsampling(torch.nn.Module): """ def __init__( - self, - feat_in, - feat_out, - subsampling_factor=4, - subsampling="dw_striding", - conv_channels=256, - subsampling_conv_chunking_factor=1, - activation=nn.ReLU(), # noqa: B008 - is_causal=False, - ): + self, + feat_in: int, + feat_out: int, + subsampling_factor: int = 4, + subsampling: str = "dw_striding", + conv_channels: int = 256, + subsampling_conv_chunking_factor: int = 1, + activation: torch.nn.Module = nn.ReLU(), # noqa: B008 + is_causal: bool = False, + ) -> None: super().__init__() self._subsampling = subsampling self._conv_channels = conv_channels @@ -1328,28 +1334,25 @@ class NemoConvSubsampling(torch.nn.Module): self.conv = torch.nn.Sequential(*layers) - def get_sampling_frames(self): + def get_sampling_frames(self) -> list[int]: return [1, self.subsampling_factor] - def get_streaming_cache_size(self): + def get_streaming_cache_size(self) -> list[int]: return [0, self.subsampling_factor + 1] - def forward(self, x, mask): + def forward(self, x: Tensor, + mask: Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]: """ Forward method for NeMo subsampling. 
Args: - x[Batch, Time, Filters]: torch.Tensor - input tensor - x_mask: torch.Tensor - input mask + x: input tensor + mask: input mask Returns: - x: torch.Tensor - Resulting tensor from subsampling (B, T // + x: Resulting tensor from subsampling (B, T // time_reduction_factor, feat_out) - pad_mask: torch.Tensor - tensor of padded hidden state sequences (B, 1, T // + pad_mask: tensor of padded hidden state sequences (B, 1, T // time_reduction_factor) """ x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2) @@ -1403,7 +1406,7 @@ class NemoConvSubsampling(torch.nn.Module): padding_length.size(0), -1) < padding_length.unsqueeze(1) return x, pad_mask.unsqueeze(1) - def reset_parameters(self): + def reset_parameters(self) -> None: # initialize weights if self._subsampling == "dw_striding": with torch.no_grad(): @@ -1433,7 +1436,7 @@ class NemoConvSubsampling(torch.nn.Module): torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale) torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale) - def conv_split_by_batch(self, x): + def conv_split_by_batch(self, x: Tensor) -> tuple[Tensor, bool]: """Tries to split input by batch, run conv and concat results""" b, _, _, _ = x.size() if b == 1: # can't split if batch size is 1 @@ -1460,7 +1463,7 @@ class NemoConvSubsampling(torch.nn.Module): True, ) - def conv_split_by_channel(self, x): + def conv_split_by_channel(self, x: Tensor) -> Tensor: """For dw convs, tries to split input by time, run conv and concat results""" x = self.conv[0](x) # full conv2D @@ -1500,7 +1503,8 @@ class NemoConvSubsampling(torch.nn.Module): x = self.conv[i * 3 + 4](x) # activation return x - def channel_chunked_conv(self, conv, chunk_size, x): + def channel_chunked_conv(self, conv: torch.nn.Module, chunk_size: int, + x: Tensor) -> Tensor: """Performs channel chunked convolution""" ind = 0 @@ -1541,7 +1545,7 @@ class NemoConvSubsampling(torch.nn.Module): return torch.cat(out_chunks, 1) def change_subsampling_conv_chunking_factor( - self, subsampling_conv_chunking_factor: int): + self, subsampling_conv_chunking_factor: int) -> None: if (subsampling_conv_chunking_factor != -1 and subsampling_conv_chunking_factor != 1 and subsampling_conv_chunking_factor % 2 != 0): @@ -1552,12 +1556,12 @@ class NemoConvSubsampling(torch.nn.Module): self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor -def calc_length(lengths, - all_paddings, - kernel_size, - stride, - ceil_mode, - repeat_num=1): +def calc_length(lengths: Tensor, + all_paddings: int, + kernel_size: int, + stride: int, + ceil_mode: bool, + repeat_num: int = 1) -> Tensor: """Calculates the output length of a Tensor passed through a convolution or max pooling layer""" add_pad: float = all_paddings - kernel_size @@ -1573,11 +1577,11 @@ def calc_length(lengths, class AttModule(nn.Module): """Attention abstraction module""" - def __init__(self): + def __init__(self) -> None: super().__init__() self.export_mode = False - def set_export(self, mode=True): + def set_export(self, mode: bool = True) -> None: """set the export mode""" self.export_mode = mode @@ -1591,14 +1595,10 @@ class AttModule(nn.Module): """AttModule forward Args: - x: torch.Tensor - input tensor. - memory: torch.Tensor, optional - memory tensor. - pos_emb: torch.Tensor, optional - positional encoder embedding. - att_mask: torch.Tensor, optional - attention mask tensor. + x: input tensor. + memory: memory tensor. + pos_emb: positional encoder embedding. + att_mask: attention mask tensor. 
""" return x, memory, pos_emb, att_mask @@ -1606,15 +1606,15 @@ class AttModule(nn.Module): class AttBlock(BlockBase, AttModule): """Attention Block module to support both Attention and Block module.""" - def memory_dims(self, max_len=False): + def memory_dims(self, max_len: bool = False) -> tuple[int, int]: """memory dimensions""" return (1, self.input_size) def masked_softmax( - scores, + scores: Tensor, mask: Optional[Tensor], -): +) -> Tensor: if mask is not None: mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2) scores = scores.masked_fill(mask, -torch.inf) @@ -1636,10 +1636,6 @@ class MultiHeadedAttention(nn.Module): input size features. dropout_rate: float dropout rate. - use_LN: bool - apply layer norm or not - dropout_at_output: bool - whether to apply dropout at output attention_inner_dim: int, optional the attention dimension used in the class, it can be different from the input dimension n_feat. @@ -1666,16 +1662,16 @@ class MultiHeadedAttention(nn.Module): def __init__( self, - n_head, - n_feat, - dropout_rate, - attention_inner_dim=-1, - glu_type="swish", - bias_in_glu=True, - use_pt_scaled_dot_product_attention=False, - n_value=-1, + n_head: int, + n_feat: int, + dropout_rate: float, + attention_inner_dim: int = -1, + glu_type: str = "swish", + bias_in_glu: bool = True, + use_pt_scaled_dot_product_attention: bool = False, + n_value: int = -1, group_size: int = 1, - ): + ) -> None: super().__init__() if n_value == -1: n_value = n_feat @@ -1718,28 +1714,22 @@ class MultiHeadedAttention(nn.Module): query: Tensor, key: Tensor, value: Tensor, - pos_k: Tensor, - pos_v: Tensor, + pos_k: Optional[Tensor], + pos_v: Optional[Tensor], mask: Optional[Tensor], relative_attention_bias: Optional[Tensor] = None, - ): + ) -> Tensor: """Compute 'Scaled Dot Product Attention'. Args: - query: torch.Tensor - query tensor (batch, time1, size) - key: torch.Tensor - key tensor (batch, time2, size) - value: torch.Tensor - value tensor (batch, time1, size) - pos_k: torch.Tensor - key tensor used for relative positional embedding. - pos_v: torch.Tensor - value tensor used for relative positional embedding. - mask: torch.Tensor - mask tensor (batch, time1, time2) - relative_attention_bias: torch.Tensor - bias added to attention logits w.r.t. relative positions + query: query tensor (batch, time1, size) + key: key tensor (batch, time2, size) + value: value tensor (batch, time1, size) + pos_k: key tensor used for relative positional embedding. + pos_v: value tensor used for relative positional embedding. + mask: mask tensor (batch, time1, time2) + relative_attention_bias: bias added to attention logits w.r.t. + relative positions (1, n_head, time1, time2) """ n_batch = query.size(0) @@ -1832,20 +1822,20 @@ class MultiSequential(torch.nn.Sequential): """Multi-input multi-output torch.nn.Sequential""" @torch.jit.ignore - def forward(self, *args): + def forward(self, *args) -> tuple: """Forward method implementation.""" for m in self: args = m(*args) return args -def get_offset(input_layer: str, time_reduction: int): +def get_offset(input_layer: str, time_reduction: int) -> int: """Get an offset. We will use the offset for determining #frames of a subsampled feature. 
Args: - input_layer (str): Type of an input layer - time_reduction (int): time reduction factor for downsampling a feature + input_layer: Type of an input layer + time_reduction: time reduction factor for downsampling a feature Returns: int: offset """ @@ -1858,13 +1848,14 @@ def get_offset(input_layer: str, time_reduction: int): return 0 -def unfold_tensor(xs_pad, max_seq_len): +def unfold_tensor(xs_pad: Tensor, max_seq_len: int) -> Tensor: """ For a given tensor with shape of (N, T, D), if sequence length T is longer than max_seq_len, this function unfold it to a (NT', max_seq_len, D) where T' is T // max_seq_len. Args: - xs_pad: N, T, D + xs_pad: input tensor with shape (N, T, D) + max_seq_len: maximum sequence length """ _, _, D = xs_pad.shape xs_pad = xs_pad.transpose(-1, -2) # convert to N, D, T diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index a052b2a486f6a..020142b2c1c1a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1193,21 +1193,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, input_ids: Flattened (concatenated) input_ids corresponding to a batch. positions: Flattened (concatenated) position ids corresponding to a - batch. - **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL - opensource models), the shape will be `(3, seq_len)`, + batch. **NOTE**: If mrope is enabled (default setting for + Qwen2.5-VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. - second_per_grid_ts: Tensor `(num_videos)` of video time interval ( - in seconds) for each grid along the temporal dimension in the - 3D position IDs. `None` if no videos are passed. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index ed65944c109bd..34b9c1ad07d76 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -9,7 +9,7 @@ model alternates between state space model layers and attention-based layers. 
""" from collections.abc import Iterable from itertools import cycle -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -528,8 +528,6 @@ class Zamba2MambaDecoderLayer(nn.Module): hidden_states: Input tensor [batch_size, seq_len, hidden_size] mamba_cache_params: Parameters for Mamba's state caches (one for conv, one for ssm) - sequence_idx: Index tensor for identifying sequences in batch - Required for proper chunked processing in prefill transformer_hidden_states: Optional output from transformer path Added to input if provided (used in hybrid architecture) positions: Optional position IDs (unused in Mamba) @@ -591,8 +589,6 @@ class Zamba2HybridLayer(nn.Module): Args: shared_transformer: Transformer decoder layer for attention pathway - linear: Linear projection for transformer output before Mamba - mamba: Mamba decoder layer for state space pathway """ super().__init__() self.block_idx = block_idx @@ -630,8 +626,6 @@ class Zamba2HybridLayer(nn.Module): positions: Position IDs for positional embeddings mamba_cache_params: Parameters for Mamba's state caches (one for conv, one for ssm) - sequence_idx: Indices for identifying sequences in batch, - required for proper chunked processing in prefill Returns: Output tensor combining transformer and Mamba representations @@ -915,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): prefix: Optional prefix for parameter names Raises: - AssertionError: If prefix caching is enabled (not supported by - Mamba) + AssertionError: If prefix caching is enabled + (not supported by Mamba) """ config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -971,7 +965,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): input_ids: torch.Tensor, positions: torch.Tensor, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs) -> torch.Tensor: + **kwargs: Any) -> torch.Tensor: """Forward pass through the model. Args: @@ -1012,9 +1006,9 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str, - torch.Tensor], - **kwargs) -> dict[str, torch.Tensor]: + def copy_inputs_before_cuda_graphs( + self, input_buffers: dict[str, torch.Tensor], + **kwargs: Any) -> dict[str, torch.Tensor]: """Copy inputs before CUDA graph capture. Args: From 2eb9986a2d103466f37f92551f42cc0370596f2b Mon Sep 17 00:00:00 2001 From: Kay Yan <kay.yan@daocloud.io> Date: Wed, 10 Sep 2025 21:25:33 +0800 Subject: [PATCH 125/584] [BugFix] `python collect_env.py` and `vllm collect-env` compatibility with uv venv (#24066) Signed-off-by: Kay Yan <kay.yan@daocloud.io> --- vllm/collect_env.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 0291f64e84f0a..fb9d3657790cf 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -489,6 +489,16 @@ def get_libc_version(): return '-'.join(platform.libc_ver()) +def is_uv_venv(): + if os.environ.get("UV"): + return True + pyvenv_cfg_path = os.path.join(sys.prefix, 'pyvenv.cfg') + if os.path.exists(pyvenv_cfg_path): + with open(pyvenv_cfg_path, 'r') as f: + return any(line.startswith('uv = ') for line in f) + return False + + def get_pip_packages(run_lambda, patterns=None): """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" if patterns is None: @@ -504,7 +514,7 @@ def get_pip_packages(run_lambda, patterns=None): if pip_available: cmd = [sys.executable, '-mpip', 'list', '--format=freeze'] - elif os.environ.get("UV") is not None: + elif is_uv_venv(): print("uv is set") cmd = ["uv", "pip", "list", "--format=freeze"] else: From 736569da8d856d5a66d7d7523a7e05f97a4d21f1 Mon Sep 17 00:00:00 2001 From: zzhxxx <96690582+zzhx1@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:26:31 +0800 Subject: [PATCH 126/584] [Platform] Custom ops support for LMhead and LogitsProcessor (#23564) Signed-off-by: zzhx1 <zzh_201018@outlook.com> --- vllm/model_executor/layers/logits_processor.py | 5 +++-- vllm/model_executor/layers/vocab_parallel_embedding.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index e93be9bfb1657..8a4ac214443eb 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -6,11 +6,11 @@ from concurrent.futures import ThreadPoolExecutor from typing import Optional import torch -import torch.nn as nn import vllm.envs as envs from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -22,7 +22,8 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: envs.VLLM_LOGITS_PROCESSOR_THREADS) -class LogitsProcessor(nn.Module): +@CustomOp.register("logits_processor") +class LogitsProcessor(CustomOp): """Process logits and apply logits processors from sampling metadata. This layer does the following: diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index c92a7978195bc..15e628177b3f6 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -429,6 +429,7 @@ class VocabParallelEmbedding(CustomOp): return s +@CustomOp.register("parallel_lm_head") class ParallelLMHead(VocabParallelEmbedding): """Parallelized LM head. 
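The hunks above register LogitsProcessor and ParallelLMHead under string names so that out-of-tree platforms can locate and substitute custom implementations. In essence, the CustomOp.register decorator maintains a class-level name-to-class registry; the sketch below shows that idea, with the internals being assumptions of this example rather than vLLM's exact code.

import torch.nn as nn


class CustomOp(nn.Module):
    """Sketch of a name-based op registry in the spirit of
    CustomOp.register (internals assumed, not vLLM's actual code)."""

    op_registry: dict[str, type["CustomOp"]] = {}

    @classmethod
    def register(cls, name: str):
        def decorator(op_cls: type["CustomOp"]) -> type["CustomOp"]:
            assert name not in cls.op_registry, f"duplicate op: {name}"
            # Platforms can look the op up by name and swap in their own.
            cls.op_registry[name] = op_cls
            return op_cls
        return decorator


@CustomOp.register("logits_processor")
class LogitsProcessor(CustomOp):
    pass


assert CustomOp.op_registry["logits_processor"] is LogitsProcessor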
From fcc0a3130a9ffcb8d6ed12503372c2513f982d92 Mon Sep 17 00:00:00 2001 From: pwschuurman <psch@google.com> Date: Wed, 10 Sep 2025 06:57:36 -0700 Subject: [PATCH 127/584] [CI] Fix tensorizer test assertion (#24545) Signed-off-by: Peter Schuurman <psch@google.com> --- tests/tensorizer_loader/test_tensorizer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 0fb142a1b6e56..e00d7c2f80c67 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref): model = vllm_runner( model_ref, model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) + pytest.fail("Expected RuntimeError for extra config keys") except RuntimeError: out, err = capfd.readouterr() combined_output = out + err - assert ("ValueError: Model loader extra config " - "is not supported for load " + assert ("ValueError: Unexpected extra config keys for load " "format auto") in combined_output finally: del model @@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref, load_format="safetensors", model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) + pytest.fail("Expected RuntimeError for extra config keys") except RuntimeError: out, err = capfd.readouterr() combined_output = out + err - assert ("ValueError: Model loader extra config is not supported " + assert ("ValueError: Unexpected extra config keys " "for load format safetensors") in combined_output finally: del model From bb3eb80d92f66eed4016fa5ed9d9e66125a14482 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Wed, 10 Sep 2025 22:47:51 +0800 Subject: [PATCH 128/584] [Core] Split LoRA layers (#24574) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/test_layers.py | 10 +- vllm/lora/fully_sharded_layers.py | 355 ------ vllm/lora/layers.py | 1192 ------------------ vllm/lora/layers/__init__.py | 34 + vllm/lora/layers/base.py | 69 + vllm/lora/layers/base_linear.py | 184 +++ vllm/lora/layers/column_parallel_linear.py | 622 +++++++++ vllm/lora/layers/logits_processor.py | 247 ++++ vllm/lora/layers/qkv_x_parallel_linear.py | 8 + vllm/lora/layers/replicated_linear.py | 61 + vllm/lora/layers/row_parallel_linear.py | 201 +++ vllm/lora/layers/utils.py | 60 + vllm/lora/layers/vocal_parallel_embedding.py | 172 +++ vllm/lora/utils.py | 10 +- 14 files changed, 1668 insertions(+), 1557 deletions(-) delete mode 100644 vllm/lora/fully_sharded_layers.py delete mode 100644 vllm/lora/layers.py create mode 100644 vllm/lora/layers/__init__.py create mode 100644 vllm/lora/layers/base.py create mode 100644 vllm/lora/layers/base_linear.py create mode 100644 vllm/lora/layers/column_parallel_linear.py create mode 100644 vllm/lora/layers/logits_processor.py create mode 100644 vllm/lora/layers/qkv_x_parallel_linear.py create mode 100644 vllm/lora/layers/replicated_linear.py create mode 100644 vllm/lora/layers/row_parallel_linear.py create mode 100644 vllm/lora/layers/utils.py create mode 100644 vllm/lora/layers/vocal_parallel_embedding.py diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 891bc75fcdee0..b0038a28ed89d 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -12,20 +12,20 @@ import torch import torch.nn.functional as F from vllm.config import LoRAConfig -from vllm.lora.fully_sharded_layers import 
( - ColumnParallelLinearWithShardedLoRA, - MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA, - RowParallelLinearWithShardedLoRA) # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, + ColumnParallelLinearWithShardedLoRA, LogitsProcessorWithLoRA, LoRAMapping, MergedColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA, + QKVParallelLinearWithShardedLoRA, ReplicatedLinearWithLoRA, RowParallelLinearWithLoRA, + RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA) # yapf: enable from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py deleted file mode 100644 index 7fc4cfe026aee..0000000000000 --- a/vllm/lora/fully_sharded_layers.py +++ /dev/null @@ -1,355 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# pylint: disable=unused-argument -from typing import TYPE_CHECKING, Optional, Union, cast - -import torch -import torch.nn as nn -from transformers import PretrainedConfig - -from vllm.config import LoRAConfig -from vllm.distributed.communication_op import ( - tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLoRA, - QKVParallelLinearWithLoRA, - RowParallelLinearWithLoRA) -from vllm.platforms import current_platform - -if TYPE_CHECKING: - pass - - -def _fully_sharded_can_replace(can_replace): - """ - decorator which adds the condition of fully sharded loras - intended to wrap can_replace_layer() - """ - - def dec(*args, **kwargs): - return (can_replace(*args, **kwargs) - and kwargs["lora_config"].fully_sharded_loras) - - return dec - - -def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): - """ - For `ColumnParallelLinearWithLoRA` or classes that inherit from - `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. - """ - assert (layer.n_slices == len(layer.lora_a_stacked) == len( - layer.lora_b_stacked) == len(layer.output_slices)) - if layer.lora_bias_stacked is not None: - assert layer.n_slices == len(layer.lora_bias_stacked) - - output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - - # Since communication is needed, the buffer is directly initialized as a - # tensor rather than a tuple of tensor. 
- buffers = torch.zeros( - (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - - shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink( - buffers, x, layer.lora_a_stacked, 1.0) - - if not current_platform.can_update_inplace(): - buffers = shrunk_buffers - - buffers = tensor_model_parallel_all_gather(buffers) - - lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand( - output, - buffers, - layer.lora_b_stacked, - layer.lora_bias_stacked, - layer.output_slices, - offset_start=0, - add_input=True) - - if not current_platform.can_update_inplace(): - output = lora_output - - output = output.view(*out_orig_shape) - # now have column partitioned and packed output - return output - - -# these layers are based on the tensor parallelism strategy given in -# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, -# https://arxiv.org/abs/2311.03285. - - -class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): - """ - Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also. - - Based on S-LoRA, slicing happens along the rank dim. - """ - - # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`, - # their `lora_a` and `lora_b` have different sharding patterns. After - # completing the `lora_a` GEMM , a gather operation is performed. - # Therefore, the sharding of `lora_a` only needs to correspond with the - # gather operation. - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked[0].shape[2] - start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] - return lora_a - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return _mcp_apply(x, bias, self) - - @classmethod - @_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - # specifying kwargs so they can be easily accessed in decorator - return super().can_replace_layer( - source_layer=source_layer, - lora_config=lora_config, - packed_modules_list=packed_modules_list, - model_config=model_config, - decorate=False, - ) - - -class MergedColumnParallelLinearWithShardedLoRA( - MergedColumnParallelLinearWithLoRA): - """ - Differs from MergedColumnParallelLinearWithLoRA by slicing the - LoRA A's also. - - Based on S-LoRA, slicing happens along the rank dim. - """ - - def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: - #NOTE: lora_a contains 2 subloras, and each sublora could be None. 
- output_shard_size = self.lora_a_stacked[0].shape[2] - output_start_idx = self.tp_rank * output_shard_size - lora_a = [ - lora_a[0][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[0] is not None else None, - lora_a[1][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[1] is not None else None, - ] - return lora_a - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return _mcp_apply(x, bias, self) - - @classmethod - @_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - # specifying kwargs so they can be easily accessed in decorator - return super().can_replace_layer( - source_layer=source_layer, - lora_config=lora_config, - packed_modules_list=packed_modules_list, - model_config=model_config, - decorate=False, - ) - - -class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): - """ - Differs from QKVParallelLinearWithLoRA by slicing the - LoRA A's also. - - Based on S-LoRA, slicing happens along the rank dim. - """ - - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked[0].shape[2] - start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] - return lora_a - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return _mcp_apply(x, bias, self) - - @classmethod - @_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: list, - model_config: Optional[PretrainedConfig]) -> bool: - # specifying kwargs so they can be easily accessed in decorator - return super().can_replace_layer( - source_layer=source_layer, - lora_config=lora_config, - packed_modules_list=packed_modules_list, - model_config=model_config, - decorate=False, - ) - - -class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): - """ - Differs from MergedQKVParallelLinearWithLoRA by slicing the - LoRA A's also. - - Based on S-LoRA, slicing happens along the rank dim. - """ - - def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: - # NOTE: lora_a contains 3 subloras, and each sublora could be None. 
- shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] - start_idx = [self.tp_rank * shard_size[i] for i in range(3)] - lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + - shard_size[0]] if lora_a[0] is not None else None, - lora_a[1][:, start_idx[1]:start_idx[1] + - shard_size[1]] if lora_a[1] is not None else None, - lora_a[2][:, start_idx[2]:start_idx[2] + - shard_size[2]] if lora_a[2] is not None else None, - ] - return lora_a - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - return _mcp_apply(x, bias, self) - - @classmethod - @_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - # specifying kwargs so they can be easily accessed in decorator - return super().can_replace_layer( - source_layer=source_layer, - lora_config=lora_config, - packed_modules_list=packed_modules_list, - model_config=model_config, - decorate=False, - ) - - -class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): - """ - Differs from RowParallelLinearWithLoRA by slicing the - LoRA B's also. - - Based on S-LoRA, slicing happens along the output dim. - This yields a combined partial sum from the row parallel base - layer and column partitioned output from the LoRA. - """ - - def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - shard_size = self.lora_b_stacked[0].shape[2] - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] - return lora_b - - def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: - if bias is None: - return bias - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], - self.lora_bias_stacked) - shard_size = self.lora_bias_stacked[0].shape[2] - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - bias = bias[start_idx:end_idx] - return bias - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros( - (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - - shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink( - buffer, x, self.lora_a_stacked, 1.0) - if not current_platform.can_update_inplace(): - buffer = shrunk_buffer - - buffer = tensor_model_parallel_all_reduce(buffer) - - # following S-LoRA, allows the fusing of all_gather and all_reduce - # by adding the column partitioned lora output to a slice of output - # tensor, which is a partial sum due to row parallel. All that - # remains is a standard all_reduce. User should be aware though that - # the output is not the same as a normal row_parallel, it should be - # reduced before being used - # NOTE offset are based on the rank. 
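-        # e.g. with tp_size=4 and a per-rank LoRA-B shard of width s, rank r
-        # adds its expanded slice into output[:, r*s:(r+1)*s]; the single
-        # all_reduce done by the caller then completes both the base-layer
-        # partial sums and the LoRA column gather in one collective.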
- shard_size = self.lora_b_stacked[0].shape[2] - offset_start = self.tp_rank * shard_size - lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand( - output, - buffer, - self.lora_b_stacked, - self.lora_bias_stacked, - self.output_slices, - offset_start=offset_start, - add_input=True, - ) - - if not current_platform.can_update_inplace(): - output = lora_output - - output = output.view(*out_orig_shape) - return output - - @classmethod - @_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - # specifying kwargs so they can be easily accessed in decorator - return super().can_replace_layer( - source_layer=source_layer, - lora_config=lora_config, - packed_modules_list=packed_modules_list, - model_config=model_config, - decorate=False, - ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py deleted file mode 100644 index 6e4b69c303254..0000000000000 --- a/vllm/lora/layers.py +++ /dev/null @@ -1,1192 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# pylint: disable=unused-argument -import math -from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union, cast - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PretrainedConfig - -from vllm.adapter_commons.layers import AdapterMapping -from vllm.config import LoRAConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce) -from vllm.distributed.utils import divide -# yapf: disable -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -# yapf: enable -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.platforms import current_platform - -if TYPE_CHECKING: - from vllm.lora.punica_wrapper import PunicaWrapperBase - - -def _get_lora_device(base_layer: nn.Module) -> torch.device: - # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 - """Returns the device for where to place the LoRA tensors.""" - # unquantizedLinear - if hasattr(base_layer, "weight"): - return base_layer.weight.device - # Compressed Tensor - elif hasattr(base_layer, "weight_packed"): - return base_layer.weight_packed.device - # GPTQ/AWQ - elif hasattr(base_layer, "qweight"): - return base_layer.qweight.device - # HQQ marlin - elif hasattr(base_layer, "W_q"): - return base_layer.W_q.device - else: - raise ValueError(f"Unsupported base layer: {base_layer}") - - -def _not_fully_sharded_can_replace(can_replace): - """ - decorator which adds the condition of not using fully sharded loras - intended to wrap can_replace_layer() - """ - - def dec(*args, **kwargs): - decorate = kwargs.pop("decorate") if "decorate" in kwargs else True - condition = (not kwargs["lora_config"].fully_sharded_loras - if decorate else True) - return can_replace(*args, **kwargs) and condition - - return dec - - -@dataclass -class LoRAMapping(AdapterMapping): - is_prefill: bool = False - - -class BaseLayerWithLoRA(nn.Module): - - def slice_lora_a( - self, 
lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: - """Slice lora a if splitting for tensor parallelism.""" - ... - - def slice_lora_b( - self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]] - ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: - """Slice lora b if splitting with tensor parallelism.""" - ... - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - """Initializes lora matrices.""" - ... - - def reset_lora(self, index: int): - """Resets the lora weights at index back to 0.""" - ... - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - """Overwrites lora tensors at index.""" - ... - - def set_mapping( - self, - punica_wrapper, - ): - self.punica_wrapper: PunicaWrapperBase = punica_wrapper - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - """Returns True if the layer can be replaced by this LoRA layer.""" - raise NotImplementedError - - -class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): - - def __init__(self, base_layer: VocabParallelEmbedding) -> None: - super().__init__() - self.base_layer = base_layer - self.embeddings_slice: Optional[tuple[int, int]] - self.embeddings_weights: Optional[torch.Tensor] - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - - if self.base_layer.num_added_embeddings_per_partition > 0: - # We can start adding lora weights - self.embeddings_weights = self.base_layer.weight.data[ - self.base_layer.num_org_embeddings_per_partition:self. 
- base_layer.num_org_embeddings_per_partition + - self.base_layer.num_added_embeddings_per_partition] - self.embeddings_slice = ( - self.base_layer.shard_indices.added_vocab_start_index - - self.base_layer.org_vocab_size, - self.base_layer.shard_indices.added_vocab_end_index - - self.base_layer.org_vocab_size) - self.base_layer.weight.data[ - self.base_layer.num_org_embeddings_per_partition:].fill_(0) - else: - self.embeddings_slice = None - self.embeddings_weights = None - - self.embeddings_tensors = torch.zeros( - ( - max_loras, - lora_config.lora_extra_vocab_size, - self.base_layer.embedding_dim, - ), - dtype=self.base_layer.weight.dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked_2d = self.lora_a_stacked.view( - self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], - self.lora_a_stacked.shape[2], - ) - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( - lora_a, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - :embeddings_tensor.shape[0], - :embeddings_tensor.shape[1], - ].copy_(embeddings_tensor, non_blocking=True) - if self.embeddings_slice is not None: - # TODO(yard1): Optimize this copy, we don't need to copy - # everything, just the modified part - embeddings = self.embeddings_tensors.view( - self.embeddings_tensors.shape[0] * - self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2], - )[self.embeddings_slice[0]:self.embeddings_slice[1]] - assert self.embeddings_weights is not None - self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, - 1, 0) - - # NB: Don't use torch.narrow here. 
torch.narrow triggers some - # Dynamic Shape specialization in torch.compile - num_tokens = x.shape[0] - indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens] - indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens] - - full_lora_a_embeddings = F.embedding( - x + indices_1, - self.lora_a_stacked_2d, - ) - full_output = self.base_layer.forward(x + - (indices_0 * added_tokens_mask)) - - full_output_org = full_output - if full_output.ndim == 3: - full_output = full_output.view( - full_output.shape[0] * full_output.shape[1], -1) - if full_lora_a_embeddings.ndim == 3: - full_lora_a_embeddings = full_lora_a_embeddings.view( - full_lora_a_embeddings.shape[0] * - full_lora_a_embeddings.shape[1], - -1, - ) - - lora_output: Optional[ - torch.Tensor] = self.punica_wrapper.add_lora_embedding( - full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - add_input=True) - - if not current_platform.can_update_inplace(): - full_output = lora_output - - return full_output.view_as(full_output_org) - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is VocabParallelEmbedding - - @property - def weight(self): - return self.base_layer.weight - - -class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): - - def __init__(self, base_layer: LinearBase): - super().__init__() - self.base_layer = base_layer - self.input_size = self.base_layer.input_size - self.device = _get_lora_device(self.base_layer) - self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None - - self.output_slices: tuple[int, ...] - self.tp_size: int - self.output_size: int - self.n_slices: int - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - # - if isinstance(self.base_layer, ReplicatedLinear): - lora_a_out_size = lora_config.max_lora_rank - lora_b_out_size = self.output_size - - elif isinstance(self.base_layer, ColumnParallelLinear): - lora_a_out_size = (lora_config.max_lora_rank if - not lora_config.fully_sharded_loras else divide( - lora_config.max_lora_rank, self.tp_size)) - lora_b_out_size = self.output_size - - elif isinstance(self.base_layer, RowParallelLinear): - lora_a_out_size = lora_config.max_lora_rank - lora_b_out_size = (self.output_size if - not lora_config.fully_sharded_loras else divide( - self.output_size, self.tp_size)) - else: - raise NotImplementedError - - self.lora_a_stacked = tuple( - torch.zeros( - max_loras, - 1, - lora_a_out_size, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - max_loras, - 1, - lora_b_out_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - if lora_config.bias_enabled: - lora_bias_out_size = lora_b_out_size - self.lora_bias_stacked = tuple( - torch.zeros( - max_loras, - 1, - lora_bias_out_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - self.output_slices = (self.lora_b_stacked[0].shape[2], ) - - def reset_lora(self, index: int): - for s_index in range(self.n_slices): - self.lora_a_stacked[s_index][index] = 0 - self.lora_b_stacked[s_index][index] = 0 - if self.lora_config.bias_enabled: - # Make mypy happy - self.lora_bias_stacked = 
cast(tuple[torch.Tensor, ...], - self.lora_bias_stacked) - self.lora_bias_stacked[s_index][index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - lora_bias: Optional[torch.Tensor] = None, - ): - # Except for QKVParallelLinearWithLoRA and - # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers - # store weights in a tuple of size 1. These two layers will - # override this function. - assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == - self.n_slices == 1) - - self.reset_lora(index) - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if lora_bias is not None: - lora_bias = self.slice_bias(lora_bias) - - self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if lora_bias is not None: - - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], - self.lora_bias_stacked) - assert len(self.lora_bias_stacked) - self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( - lora_bias.T, non_blocking=True) - - def apply(self, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - # In transformers backend, x and output have extra batch dimension like - # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), - # therefore we need to flatten the batch dimensions. - if x.ndim == 3 and output.ndim == 3: - output = output.flatten(0, 1) - x = x.flatten(0, 1) - - lora_output: Optional[ - torch.Tensor] = self.punica_wrapper.add_lora_linear( - output, x, self.lora_a_stacked, self.lora_b_stacked, - self.lora_bias_stacked, 1.0, self.output_slices) - if not current_platform.can_update_inplace(): - output = lora_output - - return output - - @property - def weight(self) -> torch.Tensor: - - # unquantizedLinear - if hasattr(self.base_layer, "weight"): - return self.base_layer.weight - # Compressed Tensor - elif hasattr(self.base_layer, "weight_packed"): - return self.base_layer.weight_packed - # GPTQ/AWQ - elif hasattr(self.base_layer, "qweight"): - return self.base_layer.qweight - # marlin - elif hasattr(self.base_layer, "B"): - return self.base_layer.B - # HQQ marlin - elif hasattr(self.base_layer, "W_q"): - return self.base_layer.W_q - else: - raise ValueError(f"Unsupported base layer: {self.base_layer}") - - @property - def bias(self) -> Optional[torch.Tensor]: - if hasattr(self.base_layer, "bias"): - return self.base_layer.bias - else: - return None - - -class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): - - def __init__(self, base_layer: ReplicatedLinear) -> None: - super().__init__(base_layer, ) - # To ensure interface compatibility, set to 1 always. - self.tp_size = 1 - self.output_size = self.base_layer.output_size - self.n_slices = 1 - - def forward( - self, input_: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: - """Forward of ReplicatedLinearWithLoRA - - Args: - input_: Tensor whose last dimension is `input_size`. - - Returns: - - output - - bias - """ - bias = (self.base_layer.bias - if not self.base_layer.skip_bias_add else None) - - # Matrix multiply. 
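-        # apply() runs the base GEMM and then adds the low-rank update;
-        # conceptually y = x @ W.T + (x @ A.T) @ B.T for each active adapter.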
- output = self.apply(input_, bias) - - output_bias = (self.base_layer.bias - if self.base_layer.skip_bias_add else None) - - if not self.base_layer.return_bias: - return output - - return output, output_bias - - # ReplicatedLinear should always be replaced, regardless of the fully - # sharded LoRAs setting, because it is, by definition, copied per GPU. - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is ReplicatedLinear - - -class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - """ - LoRA on top of ColumnParallelLinear layer. - LoRA B is sliced for tensor parallelism. - There are two types for the `base_layer`: - 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. - 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. - """ - - def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__(base_layer) - # The base_layer type is ColumnParallelLinear or - # MergedColumnParallelLinear, their weight sharding logic is - # inconsistent when TP is greater than 1. - self.is_merged_col_linear = type( - base_layer) is MergedColumnParallelLinear - self.tp_size = get_tensor_model_parallel_world_size() - self.output_size = self.base_layer.output_size_per_partition - # There is only one LoRA layer - self.n_slices = 1 - - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - return lora_a - - def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - # Applicable to cases where the base_layer is - # MergedColumnParallelLinear. - if self.is_merged_col_linear: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.output_size // 2 - offset = lora_b.shape[-1] // 2 - - left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) * - shard_size] - right_weight = lora_b[:, offset + tp_rank * shard_size:offset + - (tp_rank + 1) * shard_size] - lora_b = torch.cat([left_weight, right_weight], dim=1) - # Applicable to cases where the base_layer is - # ColumnParallelLinear. - else: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] - return lora_b - - def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: - # TODO: Fix the slicing logic of bias. - if bias is None: - return bias - tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - bias = bias[start_idx:end_idx] - return bias - - def forward( - self, input_: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: - """Forward of ColumnParallelLinear - - Args: - input_: Tensor whose last dimension is `input_size`. - - Returns: - - output - - bias - """ - bias = (self.base_layer.bias - if not self.base_layer.skip_bias_add else None) - - # Matrix multiply. - output_parallel = self.apply(input_, bias) - if self.base_layer.gather_output: - # All-gather across the partitions. 
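-            # e.g. with tp_size=2 each rank holds a
-            # (num_tokens, output_size_per_partition) shard; the gather
-            # concatenates the shards along the output dimension.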
- output = tensor_model_parallel_all_gather(output_parallel) - else: - output = output_parallel - - if not self.base_layer.return_bias: - return output - - output_bias = (self.base_layer.bias - if self.base_layer.skip_bias_add else None) - return output, output_bias - - @classmethod - @_not_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is ColumnParallelLinear or ( - type(source_layer) is MergedColumnParallelLinear - and len(packed_modules_list) == 1) - - -class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): - """ColumnParallelLinear layer that is composed of 2 sublayers (slices) - packed together (e.g. gate_proj + up_proj -> gate_up_proj). - - This means we have 2 LoRAs, each applied to one half of the layer. - - Both slices must have the same size. - """ - - def __init__( - self, base_layer: Union[MergedColumnParallelLinear, - QKVParallelLinear]) -> None: - super().__init__(base_layer) - # There are two LoRA layers - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - # the output_sizes in MergedColumnParallelLinear is not sharded by tp - # we need to divide it by the tp_size to get correct slices size - output_sizes = self.base_layer.output_sizes - self.output_slices = tuple( - divide(output_size, self.tp_size) for output_size in output_sizes) - self.n_slices = len(self.output_slices) - self.output_ids = (self.tp_rank, ) * self.n_slices - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - """ - The main reason for overriding this function is to enhance code - maintainability. 
- """ - self.lora_config = lora_config - - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - - self.lora_a_stacked = tuple( - torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - max_loras, - 1, - output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) for output_size in self.output_slices) - if lora_config.bias_enabled: - self.lora_bias_stacked = tuple( - torch.zeros( - max_loras, - 1, - output_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) for output_size in self.output_slices) - - def slice_lora_a( - self, lora_a: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: - return lora_a - - def slice_lora_b( - self, lora_b: list[Union[torch.Tensor, None]] - ) -> list[Union[torch.Tensor, None]]: - sliced_lora_b = [None] * self.n_slices - for i, (shard_id, shard_size) in enumerate( - zip(self.output_ids, self.output_slices)): - if (lora_b_i := lora_b[i]) is not None: - sliced_lora_b[i] = lora_b_i[:, - shard_size * shard_id:shard_size * - (shard_id + 1)] - return sliced_lora_b - - def slice_bias( - self, bias: list[Union[torch.Tensor, - None]]) -> list[Union[torch.Tensor, None]]: - for i, (shard_id, shard_size) in enumerate( - zip(self.output_ids, self.output_slices)): - if (bias_i := bias[i]) is not None: - bias[i] = bias_i[shard_size * shard_id:shard_size * - (shard_id + 1)] - return bias - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - lora_bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if lora_bias is not None: - lora_bias = self.slice_bias(lora_bias) - - for i in range(self.n_slices): - if (lora_a_i := lora_a[i]) is not None: - self.lora_a_stacked[i][ - index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( - lora_a_i.T, non_blocking=True) - if (lora_b_i := lora_b[i]) is not None: - self.lora_b_stacked[i][ - index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( - lora_b_i.T, non_blocking=True) - - if lora_bias is not None: - self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], - self.lora_bias_stacked) - for i in range(self.n_slices): - if (lora_bias_i := lora_bias[i]) is not None: - self.lora_bias_stacked[i][index, - 0, :lora_bias_i.shape[0]].copy_( - lora_bias_i.T, - non_blocking=True) - - @classmethod - @_not_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return (type(source_layer) is MergedColumnParallelLinear - and len(packed_modules_list) == 2) - - -class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): - """ - ColumnParallelLinear layer that is specifically designed for - qkv_proj. Certain models, such as chatglm3 and baichuan-7b, - only contains a single LoRA within their qkv_proj layer. - - During inference with Tensor Parallel, the weights of lora_b - must be accurately partitioned according to the respective ranks. - - Q slice may have different shape than K and V slices (which both have - the same shape). 
- """ - - def __init__(self, base_layer: QKVParallelLinear) -> None: - super().__init__(base_layer) - self.q_proj_total_size = (self.base_layer.total_num_heads * - self.base_layer.head_size) - self.q_proj_shard_size = (self.base_layer.num_heads * - self.base_layer.head_size) - self.kv_proj_shard_size = (self.base_layer.num_kv_heads * - self.base_layer.head_size) - self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * - self.base_layer.head_size) - # There is only one LoRA layer - self.n_slices = 1 - - def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - tp_rank = get_tensor_model_parallel_rank() - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - k_offset = self.q_proj_total_size - lora_b_k = lora_b[:, k_offset + - self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + - self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) - return lora_b - - def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: - bias_q = bias[self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - k_offset = self.q_proj_total_size - bias_k = bias[k_offset + - self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - v_offset = k_offset + self.kv_proj_total_size - bias_v = bias[v_offset + - self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - bias = torch.cat([bias_q, bias_k, bias_v], dim=1) - return bias - - @classmethod - @_not_fully_sharded_can_replace - def can_replace_layer(cls, source_layer: nn.Module, - lora_config: LoRAConfig, packed_modules_list: list, - model_config: Optional[PretrainedConfig]) -> bool: - return type(source_layer) is QKVParallelLinear and len( - packed_modules_list) == 1 - - -class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA): - """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices) - packed together in qkv proj fashion - (q_proj + k_proj + v_proj -> qkv_proj). - - This means we have 3 LoRAs, each applied to one slice of the layer. - - Q slice may have different shape than K and V slices (which both have - the same shape). - """ - - def __init__(self, base_layer: QKVParallelLinear) -> None: - super().__init__(base_layer) - # There are three LoRA layer. 
- self.n_slices = len(self.base_layer.output_sizes) - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.q_proj_shard_size = (self.base_layer.num_heads * - self.base_layer.head_size) - self.kv_proj_shard_size = (self.base_layer.num_kv_heads * - self.base_layer.head_size) - self.q_shard_id = self.tp_rank - self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas - - self.output_slices = ( - self.q_proj_shard_size, - self.kv_proj_shard_size, - self.kv_proj_shard_size, - ) - self.output_ids = ( - self.q_shard_id, - self.kv_shard_id, - self.kv_shard_id, - ) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - """ - The main reason for overloading this function is to handle inconsistent - weight dimensions in qkv lora. - """ - super().create_lora_weights(max_loras, lora_config, model_config) - - @classmethod - @_not_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return (type(source_layer) is QKVParallelLinear - and len(packed_modules_list) == 3) - - -#TODO: Implement this -class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA): - pass - - -class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - - def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__(base_layer) - - self.tp_size = get_tensor_model_parallel_world_size() - # reset input_size - self.input_size = self.base_layer.input_size_per_partition - self.output_size = self.base_layer.output_size - - self.tp_rank = get_tensor_model_parallel_rank() - # There is only one LoRA layer. - self.n_slices = 1 - - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - - shard_size = self.input_size - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - lora_a = lora_a[start_idx:end_idx, :] - return lora_a - - def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - return lora_b - - def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: - return bias - - def forward( - self, input_: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: - """Forward of RowParallelLinear - - Args: - input_: tensor whose last dimension is `input_size`. If - `input_is_parallel` is set, then the last dimension - is `input_size // tp_size`. - - Returns: - - output - - bias - """ - # set up backprop all-reduce. - if self.base_layer.input_is_parallel: - input_parallel = input_ - else: - # TODO: simplify code below - splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[self.tp_rank].contiguous() - - # Matrix multiply. 
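-        # Each rank multiplies only its input shard, so output_parallel is a
-        # partial sum until reduce_results triggers the all_reduce below.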
- output_parallel = self.apply(input_parallel) - if self.base_layer.reduce_results and self.base_layer.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) - else: - output_ = output_parallel - - if not self.base_layer.skip_bias_add: - output = (output_ + self.base_layer.bias - if self.base_layer.bias is not None else output_) - output_bias = None - else: - output = output_ - output_bias = self.base_layer.bias - - if not self.base_layer.return_bias: - return output - - return output, output_bias - - @classmethod - @_not_fully_sharded_can_replace - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - return type(source_layer) is RowParallelLinear - - -class LogitsProcessorWithLoRA(BaseLayerWithLoRA): - """ - LoRA wrapper for LogitsProcessor, with extra logic to handle the - application of the LoRA adapter and added LoRA vocabulary. - - Args: - base_layer: LogitsProcessor layer - hidden_size: hidden size of the model - dtype: data type of the model - device: device of the model - sharded_to_full_mapping: index mapping from sharded vocab to full vocab - received from base_layer.get_sharded_to_full_mapping(). If None, - no reindexing will be done. - """ - - def __init__(self, base_layer: LogitsProcessor, hidden_size: int, - dtype: torch.dtype, device: torch.device, - sharded_to_full_mapping: Optional[list[int]]) -> None: - super().__init__() - self.base_layer = base_layer - self.hidden_size = hidden_size - self.dtype = dtype - self.device = device - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - self.sharded_to_full_mapping = sharded_to_full_mapping - - @property - def logits_as_input(self): - return self.base_layer.logits_as_input - - @property - def vocab_size(self): - return self.base_layer.vocab_size - - @property - def scale(self): - return self.base_layer.scale - - @property - def soft_cap(self): - return self.base_layer.soft_cap - - @property - def use_all_gather(self): - return self.base_layer.use_all_gather - - @property - def org_vocab_size(self): - return self.base_layer.org_vocab_size - - @property - def include_gpu_probs_tensor(self): - return self.base_layer.include_gpu_probs_tensor - - @property - def should_modify_greedy_probs_inplace(self): - return self.base_layer.should_modify_greedy_probs_inplace - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - # TODO: Verify if this condition can be further relaxed - if 32000 < self.base_layer.vocab_size > 257024: - raise ValueError("When using LoRA, vocab size must be " - "32000 >= vocab_size <= 257024") - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.embeddings_tensors = torch.full( - (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), - fill_value=float("-inf"), - dtype=self.dtype, - device=self.device, - ) - if self.sharded_to_full_mapping is not None: - self.sharded_to_full_mapping_gpu = torch.tensor( 
- self.sharded_to_full_mapping, - device=self.device, - dtype=torch.long) - else: - self.sharded_to_full_mapping_gpu = None - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = float("-inf") - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - :embeddings_tensor.shape[0], - :embeddings_tensor.shape[1], - ] = embeddings_tensor - - def _get_logits( - self, - hidden_states: torch.Tensor, - lm_head: VocabParallelEmbedding, - embedding_bias: Optional[torch.Tensor] = None, - ) -> Optional[torch.Tensor]: - # Get the logits for the next tokens. - logits = lm_head.quant_method.apply(lm_head, hidden_states) - if embedding_bias is not None: - logits += embedding_bias - - # Gather logits for TP - logits = self.base_layer._gather_logits(logits) - - if logits is None: - return None - - if self.sharded_to_full_mapping_gpu is not None: - # Reindex full logits tensor to ensure 1:1 mapping between - # index and token_id - # Example for: - # org_vocab_size = 4 - # added_vocab_size = 2 - # pad_to_size = 8 - # tp_size = 2 - - # indices: [0, 1, 2, 3, 4, 5, 6, 7] - # token_id: [0, 1, 4, -1, 2, 3, 5, -1] - - # Therefore, the mapping is expected to be: - # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, - # we get: - # indices: [0, 1, 2, 3, 4, 5, 6, 7] - # token_id: [0, 1, 2, 3, 4, 5, -1, -1] - logits = logits[:, self.sharded_to_full_mapping_gpu] - - lora_logits = torch.empty( - self.embeddings_tensors.shape[0] + 1, - self.embeddings_tensors.shape[1], - hidden_states.shape[0], - dtype=self.embeddings_tensors.dtype, - device=self.embeddings_tensors.device, - ) - torch.matmul(self.embeddings_tensors, - hidden_states.T, - out=lora_logits[:-1]) - - neg_inf, pos_inf = current_platform.get_infinity_values( - lora_logits.dtype) - - lora_logits[-1] = neg_inf - lora_logits = lora_logits.mT - indices_padded = self.punica_wrapper.sampler_indices_padded - - if current_platform.is_tpu() or current_platform.is_xpu(): - indices_padded = indices_padded[:logits.size(0)] - - lora_logits = (lora_logits.reshape( - lora_logits.shape[0] * lora_logits.shape[1], - lora_logits.shape[2], - ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf, - posinf=pos_inf, - neginf=neg_inf)) - - logits[:, - self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1]] = lora_logits - - lora_output: Optional[ - torch.Tensor] = self.punica_wrapper.add_lora_logits( - logits, hidden_states, self.lora_a_stacked, - self.lora_b_stacked, 1.0) - - if not current_platform.can_update_inplace(): - logits = lora_output - - # Remove paddings in vocab (if any). - logits = logits[:, :self.base_layer.vocab_size] - return logits - - def forward(self, *args, **kwargs): - return type(self.base_layer).forward(self, *args, **kwargs) - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - # Special handling for the LogitsProcessor. 
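-        # LogitsProcessorWithLoRA is instantiated explicitly by the LoRA
-        # model wrapper rather than discovered through this generic
-        # replacement scan, so it always opts out here.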
- return False diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py new file mode 100644 index 0000000000000..d3bb145dc7bf8 --- /dev/null +++ b/vllm/lora/layers/__init__.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.lora.layers.base import BaseLayerWithLoRA +from vllm.lora.layers.column_parallel_linear import ( + ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA, + MergedColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA, + QKVParallelLinearWithShardedLoRA) +from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA +from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA +from vllm.lora.layers.row_parallel_linear import ( + RowParallelLinearWithLoRA, RowParallelLinearWithShardedLoRA) +from vllm.lora.layers.utils import LoRAMapping +from vllm.lora.layers.vocal_parallel_embedding import ( + VocabParallelEmbeddingWithLoRA) + +__all__ = [ + "BaseLayerWithLoRA", + "VocabParallelEmbeddingWithLoRA", + "LogitsProcessorWithLoRA", + "ColumnParallelLinearWithLoRA", + "ColumnParallelLinearWithShardedLoRA", + "MergedColumnParallelLinearWithLoRA", + "MergedColumnParallelLinearWithShardedLoRA", + "MergedQKVParallelLinearWithLoRA", + "MergedQKVParallelLinearWithShardedLoRA", + "QKVParallelLinearWithLoRA", + "QKVParallelLinearWithShardedLoRA", + "RowParallelLinearWithLoRA", + "RowParallelLinearWithShardedLoRA", + "ReplicatedLinearWithLoRA", + "LoRAMapping", +] diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py new file mode 100644 index 0000000000000..0e759d5d5719b --- /dev/null +++ b/vllm/lora/layers/base.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, Optional, Union + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig + +if TYPE_CHECKING: + from vllm.lora.punica_wrapper import PunicaWrapperBase + + +class BaseLayerWithLoRA(nn.Module): + + def slice_lora_a( + self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: + """Slice lora a if splitting for tensor parallelism.""" + ... + + def slice_lora_b( + self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]] + ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]: + """Slice lora b if splitting with tensor parallelism.""" + ... + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: + """Initializes lora matrices.""" + ... + + def reset_lora(self, index: int): + """Resets the lora weights at index back to 0.""" + ... + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, + ): + """Overwrites lora tensors at index.""" + ... 
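+
+    # Expected call order for these hooks (illustrative sketch only;
+    # `cfg`, `wrapper` and `slot` are hypothetical names):
+    #
+    #   layer.create_lora_weights(max_loras=8, lora_config=cfg)
+    #   layer.set_mapping(wrapper)       # punica wrapper for batched LoRA
+    #   layer.set_lora(slot, lora_a, lora_b, embeddings_tensor=None)
+    #   layer.reset_lora(slot)           # zero the slot when evicted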
+ + def set_mapping( + self, + punica_wrapper, + ): + self.punica_wrapper: PunicaWrapperBase = punica_wrapper + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + """Returns True if the layer can be replaced by this LoRA layer.""" + raise NotImplementedError diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py new file mode 100644 index 0000000000000..4e062971d9188 --- /dev/null +++ b/vllm/lora/layers/base_linear.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional, cast + +import torch +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.distributed.utils import divide +# yapf: disable +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, ReplicatedLinear, + RowParallelLinear) +from vllm.platforms import current_platform + +from .base import BaseLayerWithLoRA +from .utils import _get_lora_device + + +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): + + def __init__(self, base_layer: LinearBase): + super().__init__() + self.base_layer = base_layer + self.input_size = self.base_layer.input_size + self.device = _get_lora_device(self.base_layer) + self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None + + self.output_slices: tuple[int, ...] + self.tp_size: int + self.output_size: int + self.n_slices: int + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: + self.lora_config = lora_config + # + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = (lora_config.max_lora_rank if + not lora_config.fully_sharded_loras else divide( + lora_config.max_lora_rank, self.tp_size)) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = (self.output_size if + not lora_config.fully_sharded_loras else divide( + self.output_size, self.tp_size)) + else: + raise NotImplementedError + + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_out_size, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_b_out_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + if lora_config.bias_enabled: + lora_bias_out_size = lora_b_out_size + self.lora_bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_bias_out_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.output_slices = (self.lora_b_stacked[0].shape[2], ) + + def reset_lora(self, index: int): + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + if self.lora_config.bias_enabled: + # Make mypy happy + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[s_index][index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + 
embeddings_tensor: Optional[torch.Tensor], + lora_bias: Optional[torch.Tensor] = None, + ): + # Except for QKVParallelLinearWithLoRA and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # store weights in a tuple of size 1. These two layers will + # override this function. + assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == + self.n_slices == 1) + + self.reset_lora(index) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + if lora_bias is not None: + + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked) + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( + lora_bias.T, non_blocking=True) + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + # In transformers backend, x and output have extra batch dimension like + # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), + # therefore we need to flatten the batch dimensions. + if x.ndim == 3 and output.ndim == 3: + output = output.flatten(0, 1) + x = x.flatten(0, 1) + + lora_output: Optional[ + torch.Tensor] = self.punica_wrapper.add_lora_linear( + output, x, self.lora_a_stacked, self.lora_b_stacked, + self.lora_bias_stacked, 1.0, self.output_slices) + if not current_platform.can_update_inplace(): + output = lora_output + + return output + + @property + def weight(self) -> torch.Tensor: + + # unquantizedLinear + if hasattr(self.base_layer, "weight"): + return self.base_layer.weight + # Compressed Tensor + elif hasattr(self.base_layer, "weight_packed"): + return self.base_layer.weight_packed + # GPTQ/AWQ + elif hasattr(self.base_layer, "qweight"): + return self.base_layer.qweight + # marlin + elif hasattr(self.base_layer, "B"): + return self.base_layer.B + # HQQ marlin + elif hasattr(self.base_layer, "W_q"): + return self.base_layer.W_q + else: + raise ValueError(f"Unsupported base layer: {self.base_layer}") + + @property + def bias(self) -> Optional[torch.Tensor]: + if hasattr(self.base_layer, "bias"): + return self.base_layer.bias + else: + return None diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py new file mode 100644 index 0000000000000..d2f8e05554c84 --- /dev/null +++ b/vllm/lora/layers/column_parallel_linear.py @@ -0,0 +1,622 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional, Union, cast + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) +from vllm.distributed.utils import divide +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear) +from vllm.platforms import current_platform + +from .base_linear import BaseLinearLayerWithLoRA +from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace + + +def _mcp_apply(x, bias, layer: 
"ColumnParallelLinearWithLoRA"): + """ + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert (layer.n_slices == len(layer.lora_a_stacked) == len( + layer.lora_b_stacked) == len(layer.output_slices)) + if layer.lora_bias_stacked is not None: + assert layer.n_slices == len(layer.lora_bias_stacked) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. + buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink( + buffers, x, layer.lora_a_stacked, 1.0) + + if not current_platform.can_update_inplace(): + buffers = shrunk_buffers + + buffers = tensor_model_parallel_all_gather(buffers) + + lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand( + output, + buffers, + layer.lora_b_stacked, + layer.lora_bias_stacked, + layer.output_slices, + offset_start=0, + add_input=True) + + if not current_platform.can_update_inplace(): + output = lora_output + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output + + +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + """ + LoRA on top of ColumnParallelLinear layer. + LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. + """ + + def __init__(self, base_layer: ColumnParallelLinear) -> None: + super().__init__(base_layer) + # The base_layer type is ColumnParallelLinear or + # MergedColumnParallelLinear, their weight sharding logic is + # inconsistent when TP is greater than 1. + self.is_merged_col_linear = type( + base_layer) is MergedColumnParallelLinear + self.tp_size = get_tensor_model_parallel_world_size() + self.output_size = self.base_layer.output_size_per_partition + # There is only one LoRA layer + self.n_slices = 1 + + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + return lora_a + + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + # Applicable to cases where the base_layer is + # MergedColumnParallelLinear. + if self.is_merged_col_linear: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.output_size // 2 + offset = lora_b.shape[-1] // 2 + + left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) * + shard_size] + right_weight = lora_b[:, offset + tp_rank * shard_size:offset + + (tp_rank + 1) * shard_size] + lora_b = torch.cat([left_weight, right_weight], dim=1) + # Applicable to cases where the base_layer is + # ColumnParallelLinear. + else: + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_size + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + lora_b = lora_b[:, start_idx:end_idx] + return lora_b + + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + # TODO: Fix the slicing logic of bias. 
+ if bias is None: + return bias + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_size + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + + def forward( + self, input_: torch.Tensor + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: + """Forward of ColumnParallelLinear + + Args: + input_: Tensor whose last dimension is `input_size`. + + Returns: + - output + - bias + """ + bias = (self.base_layer.bias + if not self.base_layer.skip_bias_add else None) + + # Matrix multiply. + output_parallel = self.apply(input_, bias) + if self.base_layer.gather_output: + # All-gather across the partitions. + output = tensor_model_parallel_all_gather(output_parallel) + else: + output = output_parallel + + if not self.base_layer.return_bias: + return output + + output_bias = (self.base_layer.bias + if self.base_layer.skip_bias_add else None) + return output, output_bias + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is ColumnParallelLinear or ( + type(source_layer) is MergedColumnParallelLinear + and len(packed_modules_list) == 1) + + +class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): + """ColumnParallelLinear layer that is composed of 2 sublayers (slices) + packed together (e.g. gate_proj + up_proj -> gate_up_proj). + + This means we have 2 LoRAs, each applied to one half of the layer. + + Both slices must have the same size. + """ + + def __init__( + self, base_layer: Union[MergedColumnParallelLinear, + QKVParallelLinear]) -> None: + super().__init__(base_layer) + # There are two LoRA layers + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + # the output_sizes in MergedColumnParallelLinear is not sharded by tp + # we need to divide it by the tp_size to get correct slices size + output_sizes = self.base_layer.output_sizes + self.output_slices = tuple( + divide(output_size, self.tp_size) for output_size in output_sizes) + self.n_slices = len(self.output_slices) + self.output_ids = (self.tp_rank, ) * self.n_slices + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. 
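+
+        Unlike the base class, the lora_b (and optional bias) buffers are
+        allocated per output slice, since each packed projection is tracked
+        separately.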
+ """ + self.lora_config = lora_config + + lora_a_output_size_per_partition = ( + lora_config.max_lora_rank if not lora_config.fully_sharded_loras + else divide(lora_config.max_lora_rank, self.tp_size)) + + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_output_size_per_partition, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + output_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) for output_size in self.output_slices) + if lora_config.bias_enabled: + self.lora_bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for output_size in self.output_slices) + + def slice_lora_a( + self, lora_a: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: + return lora_a + + def slice_lora_b( + self, lora_b: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: + sliced_lora_b = [None] * self.n_slices + for i, (shard_id, shard_size) in enumerate( + zip(self.output_ids, self.output_slices)): + if (lora_b_i := lora_b[i]) is not None: + sliced_lora_b[i] = lora_b_i[:, + shard_size * shard_id:shard_size * + (shard_id + 1)] + return sliced_lora_b + + def slice_bias( + self, bias: list[Union[torch.Tensor, + None]]) -> list[Union[torch.Tensor, None]]: + for i, (shard_id, shard_size) in enumerate( + zip(self.output_ids, self.output_slices)): + if (bias_i := bias[i]) is not None: + bias[i] = bias_i[shard_size * shard_id:shard_size * + (shard_id + 1)] + return bias + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + lora_bias: Optional[torch.Tensor] = None, + ): + self.reset_lora(index) + + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + + for i in range(self.n_slices): + if (lora_a_i := lora_a[i]) is not None: + self.lora_a_stacked[i][ + index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( + lora_a_i.T, non_blocking=True) + if (lora_b_i := lora_b[i]) is not None: + self.lora_b_stacked[i][ + index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( + lora_b_i.T, non_blocking=True) + + if lora_bias is not None: + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked) + for i in range(self.n_slices): + if (lora_bias_i := lora_bias[i]) is not None: + self.lora_bias_stacked[i][index, + 0, :lora_bias_i.shape[0]].copy_( + lora_bias_i.T, + non_blocking=True) + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return (type(source_layer) is MergedColumnParallelLinear + and len(packed_modules_list) == 2) + + +class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): + """ + ColumnParallelLinear layer that is specifically designed for + qkv_proj. Certain models, such as chatglm3 and baichuan-7b, + only contains a single LoRA within their qkv_proj layer. + + During inference with Tensor Parallel, the weights of lora_b + must be accurately partitioned according to the respective ranks. + + Q slice may have different shape than K and V slices (which both have + the same shape). 
+    """
+
+    def __init__(self, base_layer: QKVParallelLinear) -> None:
+        super().__init__(base_layer)
+        self.q_proj_total_size = (self.base_layer.total_num_heads *
+                                  self.base_layer.head_size)
+        self.q_proj_shard_size = (self.base_layer.num_heads *
+                                  self.base_layer.head_size)
+        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
+                                   self.base_layer.head_size)
+        self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
+                                   self.base_layer.head_size)
+        # There is only one LoRA layer
+        self.n_slices = 1
+
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        self.q_shard_id = tp_rank
+        self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
+        lora_b_q = lora_b[:, self.q_proj_shard_size *
+                          self.q_shard_id:self.q_proj_shard_size *
+                          (self.q_shard_id + 1)]
+        k_offset = self.q_proj_total_size
+        lora_b_k = lora_b[:, k_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:k_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        v_offset = k_offset + self.kv_proj_total_size
+        lora_b_v = lora_b[:, v_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:v_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
+        return lora_b
+
+    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
+        bias_q = bias[self.q_proj_shard_size *
+                      self.q_shard_id:self.q_proj_shard_size *
+                      (self.q_shard_id + 1)]
+        k_offset = self.q_proj_total_size
+        bias_k = bias[k_offset +
+                      self.kv_proj_shard_size * self.kv_shard_id:k_offset +
+                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        v_offset = k_offset + self.kv_proj_total_size
+        bias_v = bias[v_offset +
+                      self.kv_proj_shard_size * self.kv_shard_id:v_offset +
+                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        # bias is 1-D here, so the three shards are concatenated along dim 0.
+        bias = torch.cat([bias_q, bias_k, bias_v], dim=0)
+        return bias
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(cls, source_layer: nn.Module,
+                          lora_config: LoRAConfig, packed_modules_list: list,
+                          model_config: Optional[PretrainedConfig]) -> bool:
+        return type(source_layer) is QKVParallelLinear and len(
+            packed_modules_list) == 1
+
+
+class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
+    """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
+    packed together in qkv proj fashion
+    (q_proj + k_proj + v_proj -> qkv_proj).
+
+    This means we have 3 LoRAs, each applied to one slice of the layer.
+
+    Q slice may have different shape than K and V slices (which both have
+    the same shape).
+    """
+
+    def __init__(self, base_layer: QKVParallelLinear) -> None:
+        super().__init__(base_layer)
+        # There are three LoRA layers.
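+        # Sharding sketch (hypothetical sizes): with tp_size 4 and
+        # num_kv_head_replicas 2, q shard ids follow the rank (0, 1, 2, 3)
+        # while kv shard ids repeat as 0, 0, 1, 1, i.e.
+        # kv_shard_id = tp_rank // num_kv_head_replicas, which is how the
+        # output_ids tuple below is built.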
+        self.n_slices = len(self.base_layer.output_sizes)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.q_proj_shard_size = (self.base_layer.num_heads *
+                                  self.base_layer.head_size)
+        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
+                                   self.base_layer.head_size)
+        self.q_shard_id = self.tp_rank
+        self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
+
+        self.output_slices = (
+            self.q_proj_shard_size,
+            self.kv_proj_shard_size,
+            self.kv_proj_shard_size,
+        )
+        self.output_ids = (
+            self.q_shard_id,
+            self.kv_shard_id,
+            self.kv_shard_id,
+        )
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """
+        The main reason for overriding this function is to handle inconsistent
+        weight dimensions in qkv lora.
+        """
+        super().create_lora_weights(max_loras, lora_config, model_config)
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return (type(source_layer) is QKVParallelLinear
+                and len(packed_modules_list) == 3)
+
+
+# These following layers are based on the tensor parallelism strategy given in
+# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
+# https://arxiv.org/abs/2311.03285.
+
+
+class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
+    """
+    Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`,
+    # their `lora_a` and `lora_b` have different sharding patterns. After
+    # completing the `lora_a` GEMM, a gather operation is performed.
+    # Therefore, the sharding of `lora_a` only needs to correspond with the
+    # gather operation.
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_size = self.lora_a_stacked[0].shape[2]
+        start_idx = tp_rank * shard_size
+        lora_a = lora_a[:, start_idx:start_idx + shard_size]
+        return lora_a
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
+
+
+class MergedColumnParallelLinearWithShardedLoRA(
+        MergedColumnParallelLinearWithLoRA):
+    """
+    Differs from MergedColumnParallelLinearWithLoRA by slicing the
+    LoRA A's also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    def slice_lora_a(
+        self, lora_a: list[Union[torch.Tensor, None]]
+    ) -> list[Union[torch.Tensor, None]]:
+        # NOTE: lora_a contains 2 subloras, and each sublora could be None.
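+        # Rank-dim slicing sketch (hypothetical numbers): with
+        # max_lora_rank 16 and tp_size 4, lora_a_stacked[0].shape[2] is
+        # 16 // 4 == 4, so rank r keeps rank-columns [4 * r, 4 * r + 4)
+        # of each sublora.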
+ output_shard_size = self.lora_a_stacked[0].shape[2] + output_start_idx = self.tp_rank * output_shard_size + lora_a = [ + lora_a[0][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[0] is not None else None, + lora_a[1][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[1] is not None else None, + ] + return lora_a + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) + + +class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): + """ + Differs from QKVParallelLinearWithLoRA by slicing the + LoRA A's also. + + Based on S-LoRA, slicing happens along the rank dim. + """ + + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.lora_a_stacked[0].shape[2] + start_idx = tp_rank * shard_size + lora_a = lora_a[:, start_idx:start_idx + shard_size] + return lora_a + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer(cls, source_layer: nn.Module, + lora_config: LoRAConfig, packed_modules_list: list, + model_config: Optional[PretrainedConfig]) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) + + +class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): + """ + Differs from MergedQKVParallelLinearWithLoRA by slicing the + LoRA A's also. + + Based on S-LoRA, slicing happens along the rank dim. + """ + + def slice_lora_a( + self, lora_a: list[Union[torch.Tensor, None]] + ) -> list[Union[torch.Tensor, None]]: + # NOTE: lora_a contains 3 subloras, and each sublora could be None. 
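+        # Under fully sharded LoRA all three stacks are allocated with the
+        # same rank-dim size, so the per-slice shard_size reads below are
+        # equivalent; they are kept per-slice in case the shapes ever
+        # diverge.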
+ shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] + start_idx = [self.tp_rank * shard_size[i] for i in range(3)] + lora_a = [ + lora_a[0][:, start_idx[0]:start_idx[0] + + shard_size[0]] if lora_a[0] is not None else None, + lora_a[1][:, start_idx[1]:start_idx[1] + + shard_size[1]] if lora_a[1] is not None else None, + lora_a[2][:, start_idx[2]:start_idx[2] + + shard_size[2]] if lora_a[2] is not None else None, + ] + return lora_a + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py new file mode 100644 index 0000000000000..db974147ccca7 --- /dev/null +++ b/vllm/lora/layers/logits_processor.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.platforms import current_platform + +from .base import BaseLayerWithLoRA + + +class LogitsProcessorWithLoRA(BaseLayerWithLoRA): + """ + LoRA wrapper for LogitsProcessor, with extra logic to handle the + application of the LoRA adapter and added LoRA vocabulary. + + Args: + base_layer: LogitsProcessor layer + hidden_size: hidden size of the model + dtype: data type of the model + device: device of the model + sharded_to_full_mapping: index mapping from sharded vocab to full vocab + received from base_layer.get_sharded_to_full_mapping(). If None, + no reindexing will be done. 
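+
+    Note: besides applying LoRA to the LM head projection, this layer also
+    appends logits for any LoRA-added vocabulary via embeddings_tensors
+    (see create_lora_weights and _get_logits below).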
+    """
+
+    def __init__(self, base_layer: LogitsProcessor, hidden_size: int,
+                 dtype: torch.dtype, device: torch.device,
+                 sharded_to_full_mapping: Optional[list[int]]) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.hidden_size = hidden_size
+        self.dtype = dtype
+        self.device = device
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.sharded_to_full_mapping = sharded_to_full_mapping
+
+    @property
+    def logits_as_input(self):
+        return self.base_layer.logits_as_input
+
+    @property
+    def vocab_size(self):
+        return self.base_layer.vocab_size
+
+    @property
+    def scale(self):
+        return self.base_layer.scale
+
+    @property
+    def soft_cap(self):
+        return self.base_layer.soft_cap
+
+    @property
+    def use_all_gather(self):
+        return self.base_layer.use_all_gather
+
+    @property
+    def org_vocab_size(self):
+        return self.base_layer.org_vocab_size
+
+    @property
+    def include_gpu_probs_tensor(self):
+        return self.base_layer.include_gpu_probs_tensor
+
+    @property
+    def should_modify_greedy_probs_inplace(self):
+        return self.base_layer.should_modify_greedy_probs_inplace
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        # TODO: Verify if this condition can be further relaxed
+        if not (32000 <= self.base_layer.vocab_size <= 257024):
+            raise ValueError("When using LoRA, vocab size must be "
+                             "32000 <= vocab_size <= 257024")
+        self.lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                lora_config.max_lora_rank,
+                self.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                # Pad for kernel compatibility
+                math.ceil(self.base_layer.vocab_size /
+                          lora_config.lora_vocab_padding_size) *
+                lora_config.lora_vocab_padding_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.embeddings_tensors = torch.full(
+            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
+            fill_value=float("-inf"),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        if self.sharded_to_full_mapping is not None:
+            self.sharded_to_full_mapping_gpu = torch.tensor(
+                self.sharded_to_full_mapping,
+                device=self.device,
+                dtype=torch.long)
+        else:
+            self.sharded_to_full_mapping_gpu = None
+
+    def reset_lora(self, index: int):
+        self.lora_a_stacked[index] = 0
+        self.lora_b_stacked[index] = 0
+        self.embeddings_tensors[index] = float("-inf")
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        self.reset_lora(index)
+        self.lora_a_stacked[index,
+                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+                                lora_a.T, non_blocking=True)
+        self.lora_b_stacked[index,
+                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                lora_b.T, non_blocking=True)
+        if embeddings_tensor is not None:
+            self.embeddings_tensors[
+                index,
+                :embeddings_tensor.shape[0],
+                :embeddings_tensor.shape[1],
+            ] = embeddings_tensor
+
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        # Get the logits for the next tokens.
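+        # Rough shape walk-through (assuming a 2-D hidden_states of
+        # [num_tokens, hidden_size]): the matmul plus TP gather below
+        # yields logits of [num_tokens, padded_vocab_size]; extra-vocab
+        # LoRA logits are then written into the columns starting at
+        # org_vocab_size.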
+ logits = lm_head.quant_method.apply(lm_head, hidden_states) + if embedding_bias is not None: + logits += embedding_bias + + # Gather logits for TP + logits = self.base_layer._gather_logits(logits) + + if logits is None: + return None + + if self.sharded_to_full_mapping_gpu is not None: + # Reindex full logits tensor to ensure 1:1 mapping between + # index and token_id + # Example for: + # org_vocab_size = 4 + # added_vocab_size = 2 + # pad_to_size = 8 + # tp_size = 2 + + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 4, -1, 2, 3, 5, -1] + + # Therefore, the mapping is expected to be: + # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex, + # we get: + # indices: [0, 1, 2, 3, 4, 5, 6, 7] + # token_id: [0, 1, 2, 3, 4, 5, -1, -1] + logits = logits[:, self.sharded_to_full_mapping_gpu] + + lora_logits = torch.empty( + self.embeddings_tensors.shape[0] + 1, + self.embeddings_tensors.shape[1], + hidden_states.shape[0], + dtype=self.embeddings_tensors.dtype, + device=self.embeddings_tensors.device, + ) + torch.matmul(self.embeddings_tensors, + hidden_states.T, + out=lora_logits[:-1]) + + neg_inf, pos_inf = current_platform.get_infinity_values( + lora_logits.dtype) + + lora_logits[-1] = neg_inf + lora_logits = lora_logits.mT + indices_padded = self.punica_wrapper.sampler_indices_padded + + if current_platform.is_tpu() or current_platform.is_xpu(): + indices_padded = indices_padded[:logits.size(0)] + + lora_logits = (lora_logits.reshape( + lora_logits.shape[0] * lora_logits.shape[1], + lora_logits.shape[2], + ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf, + posinf=pos_inf, + neginf=neg_inf)) + + logits[:, + self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + + lora_logits.shape[1]] = lora_logits + + lora_output: Optional[ + torch.Tensor] = self.punica_wrapper.add_lora_logits( + logits, hidden_states, self.lora_a_stacked, + self.lora_b_stacked, 1.0) + + if not current_platform.can_update_inplace(): + logits = lora_output + + # Remove paddings in vocab (if any). + logits = logits[:, :self.base_layer.vocab_size] + return logits + + def forward(self, *args, **kwargs): + return type(self.base_layer).forward(self, *args, **kwargs) + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + # Special handling for the LogitsProcessor. 
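+        # Returning False keeps this layer out of the generic replacement
+        # scan; the LoRA model wrapper is expected to construct
+        # LogitsProcessorWithLoRA explicitly instead.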
+ return False diff --git a/vllm/lora/layers/qkv_x_parallel_linear.py b/vllm/lora/layers/qkv_x_parallel_linear.py new file mode 100644 index 0000000000000..367482d0ee078 --- /dev/null +++ b/vllm/lora/layers/qkv_x_parallel_linear.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from .base import BaseLayerWithLoRA + + +#TODO: Implement this +class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA): + pass diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py new file mode 100644 index 0000000000000..db922a02d40b0 --- /dev/null +++ b/vllm/lora/layers/replicated_linear.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional, Union + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.layers.linear import ReplicatedLinear + +from .base_linear import BaseLinearLayerWithLoRA + + +class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: ReplicatedLinear) -> None: + super().__init__(base_layer, ) + # To ensure interface compatibility, set to 1 always. + self.tp_size = 1 + self.output_size = self.base_layer.output_size + self.n_slices = 1 + + def forward( + self, input_: torch.Tensor + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: + """Forward of ReplicatedLinearWithLoRA + + Args: + input_: Tensor whose last dimension is `input_size`. + + Returns: + - output + - bias + """ + bias = (self.base_layer.bias + if not self.base_layer.skip_bias_add else None) + + # Matrix multiply. + output = self.apply(input_, bias) + + output_bias = (self.base_layer.bias + if self.base_layer.skip_bias_add else None) + + if not self.base_layer.return_bias: + return output + + return output, output_bias + + # ReplicatedLinear should always be replaced, regardless of the fully + # sharded LoRAs setting, because it is, by definition, copied per GPU. 
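+    # Hence, unlike the parallel linear variants, can_replace_layer is not
+    # wrapped in _not_fully_sharded_can_replace or
+    # _fully_sharded_can_replace here.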
+ @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is ReplicatedLinear diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py new file mode 100644 index 0000000000000..bf1d9ae374f48 --- /dev/null +++ b/vllm/lora/layers/row_parallel_linear.py @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional, Union, cast + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_reduce) +# yapf: disable +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform + +from .base_linear import BaseLinearLayerWithLoRA +from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace + + +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: RowParallelLinear) -> None: + super().__init__(base_layer) + + self.tp_size = get_tensor_model_parallel_world_size() + # reset input_size + self.input_size = self.base_layer.input_size_per_partition + self.output_size = self.base_layer.output_size + + self.tp_rank = get_tensor_model_parallel_rank() + # There is only one LoRA layer. + self.n_slices = 1 + + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + + shard_size = self.input_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + lora_a = lora_a[start_idx:end_idx, :] + return lora_a + + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + return lora_b + + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + return bias + + def forward( + self, input_: torch.Tensor + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]: + """Forward of RowParallelLinear + + Args: + input_: tensor whose last dimension is `input_size`. If + `input_is_parallel` is set, then the last dimension + is `input_size // tp_size`. + + Returns: + - output + - bias + """ + # set up backprop all-reduce. + if self.base_layer.input_is_parallel: + input_parallel = input_ + else: + # TODO: simplify code below + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.base_layer.tp_size) + input_parallel = splitted_input[self.tp_rank].contiguous() + + # Matrix multiply. 
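+        # The inherited apply() is expected to add the LoRA contribution to
+        # this rank's partial product, so the optional all-reduce below
+        # sums base and LoRA outputs across ranks together.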
+ output_parallel = self.apply(input_parallel) + if self.base_layer.reduce_results and self.base_layer.tp_size > 1: + output_ = tensor_model_parallel_all_reduce(output_parallel) + else: + output_ = output_parallel + + if not self.base_layer.skip_bias_add: + output = (output_ + self.base_layer.bias + if self.base_layer.bias is not None else output_) + output_bias = None + else: + output = output_ + output_bias = self.base_layer.bias + + if not self.base_layer.return_bias: + return output + + return output, output_bias + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is RowParallelLinear + + + +# The following layer is based on the tensor parallelism strategy given in +# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, +# https://arxiv.org/abs/2311.03285. + +class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): + """ + Differs from RowParallelLinearWithLoRA by slicing the + LoRA B's also. + + Based on S-LoRA, slicing happens along the output dim. + This yields a combined partial sum from the row parallel base + layer and column partitioned output from the LoRA. + """ + + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + shard_size = self.lora_b_stacked[0].shape[2] + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + lora_b = lora_b[:, start_idx:end_idx] + return lora_b + + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], + self.lora_bias_stacked) + shard_size = self.lora_bias_stacked[0].shape[2] + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, + output.shape[-1]), output.shape + buffer = torch.zeros( + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink( + buffer, x, self.lora_a_stacked, 1.0) + if not current_platform.can_update_inplace(): + buffer = shrunk_buffer + + buffer = tensor_model_parallel_all_reduce(buffer) + + # following S-LoRA, allows the fusing of all_gather and all_reduce + # by adding the column partitioned lora output to a slice of output + # tensor, which is a partial sum due to row parallel. All that + # remains is a standard all_reduce. User should be aware though that + # the output is not the same as a normal row_parallel, it should be + # reduced before being used + # NOTE offset are based on the rank. 
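+        # e.g. (hypothetical): output_size 4096, tp_size 4 -> shard_size
+        # 1024, so rank r adds its column-partitioned LoRA output into the
+        # partial-sum output at offset_start = 1024 * r.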
+ shard_size = self.lora_b_stacked[0].shape[2] + offset_start = self.tp_rank * shard_size + lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand( + output, + buffer, + self.lora_b_stacked, + self.lora_bias_stacked, + self.output_slices, + offset_start=offset_start, + add_input=True, + ) + + if not current_platform.can_update_inplace(): + output = lora_output + + output = output.view(*out_orig_shape) + return output + + @classmethod + @_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + # specifying kwargs so they can be easily accessed in decorator + return super().can_replace_layer( + source_layer=source_layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + decorate=False, + ) diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py new file mode 100644 index 0000000000000..27dcd720fbdea --- /dev/null +++ b/vllm/lora/layers/utils.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import torch +import torch.nn as nn + +from vllm.adapter_commons.layers import AdapterMapping + + +@dataclass +class LoRAMapping(AdapterMapping): + is_prefill: bool = False + + +def _get_lora_device(base_layer: nn.Module) -> torch.device: + # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 + """Returns the device for where to place the LoRA tensors.""" + # unquantizedLinear + if hasattr(base_layer, "weight"): + return base_layer.weight.device + # Compressed Tensor + elif hasattr(base_layer, "weight_packed"): + return base_layer.weight_packed.device + # GPTQ/AWQ + elif hasattr(base_layer, "qweight"): + return base_layer.qweight.device + # HQQ marlin + elif hasattr(base_layer, "W_q"): + return base_layer.W_q.device + else: + raise ValueError(f"Unsupported base layer: {base_layer}") + + +def _not_fully_sharded_can_replace(can_replace): + """ + decorator which adds the condition of not using fully sharded loras + intended to wrap can_replace_layer() + """ + + def dec(*args, **kwargs): + decorate = kwargs.pop("decorate") if "decorate" in kwargs else True + condition = (not kwargs["lora_config"].fully_sharded_loras + if decorate else True) + return can_replace(*args, **kwargs) and condition + + return dec + + +def _fully_sharded_can_replace(can_replace): + """ + decorator which adds the condition of fully sharded loras + intended to wrap can_replace_layer() + """ + + def dec(*args, **kwargs): + return (can_replace(*args, **kwargs) + and kwargs["lora_config"].fully_sharded_loras) + + return dec diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py new file mode 100644 index 0000000000000..192e154fe56a6 --- /dev/null +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.platforms import current_platform + +from .base import BaseLayerWithLoRA + + +class 
VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): + + def __init__(self, base_layer: VocabParallelEmbedding) -> None: + super().__init__() + self.base_layer = base_layer + self.embeddings_slice: Optional[tuple[int, int]] + self.embeddings_weights: Optional[torch.Tensor] + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + + if self.base_layer.num_added_embeddings_per_partition > 0: + # We can start adding lora weights + self.embeddings_weights = self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition:self. + base_layer.num_org_embeddings_per_partition + + self.base_layer.num_added_embeddings_per_partition] + self.embeddings_slice = ( + self.base_layer.shard_indices.added_vocab_start_index - + self.base_layer.org_vocab_size, + self.base_layer.shard_indices.added_vocab_end_index - + self.base_layer.org_vocab_size) + self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition:].fill_(0) + else: + self.embeddings_slice = None + self.embeddings_weights = None + + self.embeddings_tensors = torch.zeros( + ( + max_loras, + lora_config.lora_extra_vocab_size, + self.base_layer.embedding_dim, + ), + dtype=self.base_layer.weight.dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked = torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked_2d = self.lora_a_stacked.view( + self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], + self.lora_a_stacked.shape[2], + ) + + def reset_lora(self, index: int): + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + self.embeddings_tensors[index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, + ): + self.reset_lora(index) + self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + if embeddings_tensor is not None: + self.embeddings_tensors[ + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) + if self.embeddings_slice is not None: + # TODO(yard1): Optimize this copy, we don't need to copy + # everything, just the modified part + embeddings = self.embeddings_tensors.view( + self.embeddings_tensors.shape[0] * + self.embeddings_tensors.shape[1], + self.embeddings_tensors.shape[2], + )[self.embeddings_slice[0]:self.embeddings_slice[1]] + assert self.embeddings_weights is not None + self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, + 1, 0) + + # NB: Don't use torch.narrow here. 
torch.narrow triggers some + # Dynamic Shape specialization in torch.compile + num_tokens = x.shape[0] + indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens] + indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens] + + full_lora_a_embeddings = F.embedding( + x + indices_1, + self.lora_a_stacked_2d, + ) + full_output = self.base_layer.forward(x + + (indices_0 * added_tokens_mask)) + + full_output_org = full_output + if full_output.ndim == 3: + full_output = full_output.view( + full_output.shape[0] * full_output.shape[1], -1) + if full_lora_a_embeddings.ndim == 3: + full_lora_a_embeddings = full_lora_a_embeddings.view( + full_lora_a_embeddings.shape[0] * + full_lora_a_embeddings.shape[1], + -1, + ) + + lora_output: Optional[ + torch.Tensor] = self.punica_wrapper.add_lora_embedding( + full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) + + if not current_platform.can_update_inplace(): + full_output = lora_output + + return full_output.view_as(full_output_org) + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is VocabParallelEmbedding + + @property + def weight(self): + return self.base_layer.weight diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 1fc214c12b5d1..2b05a2cf4d40c 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -13,21 +13,21 @@ from transformers import PretrainedConfig from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.fully_sharded_layers import ( - ColumnParallelLinearWithShardedLoRA, - MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA, - RowParallelLinearWithShardedLoRA) # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, + ColumnParallelLinearWithShardedLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA, + QKVParallelLinearWithShardedLoRA, ReplicatedLinearWithLoRA, RowParallelLinearWithLoRA, + RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA) from vllm.model_executor.layers.linear import LinearBase From 36cacd09584cb16668487105b04b3ae8193fa28b Mon Sep 17 00:00:00 2001 From: Robin <863579016@qq.com> Date: Wed, 10 Sep 2025 22:50:55 +0800 Subject: [PATCH 129/584] [Doc] Add documentation for GLM-4.5 series models: tool-calling and reasoning parser (#24589) Signed-off-by: WangErXiao <863579016@qq.com> --- docs/features/reasoning_outputs.md | 1 + docs/features/tool_calling.md | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index d9a785eb73fbe..d518e7f0cff43 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models: | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | | [Hunyuan A13B 
series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | +| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ | !!! note IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 5401603832273..a8c0db0a7ac13 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -311,6 +311,15 @@ Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` +### GLM-4.5 Models (`glm45`) + +Supported models: + +* `ZhipuAI/GLM-4.5` +* `ZhipuAI/GLM-4.5-Air` + +Flags: `--tool-call-parser glm45` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. From 2bef2d1405faa1d1bd80b5183336a068396ec02d Mon Sep 17 00:00:00 2001 From: Shiyan Deng <dsy842974287@meta.com> Date: Wed, 10 Sep 2025 08:02:01 -0700 Subject: [PATCH 130/584] [Logging] allow config logging stream (#24336) Signed-off-by: Shiyan Deng <dsy842974287@meta.com> --- vllm/envs.py | 5 +++++ vllm/logger.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8d199da45b082..184d339089a21 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -37,6 +37,7 @@ if TYPE_CHECKING: VLLM_CONFIGURE_LOGGING: int = 1 VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_PREFIX: str = "" + VLLM_LOGGING_STREAM: str = "ext://sys.stdout" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None VLLM_LOG_STATS_INTERVAL: float = 10. 
@@ -435,6 +436,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(), + # this is used for configuring the default logging stream + "VLLM_LOGGING_STREAM": + lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"), + # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), diff --git a/vllm/logger.py b/vllm/logger.py index a453aa308aaf2..2861e0f1686c4 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -20,6 +20,7 @@ VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX +VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM _FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " "[%(fileinfo)s:%(lineno)d] %(message)s") @@ -38,7 +39,7 @@ DEFAULT_LOGGING_CONFIG = { "class": "logging.StreamHandler", "formatter": "vllm", "level": VLLM_LOGGING_LEVEL, - "stream": "ext://sys.stdout", + "stream": VLLM_LOGGING_STREAM, }, }, "loggers": { From 08abfa78ec2fca59889045bb5aec41e383dac9bd Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Wed, 10 Sep 2025 20:20:46 +0300 Subject: [PATCH 131/584] [Bugfix] fix modelopt exclude_modules name mapping (#24178) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- .../layers/mamba/mamba_mixer2.py | 3 + .../layers/quantization/modelopt.py | 24 +++++-- vllm/model_executor/models/nemotron_h.py | 68 ++++++++++--------- 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index bb3fdd38dbef3..04ebdbca85e5d 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -291,6 +291,7 @@ class MambaMixer2(MambaBase, CustomOp): output_size=self.conv_dim, bias=use_conv_bias, quant_config=None, + prefix=f"{prefix}.conv1d", ) # unsqueeze to fit conv1d weights shape into the linear weights shape. 
# Can't do this in `weight_loader` since it already exists in @@ -303,6 +304,7 @@ class MambaMixer2(MambaBase, CustomOp): output_size=intermediate_size + self.conv_dim + self.num_heads, bias=use_bias, quant_config=quant_config, + prefix=f"{prefix}.in_proj", ) # - because in_proj is a concatenation of 3 weights, we @@ -402,6 +404,7 @@ class MambaMixer2(MambaBase, CustomOp): bias=use_bias, input_is_parallel=True, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.norm = Mixer2RMSNormGated(intermediate_size, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e140807879177..9b99931e7b43f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch from torch.nn import Module @@ -45,6 +45,9 @@ from vllm.utils import next_power_of_2 from vllm.utils.flashinfer import (flashinfer_scaled_fp4_mm, has_flashinfer, has_flashinfer_moe) +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + logger = init_logger(__name__) QUANT_ALGOS = ["FP8", "NVFP4"] @@ -63,7 +66,7 @@ class ModelOptFp8Config(QuantizationConfig): super().__init__() self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method - self.exclude_modules = exclude_modules + self.exclude_modules = exclude_modules or [] if is_checkpoint_fp8_serialized: logger.warning("Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change.") @@ -84,6 +87,11 @@ class ModelOptFp8Config(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.exclude_modules is not None: + self.exclude_modules = hf_to_vllm_mapper.apply_list( + self.exclude_modules) + @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: @@ -170,7 +178,9 @@ class ModelOptFp8Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if self.is_layer_excluded(prefix): + if (is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping) + or self.is_layer_excluded(prefix)): return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): @@ -615,6 +625,11 @@ class ModelOptNvFp4Config(QuantizationConfig): def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.exclude_modules is not None: + self.exclude_modules = hf_to_vllm_mapper.apply_list( + self.exclude_modules) + @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: @@ -763,7 +778,8 @@ class ModelOptNvFp4Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules) + if (is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping) or 
self.is_layer_excluded(prefix, self.exclude_modules)): return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8a563288cb4d6..da8628df1fe57 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -44,15 +44,16 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, SupportsQuant) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.models.utils import ( - AutoWeightsLoader, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory, + make_layers, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronHConfig @@ -426,38 +427,36 @@ class NemotronHModel(nn.Module): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - attb_params_mapping = { - "q_proj": "q", - "k_proj": "k", - "v_proj": "v", - } + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if "embeddings" in name: - name = name.replace("embeddings", "embed_tokens") + if "scale" in name: + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue - if "A_log" in name: - name = name.replace("A_log", "A") - loaded_weight = loaded_weight.to(torch.float32) + # load stacked params + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue - if "D" in name: - loaded_weight = loaded_weight.to(torch.float32) - - if "dt_bias" in name: - loaded_weight = loaded_weight.to(torch.float32) - - # load attn params - if any(proj in name for proj in ["q_proj", "k_proj", "v_proj"]): - weight_name = next(proj - for proj in ["q_proj", "k_proj", "v_proj"] - if proj in name) - name = name.replace(weight_name, "qkv_proj") param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, - attb_params_mapping[weight_name]) + weight_loader(param, loaded_weight, shard_id) + break + # load other params else: param = params_dict[name] @@ -471,6 +470,14 @@ class NemotronHModel(nn.Module): class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid, SupportsQuant): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={"backbone": "model"}, + orig_to_new_substr={ + "A_log": "A", + "embeddings": "embed_tokens" + }, + ) + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -622,10 +629,5 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - # update name in weights before passing to loader - updated_weights = [] - for name, loaded_weight in weights: - name = name.replace("backbone", "model") - updated_weights.append((name, loaded_weight)) loader = AutoWeightsLoader(self) - return loader.load_weights(updated_weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 40329496301a86044df7b57b8b6f57f5b71e701f Mon Sep 17 00:00:00 2001 From: Ming Yang <minos.future@gmail.com> Date: Wed, 10 Sep 2025 10:37:56 -0700 Subject: [PATCH 132/584] [Bugfix] Fix DeepEP config for DP4TP4 (#23619) Signed-off-by: Ming Yang <minos.future@gmail.com> --- .../layers/fused_moe/deepep_ht_prepare_finalize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 2bbe523b4bf98..2a3ae478f3eab 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -49,14 +49,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return torch.int64 def _get_dispatch_config(self) -> Optional[deep_ep.Config]: - if self.dp_size not in self.available_rank_configs: + if self.num_dispatchers_ not in self.available_rank_configs: return None - return deep_ep.Buffer.get_dispatch_config(self.dp_size) + return deep_ep.Buffer.get_dispatch_config(self.num_dispatchers_) def _get_combine_config(self) -> Optional[deep_ep.Config]: - if self.dp_size not in self.available_rank_configs: + if self.num_dispatchers_ not in self.available_rank_configs: return None - return deep_ep.Buffer.get_combine_config(self.dp_size) + return deep_ep.Buffer.get_combine_config(self.num_dispatchers_) def _do_dispatch( self, From 9fb74c27a78cdc8d8c6bd0a917d036b776784956 Mon Sep 17 00:00:00 2001 From: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:32:43 -0700 Subject: [PATCH 133/584] [Core] Support configuration parsing plugin (#24277) Signed-off-by: Xingyu Liu <charlotteliu12x@gmail.com> Signed-off-by: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- 
tests/transformers_utils/__init__.py | 0 .../test_config_parser_registry.py | 37 +++ vllm/config/__init__.py | 5 +- vllm/engine/arg_utils.py | 7 +- vllm/transformers_utils/config.py | 275 +++++++++++------- vllm/transformers_utils/config_parser_base.py | 20 ++ 6 files changed, 237 insertions(+), 107 deletions(-) create mode 100644 tests/transformers_utils/__init__.py create mode 100644 tests/transformers_utils/test_config_parser_registry.py create mode 100644 vllm/transformers_utils/config_parser_base.py diff --git a/tests/transformers_utils/__init__.py b/tests/transformers_utils/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/transformers_utils/test_config_parser_registry.py b/tests/transformers_utils/test_config_parser_registry.py new file mode 100644 index 0000000000000..13c654e05d2ac --- /dev/null +++ b/tests/transformers_utils/test_config_parser_registry.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from pathlib import Path +from typing import Optional, Union + +import pytest +from transformers import PretrainedConfig + +from vllm.transformers_utils.config import (get_config_parser, + register_config_parser) +from vllm.transformers_utils.config_parser_base import ConfigParserBase + + +@register_config_parser("custom_config_parser") +class CustomConfigParser(ConfigParserBase): + + def parse(self, + model: Union[str, Path], + trust_remote_code: bool, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + **kwargs) -> tuple[dict, PretrainedConfig]: + raise NotImplementedError + + +def test_register_config_parser(): + assert isinstance(get_config_parser("custom_config_parser"), + CustomConfigParser) + + +def test_invalid_config_parser(): + with pytest.raises(ValueError): + + @register_config_parser("invalid_config_parser") + class InvalidConfigParser: + pass diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index c8d531f12a2e4..aa26c0d05a336 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -419,7 +419,7 @@ class ModelConfig: `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ use_async_output_proc: bool = True """Whether to use async output processor.""" - config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value + config_format: Union[str, ConfigFormat] = "auto" """The format of the model config to load:\n - "auto" will try to load the config in hf format if available else it will try to load in mistral format.\n @@ -624,9 +624,6 @@ class ModelConfig: raise ValueError( "Sleep mode is not supported on current platform.") - if isinstance(self.config_format, str): - self.config_format = ConfigFormat(self.config_format) - hf_config = get_config(self.hf_config_path or self.model, self.trust_remote_code, self.revision, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4b5abbb35a3f..3fa1d0922f50c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -22,9 +22,9 @@ from typing_extensions import TypeIs, deprecated import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, - ConfigFormat, ConfigType, ConvertOption, - DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, EPLBConfig, + ConfigType, ConvertOption, DecodingConfig, + DetailedTraceModules, Device, DeviceConfig, + DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, 
LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, @@ -547,7 +547,6 @@ class EngineArgs: help="Disable async output processing. This may result in " "lower performance.") model_group.add_argument("--config-format", - choices=[f.value for f in ConfigFormat], **model_kwargs["config_format"]) # This one is a special case because it can bool # or str. TODO: Handle this in get_kwargs diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 95e4ed1ccf07f..d6ebcdf805252 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,13 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import enum import json import os import time from functools import cache, partial from pathlib import Path -from typing import Any, Callable, Optional, TypeVar, Union +from typing import Any, Callable, Literal, Optional, TypeVar, Union import huggingface_hub from huggingface_hub import get_safetensors_metadata, hf_hub_download @@ -27,6 +26,7 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs from vllm.logger import init_logger +from vllm.transformers_utils.config_parser_base import ConfigParserBase from vllm.transformers_utils.utils import check_gguf_file if envs.VLLM_USE_MODELSCOPE: @@ -100,10 +100,163 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { } -class ConfigFormat(str, enum.Enum): - AUTO = "auto" - HF = "hf" - MISTRAL = "mistral" +class HFConfigParser(ConfigParserBase): + + def parse(self, + model: Union[str, Path], + trust_remote_code: bool, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + **kwargs) -> tuple[dict, PretrainedConfig]: + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + code_revision=code_revision, + token=_get_hf_token(), + **kwargs, + ) + # Use custom model class if it's in our registry + model_type = config_dict.get("model_type") + if model_type is None: + model_type = "speculators" if config_dict.get( + "speculators_config") is not None else model_type + + if model_type in _CONFIG_REGISTRY: + config_class = _CONFIG_REGISTRY[model_type] + config = config_class.from_pretrained( + model, + revision=revision, + code_revision=code_revision, + token=_get_hf_token(), + **kwargs, + ) + else: + try: + kwargs = _maybe_update_auto_config_kwargs( + kwargs, model_type=model_type) + config = AutoConfig.from_pretrained( + model, + trust_remote_code=trust_remote_code, + revision=revision, + code_revision=code_revision, + token=_get_hf_token(), + **kwargs, + ) + except ValueError as e: + if (not trust_remote_code + and "requires you to execute the configuration file" + in str(e)): + err_msg = ( + "Failed to load the model config. 
If the model " + "is a custom model not yet available in the " + "HuggingFace transformers library, consider setting " + "`trust_remote_code=True` in LLM or using the " + "`--trust-remote-code` flag in the CLI.") + raise RuntimeError(err_msg) from e + else: + raise e + config = _maybe_remap_hf_config_attrs(config) + return config_dict, config + + +class MistralConfigParser(ConfigParserBase): + + def parse(self, + model: Union[str, Path], + trust_remote_code: bool, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + **kwargs) -> tuple[dict, PretrainedConfig]: + # This function loads a params.json config which + # should be used when loading models in mistral format + config_dict = _download_mistral_config_file(model, revision) + if (max_position_embeddings := + config_dict.get("max_position_embeddings")) is None: + max_position_embeddings = _maybe_retrieve_max_pos_from_hf( + model, revision, **kwargs) + config_dict["max_position_embeddings"] = max_position_embeddings + + from vllm.transformers_utils.configs.mistral import adapt_config_dict + + config = adapt_config_dict(config_dict) + + # Mistral configs may define sliding_window as list[int]. Convert it + # to int and add the layer_types list[str] to make it HF compatible + if ((sliding_window := getattr(config, "sliding_window", None)) + and isinstance(sliding_window, list)): + pattern_repeats = config.num_hidden_layers // len(sliding_window) + layer_types = sliding_window * pattern_repeats + config.layer_types = [ + "full_attention" if layer_type is None else "sliding_attention" + for layer_type in layer_types + ] + config.sliding_window = next(filter(None, sliding_window), None) + + return config_dict, config + + +_CONFIG_FORMAT_TO_CONFIG_PARSER: dict[str, type[ConfigParserBase]] = { + "hf": HFConfigParser, + "mistral": MistralConfigParser, +} + +ConfigFormat = Literal[ + "auto", + "hf", + "mistral", +] + + +def get_config_parser(config_format: str) -> ConfigParserBase: + """Get the config parser for a given config format.""" + if config_format not in _CONFIG_FORMAT_TO_CONFIG_PARSER: + raise ValueError(f"Unknown config format `{config_format}`.") + return _CONFIG_FORMAT_TO_CONFIG_PARSER[config_format]() + + +def register_config_parser(config_format: str): + + """Register a customized vllm config parser. + When a config format is not supported by vllm, you can register a customized + config parser to support it. + Args: + config_format (str): The config parser format name. + Examples: + + >>> from vllm.transformers_utils.config import (get_config_parser, + register_config_parser) + >>> from vllm.transformers_utils.config_parser_base import ConfigParserBase + >>> + >>> @register_config_parser("custom_config_parser") + ... class CustomConfigParser(ConfigParserBase): + ... def parse(self, + ... model: Union[str, Path], + ... trust_remote_code: bool, + ... revision: Optional[str] = None, + ... code_revision: Optional[str] = None, + ... **kwargs) -> tuple[dict, PretrainedConfig]: + ... 
raise NotImplementedError + >>> + >>> type(get_config_parser("custom_config_parser")) + <class 'CustomConfigParser'> + """ # noqa: E501 + + def _wrapper(config_parser_cls): + if config_format in _CONFIG_FORMAT_TO_CONFIG_PARSER: + logger.warning( + "Config format `%s` is already registered, and will be " + "overwritten by the new parser class `%s`.", config_format, + config_parser_cls) + if not issubclass(config_parser_cls, ConfigParserBase): + raise ValueError("The config parser must be a subclass of " + "`ConfigParserBase`.") + _CONFIG_FORMAT_TO_CONFIG_PARSER[config_format] = config_parser_cls + logger.info("Registered config parser `%s` with config format `%s`", + config_parser_cls, config_format) + return config_parser_cls + + return _wrapper _R = TypeVar("_R") @@ -350,7 +503,7 @@ def get_config( trust_remote_code: bool, revision: Optional[str] = None, code_revision: Optional[str] = None, - config_format: ConfigFormat = ConfigFormat.AUTO, + config_format: Union[str, ConfigFormat] = "auto", hf_overrides_kw: Optional[dict[str, Any]] = None, hf_overrides_fn: Optional[Callable[[PretrainedConfig], PretrainedConfig]] = None, @@ -363,20 +516,22 @@ def get_config( kwargs["gguf_file"] = Path(model).name model = Path(model).parent - if config_format == ConfigFormat.AUTO: + if config_format == "auto": try: if is_gguf or file_or_path_exists( model, HF_CONFIG_NAME, revision=revision): - config_format = ConfigFormat.HF + config_format = "hf" elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): - config_format = ConfigFormat.MISTRAL + config_format = "mistral" else: raise ValueError( "Could not detect config format for no config file found. " - "Ensure your model has either config.json (HF format) " - "or params.json (Mistral format).") + "With config_format 'auto', ensure your model has either" + "config.json (HF format) or params.json (Mistral format)." + "Otherwise please specify your_custom_config_format" + "in engine args for customized config parser") except Exception as e: error_message = ( @@ -395,92 +550,14 @@ def get_config( raise ValueError(error_message) from e - if config_format == ConfigFormat.HF: - kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE - config_dict, _ = PretrainedConfig.get_config_dict( - model, - revision=revision, - code_revision=code_revision, - token=_get_hf_token(), - **kwargs, - ) - # Use custom model class if it's in our registry - model_type = config_dict.get("model_type") - if model_type is None: - model_type = "speculators" if config_dict.get( - "speculators_config") is not None else model_type - - if model_type in _CONFIG_REGISTRY: - config_class = _CONFIG_REGISTRY[model_type] - config = config_class.from_pretrained( - model, - revision=revision, - code_revision=code_revision, - token=_get_hf_token(), - **kwargs, - ) - else: - try: - kwargs = _maybe_update_auto_config_kwargs( - kwargs, model_type=model_type) - config = AutoConfig.from_pretrained( - model, - trust_remote_code=trust_remote_code, - revision=revision, - code_revision=code_revision, - token=_get_hf_token(), - **kwargs, - ) - except ValueError as e: - if (not trust_remote_code - and "requires you to execute the configuration file" - in str(e)): - err_msg = ( - "Failed to load the model config. 
If the model " - "is a custom model not yet available in the " - "HuggingFace transformers library, consider setting " - "`trust_remote_code=True` in LLM or using the " - "`--trust-remote-code` flag in the CLI.") - raise RuntimeError(err_msg) from e - else: - raise e - config = _maybe_remap_hf_config_attrs(config) - - elif config_format == ConfigFormat.MISTRAL: - # This function loads a params.json config which - # should be used when loading models in mistral format - config_dict = _download_mistral_config_file(model, revision) - if (max_position_embeddings := - config_dict.get("max_position_embeddings")) is None: - max_position_embeddings = _maybe_retrieve_max_pos_from_hf( - model, revision, **kwargs) - config_dict["max_position_embeddings"] = max_position_embeddings - - from vllm.transformers_utils.configs.mistral import adapt_config_dict - - config = adapt_config_dict(config_dict) - - # Mistral configs may define sliding_window as list[int]. Convert it - # to int and add the layer_types list[str] to make it HF compatible - if ((sliding_window := getattr(config, "sliding_window", None)) - and isinstance(sliding_window, list)): - pattern_repeats = config.num_hidden_layers // len(sliding_window) - layer_types = sliding_window * pattern_repeats - config.layer_types = [ - "full_attention" if layer_type is None else "sliding_attention" - for layer_type in layer_types - ] - config.sliding_window = next(filter(None, sliding_window), None) - else: - supported_formats = [ - fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO - ] - raise ValueError( - f"Unsupported config format: {config_format}. " - f"Supported formats are: {', '.join(supported_formats)}. " - f"Ensure your model uses one of these configuration formats " - f"or specify the correct format explicitly.") - + config_parser = get_config_parser(config_format) + config_dict, config = config_parser.parse( + model, + trust_remote_code=trust_remote_code, + revision=revision, + code_revision=code_revision, + **kwargs, + ) # Special architecture mapping check for GGUF models if is_gguf: if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: @@ -914,7 +991,7 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: hf_config = get_config(model=model, trust_remote_code=trust_remote_code_val, revision=revision, - config_format=ConfigFormat.HF) + config_format="hf") if hf_value := hf_config.get_text_config().max_position_embeddings: max_position_embeddings = hf_value except Exception as e: diff --git a/vllm/transformers_utils/config_parser_base.py b/vllm/transformers_utils/config_parser_base.py new file mode 100644 index 0000000000000..c27177f74d4ba --- /dev/null +++ b/vllm/transformers_utils/config_parser_base.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional, Union + +from transformers import PretrainedConfig + + +class ConfigParserBase(ABC): + + @abstractmethod + def parse(self, + model: Union[str, Path], + trust_remote_code: bool, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + **kwargs) -> tuple[dict, PretrainedConfig]: + raise NotImplementedError From 09e68bce34175c6e6e57c7efcf2d1db216f5e8dd Mon Sep 17 00:00:00 2001 From: "rongfu.leng" <rongfu.leng@daocloud.io> Date: Thu, 11 Sep 2025 02:32:57 +0800 Subject: [PATCH 134/584] [Misc] update log level debug to warning when process port is used by (#24226) 
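Taken together, ConfigParserBase, the registry dict, and register_config_parser turn config loading into a plugin point: out-of-tree formats no longer require editing a ConfigFormat enum. A minimal sketch of registering and using a custom parser, assuming only the public API added above; the parser name "toy", the hard-coded config dict, and the model path are illustrative:

from pathlib import Path
from typing import Optional, Union

from transformers import PretrainedConfig

from vllm.transformers_utils.config import get_config, register_config_parser
from vllm.transformers_utils.config_parser_base import ConfigParserBase


@register_config_parser("toy")
class ToyConfigParser(ConfigParserBase):

    def parse(self,
              model: Union[str, Path],
              trust_remote_code: bool,
              revision: Optional[str] = None,
              code_revision: Optional[str] = None,
              **kwargs) -> tuple[dict, PretrainedConfig]:
        # A real parser would read the model's own on-disk format here.
        config_dict = {"model_type": "llama"}
        return config_dict, PretrainedConfig(**config_dict)


# Callers then select the parser by name instead of an enum member:
# config = get_config("/path/to/model", trust_remote_code=False,
#                     config_format="toy")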
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/entrypoints/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 4e852ba594930..887e277109240 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -95,7 +95,7 @@ async def serve_http(app: FastAPI, port = uvicorn_kwargs["port"] process = find_process_using_port(port) if process is not None: - logger.debug( + logger.warning( "port %s is used by process %s launched with command:\n%s", port, process, " ".join(process.cmdline())) logger.info("Shutting down FastAPI HTTP server.") From a0933c3bd67c1e210c7851690f22e79575f07d91 Mon Sep 17 00:00:00 2001 From: Thien Tran <gau.nernst@yahoo.com.sg> Date: Thu, 11 Sep 2025 03:33:41 +0800 Subject: [PATCH 135/584] [Bugfix] Enable FP8 KV cache for FlashInfer and Triton backend on non-sm100 GPUs (#24577) Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg> --- vllm/platforms/cuda.py | 4 ++++ vllm/v1/attention/backends/flashinfer.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index fc1a399d6f43f..f15c5594c27b2 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -530,6 +530,10 @@ class CudaPlatformBase(Platform): supported = flash_attn_supports_fp8() else: supported = True + elif attention_backend == "FLASHINFER": + supported = True + elif attention_backend == "TRITON_ATTN_VLLM_V1": + supported = cls.supports_fp8() return supported @classmethod diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index c7a565810b45b..4a491eeaafd10 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -202,7 +202,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): else: assert self.kv_cache_spec.dtype == self.model_config.dtype self.kv_cache_dtype = self.kv_cache_spec.dtype - self.q_data_type = self.kv_cache_dtype + + if supports_trtllm_attention()[0]: + self.q_data_type = self.kv_cache_dtype + else: + self.q_data_type = self.model_config.dtype self._cascade_wrapper = None # Wrapper for cascade attention From 4db44264047c2d30d9ba4eac55acaef821ae18b8 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 10 Sep 2025 13:53:21 -0700 Subject: [PATCH 136/584] [CI] Fail subprocess tests with root-cause error (#23795) Signed-off-by: Nick Hill <nhill@redhat.com> --- requirements/test.in | 1 + requirements/test.txt | 4 +- tests/async_engine/test_api_server.py | 26 +++++ tests/conftest.py | 10 ++ tests/utils.py | 120 +++++++++++++++++----- vllm/executor/ray_distributed_executor.py | 10 +- 6 files changed, 138 insertions(+), 33 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 1bbf0074a8881..744cfbe885278 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests +tblib # for pickling test exceptions timm >=1.0.17 # required for internvl and gemma3n-mm test torch==2.8.0 torchaudio==2.8.0 diff --git a/requirements/test.txt b/requirements/test.txt index 65ef7c3c64ba6..5eebdc788aa3d 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -137,7 +137,7 @@ contourpy==1.3.0 # via 
matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.3.0 +cupy-cuda12x==13.6.0 # via ray cycler==0.12.1 # via matplotlib @@ -1032,6 +1032,8 @@ tabledata==1.3.3 # via pytablewriter tabulate==0.9.0 # via sacrebleu +tblib==3.1.0 + # via -r requirements/test.in tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 90f63e7ea17db..07370a8803291 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copyreg import os import subprocess import sys @@ -10,6 +11,30 @@ from pathlib import Path import pytest import requests +import urllib3.exceptions + + +def _pickle_new_connection_error(obj): + """Custom pickler for NewConnectionError to fix tblib compatibility.""" + # Extract the original message by removing the "conn: " prefix + full_message = obj.args[0] if obj.args else "" + if ': ' in full_message: + # Split off the connection part and keep the actual message + _, actual_message = full_message.split(': ', 1) + else: + actual_message = full_message + return _unpickle_new_connection_error, (actual_message, ) + + +def _unpickle_new_connection_error(message): + """Custom unpickler for NewConnectionError.""" + # Create with None as conn and the actual message + return urllib3.exceptions.NewConnectionError(None, message) + + +# Register the custom pickle/unpickle functions for tblib compatibility +copyreg.pickle(urllib3.exceptions.NewConnectionError, + _pickle_new_connection_error) def _query_server(prompt: str, max_tokens: int = 5) -> dict: @@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str): uvicorn_process.terminate() +@pytest.mark.timeout(300) @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, distributed_executor_backend: str): """ diff --git a/tests/conftest.py b/tests/conftest.py index 1052aeb35bac7..0440e859fe02d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# ruff: noqa + +from tblib import pickling_support + +# Install support for pickling exceptions so that we can nicely propagate +# failures from tests running in a subprocess. +# This should be run before any custom exception subclasses are defined. +pickling_support.install() + import http.server import json import math diff --git a/tests/utils.py b/tests/utils.py index e47235002657d..16e1e60393290 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import contextlib import copy import functools import importlib @@ -13,7 +14,7 @@ import sys import tempfile import time import warnings -from contextlib import contextmanager, suppress +from contextlib import ExitStack, contextmanager, suppress from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -800,43 +801,106 @@ _P = ParamSpec("_P") def fork_new_process_for_each_test( - f: Callable[_P, None]) -> Callable[_P, None]: + func: Callable[_P, None]) -> Callable[_P, None]: """Decorator to fork a new process for each test function. See https://github.com/vllm-project/vllm/issues/7053 for more details. 
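    Exceptions raised in the forked child are serialized to a temporary
    file (the exception object itself when picklable, otherwise a
    formatted traceback string) and re-raised in the parent, so pytest
    reports the root cause instead of a bare non-zero exit code.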
""" - @functools.wraps(f) + @functools.wraps(func) def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: # Make the process the leader of its own process group # to avoid sending SIGTERM to the parent process os.setpgrp() from _pytest.outcomes import Skipped - pid = os.fork() - print(f"Fork a new process to run a test {pid}") - if pid == 0: - try: - f(*args, **kwargs) - except Skipped as e: - # convert Skipped to exit code 0 - print(str(e)) - os._exit(0) - except Exception: - import traceback - traceback.print_exc() - os._exit(1) + + # Create a unique temporary file to store exception info from child + # process. Use test function name and process ID to avoid collisions. + with tempfile.NamedTemporaryFile( + delete=False, + mode='w+b', + prefix=f"vllm_test_{func.__name__}_{os.getpid()}_", + suffix=".exc") as exc_file, ExitStack() as delete_after: + exc_file_path = exc_file.name + delete_after.callback(os.remove, exc_file_path) + + pid = os.fork() + print(f"Fork a new process to run a test {pid}") + if pid == 0: + # Parent process responsible for deleting, don't delete + # in child. + delete_after.pop_all() + try: + func(*args, **kwargs) + except Skipped as e: + # convert Skipped to exit code 0 + print(str(e)) + os._exit(0) + except Exception as e: + import traceback + tb_string = traceback.format_exc() + + # Try to serialize the exception object first + exc_to_serialize: dict[str, Any] + try: + # First, try to pickle the actual exception with + # its traceback. + exc_to_serialize = {'pickled_exception': e} + # Test if it can be pickled + cloudpickle.dumps(exc_to_serialize) + except (Exception, KeyboardInterrupt): + # Fall back to string-based approach. + exc_to_serialize = { + 'exception_type': type(e).__name__, + 'exception_msg': str(e), + 'traceback': tb_string, + } + try: + with open(exc_file_path, 'wb') as f: + cloudpickle.dump(exc_to_serialize, f) + except Exception: + # Fallback: just print the traceback. + print(tb_string) + os._exit(1) + else: + os._exit(0) else: - os._exit(0) - else: - pgid = os.getpgid(pid) - _pid, _exitcode = os.waitpid(pid, 0) - # ignore SIGTERM signal itself - old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) - # kill all child processes - os.killpg(pgid, signal.SIGTERM) - # restore the signal handler - signal.signal(signal.SIGTERM, old_signal_handler) - assert _exitcode == 0, (f"function {f} failed when called with" - f" args {args} and kwargs {kwargs}") + pgid = os.getpgid(pid) + _pid, _exitcode = os.waitpid(pid, 0) + # ignore SIGTERM signal itself + old_signal_handler = signal.signal(signal.SIGTERM, + signal.SIG_IGN) + # kill all child processes + os.killpg(pgid, signal.SIGTERM) + # restore the signal handler + signal.signal(signal.SIGTERM, old_signal_handler) + if _exitcode != 0: + # Try to read the exception from the child process + exc_info = {} + if os.path.exists(exc_file_path): + with contextlib.suppress(Exception), \ + open(exc_file_path, 'rb') as f: + exc_info = cloudpickle.load(f) + + if (original_exception := + exc_info.get('pickled_exception')) is not None: + # Re-raise the actual exception object if it was + # successfully pickled. 
+ assert isinstance(original_exception, Exception) + raise original_exception + + if (original_tb := exc_info.get("traceback")) is not None: + # Use string-based traceback for fallback case + raise AssertionError( + f"Test {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode}):\n{original_tb}" + ) from None + + # Fallback to the original generic error + raise AssertionError( + f"function {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode})") from None return wrapper diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 37c3fe59c65dd..78d0ee6c1e3fc 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -117,10 +117,12 @@ class RayDistributedExecutor(DistributedExecutorBase): self.driver_worker.execute_method) def shutdown(self) -> None: - logger.info( - "Shutting down Ray distributed executor. If you see error log " - "from logging.cc regarding SIGTERM received, please ignore because " - "this is the expected termination process in Ray.") + if logger: + # Somehow logger can be None here. + logger.info( + "Shutting down Ray distributed executor. If you see error log " + "from logging.cc regarding SIGTERM received, please ignore " + "because this is the expected termination process in Ray.") if hasattr(self, "forward_dag") and self.forward_dag is not None: self.forward_dag.teardown() import ray From 37e8182bfe155f9decd1277471f1b1b238d50c70 Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Wed, 10 Sep 2025 16:53:35 -0400 Subject: [PATCH 137/584] [v1] Add Whisper model support (encoder-decoder) (#21088) Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: NickLucche <nlucches@redhat.com> --- .buildkite/test-pipeline.yaml | 6 +- examples/offline_inference/encoder_decoder.py | 2 + .../encoder_decoder_multimodal.py | 3 + tests/encoder_decoder/test_e2e_correctness.py | 1 + .../openai/test_encoder_decoder.py | 1 + tests/models/language/generation/test_bart.py | 2 + .../multimodal/generation/test_whisper.py | 3 +- .../processing/test_tensor_schema.py | 1 + tests/models/test_initialization.py | 6 + tests/v1/test_oracle.py | 1 - vllm/attention/layers/cross_attention.py | 160 ++++++++++++++++++ vllm/config/__init__.py | 37 +++- vllm/model_executor/models/voxtral.py | 1 - vllm/model_executor/models/whisper.py | 74 +++++--- vllm/transformers_utils/configs/mistral.py | 1 + vllm/v1/attention/backends/cpu_attn.py | 4 +- vllm/v1/attention/backends/flash_attn.py | 3 +- vllm/v1/attention/backends/flashinfer.py | 4 +- vllm/v1/attention/backends/flex_attention.py | 3 +- vllm/v1/attention/backends/linear_attn.py | 2 +- vllm/v1/attention/backends/mamba_attn.py | 9 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 5 +- vllm/v1/attention/backends/short_conv_attn.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 3 +- vllm/v1/attention/backends/triton_attn.py | 4 +- vllm/v1/attention/backends/utils.py | 6 + vllm/v1/attention/backends/xformers.py | 3 +- vllm/v1/core/sched/scheduler.py | 32 ++-- vllm/v1/engine/processor.py | 5 - vllm/v1/worker/gpu_model_runner.py | 124 ++++++++++++-- vllm/v1/worker/utils.py | 13 +- 31 files changed, 429 insertions(+), 92 deletions(-) create mode 100644 vllm/attention/layers/cross_attention.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 75a9c4a22cb4d..06cb557e2ecf9 100644 --- a/.buildkite/test-pipeline.yaml 
+++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,6 @@ steps: - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py @@ -644,7 +643,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] @@ -818,7 +817,8 @@ steps: # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' # test sequence parallel - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index df6c1eaf4a21e..957db3c23b863 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -5,6 +5,8 @@ Demonstrate prompting of text-to-text encoder/decoder models, specifically BART and mBART. This script is refactored to allow model selection via command-line arguments. + +NOTE: This example is not yet supported in V1. """ import argparse diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 655f9f3fce7ae..35e9203d1caf0 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -5,6 +5,7 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
""" +import os import time from collections.abc import Sequence from dataclasses import asdict @@ -130,6 +131,8 @@ def run_mllama(): def run_whisper(): + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + engine_args = EngineArgs( model="openai/whisper-large-v3-turbo", max_model_len=448, diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 8b99d9d6e21fb..3cf4c377fb581 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -63,6 +63,7 @@ def clear_cache(): current_platform.is_cpu(), reason="CPU backend is not currently supported with encoder/decoder models" ) +@pytest.mark.skip(reason="bart not supported in V1") def test_encoder_decoder_e2e( hf_runner, vllm_runner, diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 9c2aef23e8772..75612962c95f7 100644 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -30,6 +30,7 @@ async def client(server): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="bart is not yet supported in V1") async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, prompt="Hello, my name is", diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index b4c771840196c..22ceb27869ac4 100644 --- a/tests/models/language/generation/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -178,6 +178,7 @@ def run_test( @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +@pytest.mark.skip(reason="bart not supported in V1") def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: @@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) +@pytest.mark.skip(reason="bart not supported in V1") def test_models_distributed(hf_runner, vllm_runner, example_encoder_decoder_prompts, distributed_executor_backend, model, dtype, diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 4a65e8c95204e..e0e9980b88339 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -122,8 +122,7 @@ def run_test( @pytest.mark.core_model -@pytest.mark.parametrize( - "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @create_new_process_for_each_test() def test_models(vllm_runner, model) -> None: run_test( diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index b678313752d65..3b87b669dbbe3 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -31,6 +31,7 @@ from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", + 
"Florence2ForConditionalGeneration": "not supported in V1", } ARCH_NEEDS_EXTRAS = [ "InternVLChatModel", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index aaa04f52f7794..792b93fbcd0f3 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -68,6 +68,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when # L4 supports FA3. m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") + if model_arch == "Florence2ForConditionalGeneration": + # An encoder-decoder model that's V0-only. Just skip it + # since V0 is about to be removed. + pytest.skip("Skipping Florence2ForConditionalGeneration") + if model_arch == "WhisperForConditionalGeneration": + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") LLM( model_info.default, tokenizer=model_info.tokenizer, diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 1f16e92f657e0..efa604dd6b5a8 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -10,7 +10,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine UNSUPPORTED_MODELS_V1 = [ - "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder ] diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py new file mode 100644 index 0000000000000..5f814b23888b3 --- /dev/null +++ b/vllm/attention/layers/cross_attention.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from copy import copy +from typing import Optional + +import numpy as np +import torch +from transformers import CacheConfig + +from vllm import envs +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadata, AttentionType) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.utils import cdiv +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, + subclass_attention_backend) +from vllm.v1.kv_cache_interface import CrossAttentionSpec + +logger = init_logger(__name__) + + +def _get_max_encoder_len(vllm_config: VllmConfig) -> int: + return MULTIMODAL_REGISTRY.get_encdec_max_encoder_len( + vllm_config.model_config) + + +def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray, + block_table_tensor: torch.Tensor, + kv_cache_spec: CrossAttentionSpec, + device: torch.device) -> torch.Tensor: + """Get cross-attention slot mappings.""" + + block_size = kv_cache_spec.block_size + slot_mappings = [] + + # Find indices with non-zero encoder sequence lengths + # The majority of parallel requests will be running the + # decoder, so this list should be relatively small. 
+ active_indices = np.nonzero(encoder_seq_lens)[0] + + for req_index in active_indices: + encoder_seq_len = encoder_seq_lens[req_index].item() + + # Calculate the number of blocks needed for this request + num_blocks_needed = cdiv(encoder_seq_len, block_size) + + # Get the block IDs for this request from the tensor + req_block_ids = block_table_tensor[req_index] + + # Get only the blocks we need (first num_blocks_needed blocks) + needed_block_ids = req_block_ids[:num_blocks_needed] + + # All needed blocks are allocated + i_values = torch.arange(encoder_seq_len, + dtype=torch.int64, + device=device) + block_indices = i_values // block_size + block_offsets = i_values % block_size + block_numbers = needed_block_ids[block_indices] + slot_mapping = block_numbers * block_size + block_offsets + + slot_mappings.append(slot_mapping) + + if slot_mappings: + return torch.cat(slot_mappings) + else: + return torch.empty(0, dtype=torch.int64, device=device) + + +@functools.lru_cache +def create_cross_attention_backend( + underlying_attn_backend: AttentionBackend, ) -> type[AttentionBackend]: + prefix = "CrossAttention_" + underlying_builder = underlying_attn_backend.get_builder_cls() + + class CrossAttentionBuilder(underlying_builder): # type: ignore + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False) -> AttentionMetadata: + new_metadata = copy(common_attn_metadata) + new_metadata.causal = False + max_encoder_len = _get_max_encoder_len(self.vllm_config) + new_metadata.max_seq_len = max_encoder_len + + new_metadata.seq_lens = torch.full( + (new_metadata.num_reqs, ), + max_encoder_len, + dtype=torch.int32, + device=self.device, + ) + new_metadata.seq_lens_cpu = torch.full( + (new_metadata.num_reqs, ), + max_encoder_len, + dtype=torch.int32, + device="cpu", + ) + new_metadata.slot_mapping = _get_cross_slot_mapping( + new_metadata.encoder_seq_lens, new_metadata.block_table_tensor, + self.kv_cache_spec, self.device) + return super().build(common_prefix_len, new_metadata, fast_build) + + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=CrossAttentionBuilder) + + return attn_backend + + +class CrossAttention(Attention): + """ + Cross-attention for encoder-decoder models. + Handles attention between decoder queries and encoder keys/values. 
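    The metadata builder above pins every request's key/value length to
    the model's fixed maximum encoder length and disables causal masking,
    so each decode step attends over the whole encoder sequence; slot
    mappings are materialized only for requests whose encoder input is
    scheduled in the current step.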
+ """ + + def __init__(self, + num_heads: int, + head_size: int, + scale: float, + cache_config: Optional[CacheConfig] = None, + attn_type: Optional[str] = None, + **kwargs): + dtype = torch.get_default_dtype() + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + if envs.VLLM_USE_V1: + underlying_attn_backend = get_attn_backend(head_size, dtype, + kv_cache_dtype, + block_size) + + attn_backend = create_cross_attention_backend( + underlying_attn_backend) + else: + # in v0 cross attention is handled inside the backends + attn_backend = None + + if attn_type is not None: + assert attn_type == AttentionType.ENCODER_DECODER, ( + "CrossAttention only supports AttentionType.ENCODER_DECODER") + + super().__init__(num_heads=num_heads, + head_size=head_size, + scale=scale, + cache_config=cache_config, + attn_backend=attn_backend, + attn_type=AttentionType.ENCODER_DECODER, + **kwargs) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index aa26c0d05a336..538d44d5337cc 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -8,6 +8,7 @@ import enum import hashlib import inspect import json +import os import textwrap import warnings from collections.abc import Mapping @@ -41,6 +42,7 @@ from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.platforms import current_platform from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -3509,16 +3511,33 @@ class VllmConfig: disable_chunked_prefill_reasons: list[str] = [] - if self.model_config and self.model_config.pooler_config: - pooling_type = self.model_config.pooler_config.pooling_type - if pooling_type is None or pooling_type.lower() != "last": + if self.model_config: + if self.model_config.pooler_config: + pooling_type = self.model_config.pooler_config.pooling_type + if pooling_type is None or pooling_type.lower() != "last": + disable_chunked_prefill_reasons.append( + "Only \"last\" pooling supports chunked " + "prefill and prefix caching; disabling both.") + elif self.model_config.is_encoder_decoder: + self.scheduler_config.max_num_encoder_input_tokens = \ + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + logger.debug( + "Encoder-decoder model detected: setting " + "`max_num_encoder_input_tokens` to encoder length (%s)", + self.scheduler_config.max_num_encoder_input_tokens) + self.scheduler_config.disable_chunked_mm_input = True disable_chunked_prefill_reasons.append( - "Only \"last\" pooling supports chunked " - "prefill and prefix caching; disabling both.") - elif not getattr(self.model_config.hf_config, "is_causal", True): - disable_chunked_prefill_reasons.append( - "Only models using causal attention supports chunked " - "prefill and prefix caching; disabling both.") + "Encoder-decoder models do not support chunked prefill nor" + " prefix caching; disabling both.") + if (self.model_config.architecture + == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") + != "spawn"): + logger.warning( + "Whisper is known to have issues with " + "forked workers. 
If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 9451670bc60c2..27e8b6fa55351 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -600,7 +600,6 @@ class VoxtralEncoderModel(nn.Module): self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "whisper_encoder"), - is_standalone_encoder=True, init_in_fp32=True) mel_filters = mel_filter_bank( num_frequency_bins=1 + self.config.window_size // 2, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 97e8cd6e76957..41ae7b129782d 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -15,6 +15,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids from vllm.attention import Attention, AttentionType from vllm.attention.layer import MultiHeadAttention +from vllm.attention.layers.cross_attention import CrossAttention from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig) from vllm.distributed import get_tensor_model_parallel_world_size @@ -43,7 +44,7 @@ from vllm.transformers_utils.processor import cached_get_processor from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, - SupportsTranscription, SupportsV0Only) + SupportsTranscription) from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, make_layers) @@ -124,6 +125,34 @@ class WhisperAudioInputs(TensorSchema): TensorShape("b", "nmb", "t")] +class WhisperEncoderAttention(MultiHeadAttention): + """Multi-headed attention for Whisper encoder with 2D tensor support.""" + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + ) -> torch.Tensor: + """ + Input shape: batch_size x seq_len x hidden_size + or seq_len x hidden_size + """ + is_2d = query.dim() == 2 + if is_2d: + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + + # Call the parent forward method + out = super().forward(query, key, value) + + if is_2d: + out = out.squeeze(0) + + return out + + class WhisperPositionalEmbedding(nn.Embedding): def __init__(self, num_positions: int, embedding_dim: int): @@ -144,7 +173,6 @@ class WhisperAttention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - standalone_encoder: bool = False, ): super().__init__() self.embed_dim = embed_dim @@ -180,14 +208,25 @@ class WhisperAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.out_proj", ) - if standalone_encoder: - self.attn = MultiHeadAttention( + if attn_type == AttentionType.ENCODER: + self.attn = WhisperEncoderAttention( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, ) - else: + elif self.attn_type == AttentionType.ENCODER_DECODER: + self.attn = CrossAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + attn_type=self.attn_type, + ) + else: # AttentionType.DECODER (regular decoder self-attention) self.attn = Attention( self.num_heads, self.head_dim, @@ -332,11 +371,7 @@ class WhisperMLP(nn.Module): class 
WhisperEncoderLayer(nn.Module): - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = "", - is_standalone_encoder: bool = False): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -350,7 +385,6 @@ class WhisperEncoderLayer(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - standalone_encoder=is_standalone_encoder, ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.mlp = WhisperMLP( @@ -446,12 +480,10 @@ class WhisperEncoder(nn.Module): *, vllm_config: VllmConfig, prefix: str = "", - is_standalone_encoder: bool = False, init_in_fp32: bool = False): super().__init__() config = vllm_config.model_config.hf_config embed_dim = config.d_model - self.is_standalone_encoder = is_standalone_encoder self.num_mel_bins = config.num_mel_bins self.max_source_positions = config.max_source_positions self.embed_scale = (math.sqrt(embed_dim) @@ -469,9 +501,7 @@ class WhisperEncoder(nn.Module): self.start_layer, self.end_layer, self.layers = make_layers( config.encoder_layers, lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config, - prefix=f"{prefix}.layers", - is_standalone_encoder= - is_standalone_encoder), + prefix=f"{prefix}.layers"), prefix=f"{prefix}.layers", ) self.layer_norm = nn.LayerNorm(config.d_model) @@ -752,7 +782,7 @@ class WhisperMultiModalProcessor( info=WhisperProcessingInfo, dummy_inputs=WhisperDummyInputsBuilder) class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, - SupportsMultiModal, SupportsV0Only): + SupportsMultiModal): packed_modules_mapping = { "self_attn.qkv_proj": [ "self_attn.q_proj", @@ -880,19 +910,17 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: - # TODO: This method does not obey the interface for SupportsMultiModal. - # Refactor this once encoder/decoder support is implemented in V1. + # Required as part of SupportsMultiModal interface. audio_input = self._parse_and_validate_audio_input(**kwargs) - return self.model.get_encoder_outputs(audio_input["input_features"]) + return [self.model.get_encoder_outputs(audio_input["input_features"])] def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: - # TODO: This method just returns the decoder sequence embeddings since - # Whisper does not have encoder text tokens. Refactor this once - # encoder/decoder support is implemented in V1. + # This method just returns the decoder sequence embeddings since + # Whisper does not have encoder text tokens. 
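# (Illustrative note on the V1 interface change above:
#  get_multimodal_embeddings now wraps the encoder output in a
#  one-element list, one entry per multimodal item, as SupportsMultiModal
#  expects; for a single 30-second Whisper clip that entry holds roughly
#  1500 encoder positions, i.e. 3000 mel frames halved by the conv stride.)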
return self.model.decoder.get_input_embeddings(input_ids) def _parse_and_validate_audio_input( diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index 8a9c660b882fd..5d9206e188322 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -157,6 +157,7 @@ def _remap_mistral_audio_args(config: dict) -> dict: encoder_attention_heads=encoder_args["n_heads"], vocab_size=encoder_args["vocab_size"], max_source_positions=encoder_args["max_source_positions"], + is_encoder_decoder=False, # Override WhisperConfig default ) } if quant_config: diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index ced8234a7b433..ab87f3bb4e3cb 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -317,8 +317,8 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device) -> None: - self.kv_cache_spec = kv_cache_spec - self.vllm_config = vllm_config + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + self.scheduler_config = vllm_config.scheduler_config # For reorder diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 3cc67acd04c6b..20f1904b3be6f 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -177,12 +177,11 @@ class FlashAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - self.vllm_config = vllm_config + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.model_config = vllm_config.model_config self.parallel_config = vllm_config.parallel_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config - self.device = device self.num_heads_q = self.model_config.get_num_attention_heads( self.parallel_config) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 4a491eeaafd10..51defdd40de18 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -163,11 +163,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - self.device = device - self.vllm_config = vllm_config + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config - self.kv_cache_spec = kv_cache_spec self._workspace_buffer = None self._prefill_wrapper = None # Wrapper for prefill/append self._decode_wrapper = None # Wrapper for decode (general shape) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index d5b1c15e68d0e..cb983494216a7 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -516,10 +516,11 @@ class FlexAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + self.model_config = vllm_config.model_config self.parallel_config = 
vllm_config.parallel_config self.cache_config = vllm_config.cache_config - self.device = device self.num_heads_q = self.model_config.get_num_attention_heads( self.parallel_config) diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py index ac0034b5dcf06..3ff201d83a79b 100644 --- a/vllm/v1/attention/backends/linear_attn.py +++ b/vllm/v1/attention/backends/linear_attn.py @@ -39,8 +39,8 @@ class LinearAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec def build(self, common_prefix_len: int, diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 07ef7cb69a160..9970331a6042c 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -22,12 +22,9 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec - self.device = device - self.vllm_config = vllm_config - self.layer_names = layer_names + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + assert isinstance(kv_cache_spec, MambaSpec) self.compilation_config = vllm_config.compilation_config self.decode_cudagraph_max_bs = min( self.vllm_config.scheduler_config.max_num_seqs, @@ -52,4 +49,4 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): m.max_query_len = 1 # decode-only - return self.build(0, m) \ No newline at end of file + return self.build(0, m) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 173a0a255e491..a4e2758bd311f 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -236,11 +236,11 @@ class AiterFlashAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - self.vllm_config = vllm_config + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + self.model_config = vllm_config.model_config self.parallel_config = vllm_config.parallel_config self.cache_config = vllm_config.cache_config - self.device = device self.num_heads_q = self.model_config.get_num_attention_heads( self.parallel_config) @@ -248,7 +248,6 @@ class AiterFlashAttentionMetadataBuilder( self.parallel_config) self.headdim = self.model_config.get_head_size() self.block_size = kv_cache_spec.block_size - self.kv_cache_spec = kv_cache_spec # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. 
self.aot_sliding_window: Optional[tuple[int, int]] = None diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index fcbf0c7b53560..f5ad65b02b4d4 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -45,8 +45,8 @@ class ShortConvAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec def build(self, common_prefix_len: int, diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index b96d957a150b5..10238f36455d2 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -165,7 +165,8 @@ class TreeAttentionMetadataBuilder( vllm_config: VllmConfig, device: torch.device, ): - self.kv_cache_spec = kv_cache_spec + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + self.block_size = kv_cache_spec.block_size spec_config = vllm_config.speculative_config diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 104cebb45d740..fe2894eaa0751 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -66,9 +66,9 @@ class TritonAttentionMetadataBuilder( def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - self.device = device + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + self.block_size = kv_cache_spec.block_size - self.kv_cache_spec = kv_cache_spec model_config = vllm_config.model_config self.num_heads_q = model_config.get_num_attention_heads( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 8e3d530fc1f98..009943fa743d8 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -72,6 +72,9 @@ class CommonAttentionMetadata: logits_indices_padded: Optional[torch.Tensor] = None num_logits_indices: Optional[int] = None + # Needed by CrossAttentionBuilder + encoder_seq_lens: Optional[np.ndarray] = None + @dataclass class UbatchSlice: @@ -193,6 +196,9 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): self.kv_cache_spec = kv_cache_spec + self.layer_names = layer_names + self.vllm_config = vllm_config + self.device = device @abstractmethod def build(self, diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index c59ff32cf7c28..a6ca334912353 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -206,8 +206,9 @@ class XFormersAttentionMetadataBuilder( vllm_config: VllmConfig, device: torch.device, ): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + assert XFORMERS_AVAILABLE - self.kv_cache_spec = kv_cache_spec self.block_size = kv_cache_spec.block_size self._num_decodes = 0 self._num_decode_tokens = 0 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ed7c16dc520fe..d1a6dd73e85cd 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -144,8 +144,8 @@ class Scheduler(SchedulerInterface): ) # NOTE(woosuk): Here, 
"encoder" includes the vision encoder (and - # projector if needed). Currently, we assume that the encoder also - # has the Transformer architecture (e.g., ViT). + # projector if needed) for MM models as well as encoder-decoder + # transformers. self.max_num_encoder_input_tokens = encoder_compute_budget # NOTE: For the models without encoder (e.g., text-only models), # the encoder cache will not be initialized because cache size is 0 @@ -775,15 +775,19 @@ class Scheduler(SchedulerInterface): # in the decoder's KV cache. continue - # The same encoder input has already been scheduled in the current - # step. - if request.mm_hashes[i] in mm_hashes_to_schedule: - continue + if not self.is_encoder_decoder: + # We are not using the encoder cache for encoder-decoder models, + # yet. + if request.mm_hashes[i] in mm_hashes_to_schedule: + # The same encoder input has already been scheduled in the + # current step. + continue - if self.encoder_cache_manager.check_and_update_cache(request, i): - # The encoder input is already computed and cached from a - # previous step. - continue + if self.encoder_cache_manager.check_and_update_cache( + request, i): + # The encoder input is already computed and cached from a + # previous step. + continue # If no encoder input chunking is allowed, we do not want to # partially schedule a multimodal item. If the scheduled range would @@ -1047,7 +1051,13 @@ class Scheduler(SchedulerInterface): mm_positions = request.mm_positions[input_id] start_pos = mm_positions.offset num_tokens = mm_positions.length - if start_pos + num_tokens <= request.num_computed_tokens: + if self.is_encoder_decoder and request.num_computed_tokens > 0: + # With Whisper, as soon as we've generated a single token, + # we know we're done with the encoder input. Cross Attention + # KVs have been calculated and cached already. + self.encoder_cache_manager.free_encoder_input( + request, input_id) + elif start_pos + num_tokens <= request.num_computed_tokens: # The encoder output is already processed and stored # in the decoder's KV cache. self.encoder_cache_manager.free_encoder_input( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8ce070e4d6fba..96e89eeac556a 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -325,7 +325,6 @@ class Processor: ) -> tuple[Optional[str], EngineCoreRequest]: # TODO(woosuk): Support pooling models. - # TODO(woosuk): Support encoder-decoder models. 
self._validate_lora(lora_request) self._validate_params(params, lora_request) if trace_headers is not None: @@ -384,10 +383,6 @@ class Processor: encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - # TODO: Impl encoder-decoder - if encoder_inputs is not None: - raise NotImplementedError - sampling_params = None pooling_params = None if isinstance(params, SamplingParams): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33f4d65a7a115..d3822251b5b67 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,12 +61,16 @@ from vllm.v1.attention.backends.utils import ( create_fast_prefill_custom_backend, reorder_batch_to_split_decodes_and_prefills) from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +# yapf conflicts with isort for this block +# yapf: disable from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, + CrossAttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, MambaSpec, SlidingWindowSpec) +# yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsLists, LogprobsTensors, ModelRunnerOutput, SamplerOutput) @@ -208,6 +212,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config) + if self.model_config.is_encoder_decoder: + # Maximum length of the encoder input, only for encoder-decoder + # models. + self.max_encoder_len = self.mm_registry.\ + get_encdec_max_encoder_len(model_config) + else: + self.max_encoder_len = 0 + # Sampler self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) @@ -265,7 +277,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # the block_sizes in the kv cache config. self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, + # We need to use the encoder length for encoder-decoer + # because of KV cache for cross-attention. + max_model_len=max(self.max_model_len, self.max_encoder_len), max_num_batched_tokens=self.max_num_tokens, device=self.device, pin_memory=self.pin_memory, @@ -798,6 +812,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): src=self.input_batch.prev_sampled_token_ids[ prev_common_req_indices_tensor, 0]) + def _get_encoder_seq_lens( + self, + scheduler_output: "SchedulerOutput", + kv_cache_spec: KVCacheSpec, + num_reqs: int, + ) -> Optional[np.ndarray]: + if not isinstance(kv_cache_spec, CrossAttentionSpec): + return None + + # Build encoder_seq_lens array mapping request indices to + # encoder lengths for inputs scheduled in this batch + encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32) + for req_id in scheduler_output.scheduled_encoder_inputs: + req_index = self.input_batch.req_id_to_index[req_id] + encoder_seq_lens[req_index] = self.max_encoder_len + + return encoder_seq_lens + def _prepare_inputs( self, scheduler_output: "SchedulerOutput", @@ -937,6 +969,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # in the same group share the same metadata. 
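# (Illustrative example of _get_encoder_seq_lens above: for a 4-request
#  batch where requests 0 and 2 have encoder inputs scheduled this step,
#  it returns np.array([1500, 0, 1500, 0], dtype=np.int32), taking
#  max_encoder_len = 1500 as Whisper's fixed encoder length; zeros mean no
#  cross-attention KV writes for that request in this step.)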
        for kv_cache_group_id, kv_cache_group_spec in enumerate(
                self.kv_cache_config.kv_cache_groups):
+            encoder_seq_lens = self._get_encoder_seq_lens(
+                scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)

            if isinstance(kv_cache_group_spec.kv_cache_spec,
                          EncoderOnlyAttentionSpec):
@@ -981,6 +1015,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                logits_indices_padded=logits_indices_padded,
                num_logits_indices=logits_indices.size(0),
                causal=True,
+                encoder_seq_lens=encoder_seq_lens,
            )

            if self.speculative_config and \
@@ -1253,10 +1288,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded])
        return logits_indices_padded

-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
+    def _batch_mm_kwargs_from_scheduler(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
+        """Batch multimodal kwargs from scheduled encoder inputs.
+
+        Args:
+            scheduler_output: The scheduler output containing scheduled encoder
+                inputs.
+
+        Returns:
+            A tuple of (mm_kwargs, mm_hashes_pos) where:
+            - mm_kwargs: List of multimodal kwargs items to be batched
+            - mm_hashes_pos: List of (mm_hash, position_info) tuples
+        """
        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
        if not scheduled_encoder_inputs:
-            return
+            return [], []

        # Batch the multi-modal inputs.
        mm_kwargs = list[MultiModalKwargsItem]()
        # list of tuple (mm_hash, position_info)
@@ -1270,6 +1319,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            mm_hashes_pos.append(
                (mm_hash, req_state.mm_positions[mm_input_id]))

+        return mm_kwargs, mm_hashes_pos
+
+    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
+        # Batch the multi-modal inputs using the helper method.
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)
+
+        if not mm_kwargs:
+            return
+
        # Batch mm inputs as much as we can: if a request in the batch has
        # multiple modalities or a different modality than the previous one,
        # we process it separately to preserve item order.
@@ -1360,6 +1419,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            mm_embeds.append(mm_embeds_item)
        return mm_embeds

+    def _extract_encoder_inputs(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> dict[str, torch.Tensor]:
+        """Extract encoder inputs for encoder-decoder models.
+
+        This method extracts multimodal input features from scheduled encoder
+        inputs and formats them for the encoder-decoder model forward pass.
+        """
+        # Batch the multi-modal inputs using the helper method.
+        mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output)
+
+        if not mm_kwargs:
+            return {}
+
+        # Group MM kwargs by modality and extract features
+        encoder_features = {}
+        for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
+                mm_kwargs,
+                device=self.device,
+                pin_memory=self.pin_memory,
+        ):
+            # Add the grouped features to encoder_features dict
+            # This allows the model to receive them as kwargs (e.g.,
+            # input_features=...)
+            encoder_features.update(mm_kwargs_group)
+
+        return encoder_features
+
    def get_model(self) -> nn.Module:
        # get raw model out of the cudagraph wrapper.
if isinstance(self.model, CUDAGraphWrapper): @@ -1631,7 +1719,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order - if self.supports_mm_inputs and get_pp_group().is_first_rank: + if (self.supports_mm_inputs and get_pp_group().is_first_rank + and not self.model_config.is_encoder_decoder): # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) @@ -1673,6 +1762,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True) + if (self.model_config.is_encoder_decoder + and scheduler_output.scheduled_encoder_inputs): + encoder_inputs = self._extract_encoder_inputs(scheduler_output) + model_kwargs.update(encoder_inputs) + return ( num_scheduled_tokens, num_input_tokens, @@ -2591,17 +2685,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens, remove_lora): - if self.supports_mm_inputs: + model_kwargs = self._init_model_kwargs(num_tokens) + if (self.supports_mm_inputs + and not self.model_config.is_encoder_decoder): input_ids = None inputs_embeds = self.inputs_embeds.gpu[:num_tokens] model_kwargs = { - **self._init_model_kwargs(num_tokens), + **model_kwargs, **self._dummy_mm_kwargs(num_reqs), } else: input_ids = self.input_ids.gpu[:num_tokens] inputs_embeds = None - model_kwargs = self._init_model_kwargs(num_tokens) if self.uses_mrope: positions = self.mrope_positions.gpu[:, :num_tokens] @@ -2823,7 +2918,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_budget = self.mm_budget assert mm_budget is not None - # TODO: handle encoder-decoder models once we support them. 
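        # Encoder-decoder models are now covered by the same multimodal
        # budget: a Whisper-style audio encoder is profiled through
        # mm_budget just like a vision encoder would be.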
if (encoder_budget := mm_budget.get_encoder_budget()) > 0: # NOTE: Currently model is profiled with a single non-text # modality with the max possible input tokens even when @@ -3170,7 +3264,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): "for more details.") self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, + max_model_len=max(self.max_model_len, self.max_encoder_len), max_num_batched_tokens=self.max_num_tokens, device=self.device, pin_memory=self.pin_memory, @@ -3443,7 +3537,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) for layer_name, attn_module in attn_layers.items(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: - attn_spec = EncoderOnlyAttentionSpec( + attn_spec: AttentionSpec = EncoderOnlyAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, head_size=attn_module.head_size, @@ -3485,7 +3579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.shared_kv_cache_layers[layer_name] = kv_tgt_layer continue - # TODO: Support other attention modules, e.g., cross-attention # TODO(lucas): move the attention specs into the model layers like # the attention backends if attn_module.attn_type == AttentionType.DECODER: @@ -3513,12 +3606,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): head_size=attn_module.head_size, dtype=self.kv_cache_dtype, use_mla=use_mla) + elif attn_module.attn_type == AttentionType.ENCODER_DECODER: + kv_cache_spec[layer_name] = CrossAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + use_mla=use_mla) elif attn_module.attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY): # encoder-only attention does not need KV cache. continue - elif attn_module.attn_type == AttentionType.ENCODER_DECODER: - raise NotImplementedError else: raise ValueError( f"Unknown attention type: {attn_module.attn_type}") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 6767804c71b9f..be05d02ff29fe 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -12,6 +12,7 @@ from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry +from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec @@ -269,7 +270,17 @@ def bind_kv_cache( # One typical case is encoder-decoder model, e.g., bart. # The cross attention and self attention in the same decoder layer # has different layer_name but the same layer_index. - raise NotImplementedError + + # TODO - analyze where runner_kv_caches is used and the right + # way to ensure it properly reflects multiple attention layers + # in the same decoder block. + if current_platform.is_cuda(): + # We know that the GPU runner is not impacted by this + # case. Some test code depends on runner_kv_caches, but + # not in a way that's impacted by ignoring this. 
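+            # The per-layer caches themselves remain correct: kv_caches
+            # is keyed by layer_name, so cross-attention and
+            # self-attention each keep their own cache; only the flat,
+            # index-based runner_kv_caches list is ambiguous here.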
+ pass + else: + raise NotImplementedError layer_name = layer_names[0] runner_kv_caches.append(kv_caches[layer_name]) From 9a161307f5f096c63ae4134c5055d87a36d224a8 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:59:55 -0400 Subject: [PATCH 138/584] [torch.compile][ROCm][V1] Enable attention output FP8 fusion for V1 attention backends (#19767) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- tests/compile/test_fusion_attn.py | 129 ++++++++++------ .../ops/chunked_prefill_paged_decode.py | 18 ++- vllm/attention/ops/prefix_prefill.py | 14 +- .../attention/ops/triton_unified_attention.py | 139 ++++++++++-------- vllm/compilation/backends.py | 3 +- vllm/compilation/fusion_attn.py | 31 ++-- .../layers/quantization/utils/w8a8_utils.py | 38 ++--- vllm/v1/attention/backends/triton_attn.py | 12 +- 8 files changed, 249 insertions(+), 135 deletions(-) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index dba668cfa16a6..6baf4bf83f499 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None @pytest.mark.parametrize( "model, quant_key", [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)]) -@pytest.mark.parametrize( - "use_triton_fa", [True, False] if current_platform.is_rocm() else [False]) +@pytest.mark.parametrize("use_triton_fa", [True, False]) @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") -@pytest.mark.skipif(not current_platform.is_cuda_alike(), - reason="Only test CUDA and ROCm") -def test_attention_fusion(example_prompts, monkeypatch, model: str, - quant_key: QuantKey, use_triton_fa: bool): +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="V0 attn quant fusion only on ROCm") +def test_attention_fusion_v0(example_prompts, monkeypatch, model: str, + quant_key: QuantKey, use_triton_fa: bool): # Clean Dynamo cache to avoid reusing other test cases # (for some reason the reset at the end is not enough) torch._dynamo.reset() @@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, backend="tests.compile.test_fusion_attn.backend_unfused", custom_ops=["+quant_fp8"], ) - vllm_config = VllmConfig(compilation_config=compile_config) + vllm_config = VllmConfig(compilation_config=compile_config, + model_config=ModelConfig( + model=model, + dtype=torch.bfloat16, + )) backend_unfused = TestBackend(NoOpEliminationPass(vllm_config)) llm = LLM(model, enforce_eager=True, compilation_config=compile_config, - gpu_memory_utilization=0.9, + gpu_memory_utilization=0.5, max_model_len=2048) sampling_params = SamplingParams(temperature=0.0, @@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, backend="tests.compile.test_fusion_attn.backend", custom_ops=["+quant_fp8"], ) - vllm_config = VllmConfig(compilation_config=compile_config) + vllm_config = VllmConfig(compilation_config=compile_config, + model_config=ModelConfig( + model=model, + dtype=torch.bfloat16, + )) # AttnFusionPass needs attention layers to be registered in config upon init # so we initialize it during compilation. 
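For orientation, the rewrite exercised by these tests can be summarized as
the following graph transformation (a schematic only; node names are
illustrative, not the pattern matcher's actual ops):

    # before fusion:
    #   attn_out = attention(q, k, v)                       # bf16/fp16 output
    #   out      = quant_fp8(attn_out, scale)               # separate kernel
    # after AttnFusionPass:
    #   out      = attention(q, k, v, output_scale=scale)   # writes fp8 directly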
@@ -102,7 +109,7 @@
    llm2 = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
-               gpu_memory_utilization=0.9,
+               gpu_memory_utilization=0.5,
               max_model_len=2048)

    # check support
@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
            cache_config=vllm_config.cache_config,
            prefix="model.layers.0.self_attn.attn",
        )
+        self.attn._k_scale = self.attn._k_scale.to(device)
+        self.attn._v_scale = self.attn._v_scale.to(device)

        self.block_size = 16

@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
            device=self.device,
        )

-    def build_attn_metadata(self, batch_size: int):
+    def build_attn_metadata(self, batch_size: int, use_hnd: bool):
        """Initialize attention metadata."""

        # Create common attn metadata
@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
        num_blocks = batch_size * max_blocks

        # Create dummy KV cache for FlashInfer TRTLLM
-        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
-        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
-        # Create kv_cache in HND layout and permute to NHD layout
-        # (later will be permuted back to HND layout in forward pass)
+        #   - NHD: [num_blocks, block_size, num_kv_heads, head_size]
+        #   - HND: [num_blocks, num_kv_heads, block_size, head_size]
        kv_cache = torch.zeros(num_blocks,
                               2,
                               self.num_kv_heads,
@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
                               self.head_size,
                               dtype=self.kv_cache_dtype,
                               device=self.device)
-        kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+        if current_platform.is_rocm():
+            # k/v as 1st dimension
+            if use_hnd:
+                kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
+            else:
+                kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
+        else:
+            # k/v as 2nd dimension
+            # Create kv_cache in HND layout and permute to NHD layout
+            # (later will be permuted back to HND layout in forward pass)
+            kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
        self.attn.kv_cache = [kv_cache]

        # Build attn metadata
@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
                                  out_dtype=attn_output.dtype)


-@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
+if current_platform.is_cuda():
+    MODELS = [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+               TestAttentionFp8StaticQuantPatternModel),
+              ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+               TestAttentionNvfp4QuantPatternModel)]
+    HEADS = [(64, 8), (40, 8)]
+elif current_platform.is_rocm():
+    MODELS = [("amd/Llama-3.1-8B-Instruct-FP8-KV",
+               TestAttentionFp8StaticQuantPatternModel)]
+    HEADS = [(32, 8), (40, 8)]
+else:
+    MODELS = []
+    HEADS = []
+
+
+@pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
 @pytest.mark.parametrize("head_size", [128])
-@pytest.mark.parametrize("batch_size", [7, 256, 533])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("model_name, model_class",
-                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-                           TestAttentionFp8StaticQuantPatternModel),
-                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-                           TestAttentionNvfp4QuantPatternModel)])
-@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
-@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+@pytest.mark.parametrize("batch_size",
+                         [7, 256, 533] if current_platform.is_cuda() else [8])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("model_name, model_class", MODELS)
+@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
+                         current_platform.is_cuda()
else [_Backend.ROCM_FLASH]) +@pytest.mark.parametrize( + "split_attention", + [False, True] if current_platform.is_rocm() else [False]) +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") -@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)), - reason="Only test on SM100(Blackwell)") +@pytest.mark.skipif(current_platform.is_cuda() + and not current_platform.is_device_capability((10, 0)), + reason="On CUDA only test on SM100(Blackwell)") +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only test ROCm or CUDA") def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, head_size: int, batch_size: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], - backend: _Backend, monkeypatch, dist_init): + backend: _Backend, split_attention: bool, + monkeypatch, dist_init): """Test AttentionStaticQuantPattern fusion pass""" monkeypatch.setenv("VLLM_USE_V1", "1") + if split_attention: + monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1") device = torch.device("cuda:0") torch.manual_seed(42) @@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, model_config=ModelConfig( model=model_name, max_model_len=2048, + dtype=dtype, ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( @@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, forward_ctx = get_forward_context() forward_ctx.attn_metadata = model_unfused.build_attn_metadata( - batch_size) + batch_size, use_hnd=split_attention) # Run model directly without compilation and fusion result_unfused = model_unfused(q, k, v) @@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, model_fused = model_fused.to(device) forward_ctx = get_forward_context() - forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size) + forward_ctx.attn_metadata = model_fused.build_attn_metadata( + batch_size, use_hnd=split_attention) # Create test backend with fusion passes enabled noop_pass = NoOpEliminationPass(vllm_config) @@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, assert model_compiled.attn._o_scale_float is None result_fused_1 = model_compiled(q, k, v) - # After the 1st round of the forward pass, output quant scale should be - # loaded into the attn layer's _o_scale_float, the 2nd round should - # reuse the loaded _o_scale_float - assert model_compiled.attn._o_scale_float is not None - result_fused_2 = model_compiled(q, k, v) - assert model_compiled.attn._o_scale_float is not None + if backend == _Backend.FLASHINFER: + # With the Flashinfer backend after the 1st round of the forward + # pass, output quant scale should be loaded into the attn layer's + # _o_scale_float, the 2nd round should reuse the loaded + # _o_scale_float + assert model_compiled.attn._o_scale_float is not None + result_fused_2 = model_compiled(q, k, v) + assert model_compiled.attn._o_scale_float is not None + + torch.testing.assert_close(result_unfused, + result_fused_2, + atol=1e-2, + rtol=1e-2) # Check attn fusion support quant_key = model_class.quant_key @@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \ "Attention should have output_block_scale after FP4 fusion" # noqa: 
E501 - # Check that results are closed + # Check that results are close torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2) - torch.testing.assert_close(result_unfused, - result_fused_2, - atol=1e-2, - rtol=1e-2) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index e5b90a8b27558..bf4b06512a3c1 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -15,6 +15,8 @@ from vllm.triton_utils import tl, triton from .prefix_prefill import context_attention_fwd +float8_info = torch.finfo(current_platform.fp8_dtype()) + @triton.jit def cdiv_fn(x, y): @@ -34,6 +36,7 @@ def kernel_paged_attention_2d( scale, # float32 k_scale, # float32 v_scale, # float32 + out_scale_inv, num_query_heads: tl.constexpr, # int num_queries_per_kv: tl.constexpr, # int num_queries_per_kv_padded: tl.constexpr, # int @@ -60,7 +63,9 @@ def kernel_paged_attention_2d( filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, # [num_seqs+1] USE_SINKS: tl.constexpr, # bool -): + USE_FP8: tl.constexpr, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max): seq_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -204,6 +209,9 @@ def kernel_paged_attention_2d( # epilogue acc = acc / L[:, None] + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) output_offset = (cur_batch_in_all_start_index * output_stride_0 + query_head_idx * output_stride_1) @@ -234,6 +242,7 @@ def chunked_prefill_paged_decode( alibi_slopes=None, sliding_window=None, sm_scale=None, + output_scale=None, # Optional tensor for sinks sinks=None, ): @@ -266,6 +275,7 @@ def chunked_prefill_paged_decode( sliding_window=sliding_window, sm_scale=sm_scale, skip_decode=True, + fp8_out_scale=output_scale, sinks=sinks, ) @@ -316,7 +326,7 @@ def chunked_prefill_paged_decode( tmp_output = torch.empty( size=(total_num_seq, num_query_heads, max_num_partitions, head_size), - dtype=output.dtype, + dtype=query.dtype, device=output.device, ) exp_sums = torch.empty( @@ -345,6 +355,7 @@ def chunked_prefill_paged_decode( kv_cache_dtype=kv_cache_dtype, k_scale=k_scale, v_scale=v_scale, + fp8_out_scale=output_scale, ) else: kernel_paged_attention_2d[( @@ -362,6 +373,8 @@ def chunked_prefill_paged_decode( scale=sm_scale, k_scale=k_scale, v_scale=v_scale, + out_scale_inv=1.0 / + output_scale if output_scale is not None else 1.0, num_query_heads=num_query_heads, num_queries_per_kv=num_queries_per_kv, num_queries_per_kv_padded=num_queries_per_kv_padded, @@ -388,4 +401,5 @@ def chunked_prefill_paged_decode( filter_by_query_len=True, query_start_len_ptr=query_start_loc, USE_SINKS=sinks is not None, + USE_FP8=output_scale is not None, ) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index a70db89cdb76e..7e5c2b6c62e9b 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -15,6 +15,7 @@ NUM_WARPS = 4 if current_platform.is_rocm() else 8 # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) +float8_info = torch.finfo(current_platform.fp8_dtype()) # Here's an example autotuner config for this kernel. 
This config does provide @@ -43,6 +44,7 @@ def _fwd_kernel(Q, sm_scale, k_scale, v_scale, + out_scale_inv, B_Start_Loc, B_Seqlen, x: tl.constexpr, @@ -82,8 +84,11 @@ def _fwd_kernel(Q, num_unroll_request: tl.constexpr, SKIP_DECODE: tl.constexpr, USE_SINKS: tl.constexpr, + USE_FP8: tl.constexpr, MAX_Q_LEN: tl.constexpr = 0, - MAX_CTX_LEN: tl.constexpr = 0): + MAX_CTX_LEN: tl.constexpr = 0, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max): cur_batch = tl.program_id(0) cur_head = tl.program_id(1) @@ -284,6 +289,9 @@ def _fwd_kernel(Q, off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] * stride_od) out_ptrs = Out + off_o + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) tl.store(out_ptrs, acc, mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len)) @@ -743,6 +751,7 @@ def context_attention_fwd(q, sliding_window=None, sm_scale=None, skip_decode=False, + fp8_out_scale=None, sinks=None): q_dtype_is_f32 = q.dtype is torch.float32 @@ -793,6 +802,7 @@ def context_attention_fwd(q, if alibi_slopes is not None: assert sinks is None, "Sinks arg is not supported with alibi" + assert fp8_out_scale is None, "FP8 output not supported with alibi" # need to reduce num. blocks when using fp32 # due to increased use of GPU shared memory # if q.dtype is torch.float32: @@ -870,6 +880,7 @@ def context_attention_fwd(q, sm_scale, k_scale, v_scale, + 1.0 / fp8_out_scale if fp8_out_scale is not None else 1.0, b_start_loc, b_seq_len, k_cache.shape[4], @@ -905,6 +916,7 @@ def context_attention_fwd(q, BLOCK_DMODEL_PADDED=Lk_padded, SLIDING_WINDOW=sliding_window, SKIP_DECODE=skip_decode, + USE_FP8=fp8_out_scale is not None, BLOCK_M=128, BLOCK_N=64, num_unroll_cache=4, diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 250e9b3890444..d2ad2f7e8d2aa 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -10,9 +10,11 @@ import torch from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.triton_utils import tl, triton logger = init_logger(__name__) +float8_info = torch.finfo(current_platform.fp8_dtype()) @triton.jit @@ -48,47 +50,51 @@ def find_seq_idx(query_start_len_ptr, target_idx, num_seqs, @triton.jit def kernel_unified_attention_2d( - output_ptr, # [num_tokens, num_query_heads, head_size] - query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - sink_ptr, # [num_query_heads] - block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] - seq_lens_ptr, # [num_seqs] - alibi_slopes_ptr, # [num_query_heads] - qq_bias_ptr, # [num_query_tokens, num_query_tokens] - scale, # float32 - k_scale, # float32 - v_scale, # float32 - softcap, # float32 - num_query_heads: tl.constexpr, # int - num_queries_per_kv: tl.constexpr, # int - block_table_stride: tl.int64, # int - query_stride_0: tl.int64, # int - query_stride_1: tl.int64, # int, should be equal to head_size - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - qq_bias_stride_0: tl.int64, # int - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - USE_ALIBI_SLOPES: tl.constexpr, # bool - USE_QQ_BIAS: tl.constexpr, # bool - 
USE_SOFTCAP: tl.constexpr, # bool - USE_SINKS: tl.constexpr, # bool - SLIDING_WINDOW: tl.constexpr, # int - stride_k_cache_0: tl.int64, # int - stride_k_cache_1: tl.int64, # int - stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.constexpr, # int - stride_v_cache_0: tl.int64, # int - stride_v_cache_1: tl.int64, # int - stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.constexpr, # int - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - num_seqs: tl.int32, - BLOCK_M: tl.constexpr, # int + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + out_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, ): q_block_global_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -281,6 +287,9 @@ def kernel_unified_attention_2d( # epilogue acc = acc / L[:, None] + if USE_FP8: + acc = acc * tl.load(out_scale) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) output_offset = (query_offset_0[:, None] * output_stride_0 + query_offset_1[:, None] * output_stride_1 + @@ -552,23 +561,27 @@ def kernel_unified_attention_3d( @triton.jit def reduce_segments( - output_ptr, # [num_tokens, num_query_heads, head_size] - segm_output_ptr, - #[num_tokens, num_query_heads, max_num_segments, head_size] - segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] - segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] - seq_lens_ptr, # [num_seqs] - num_seqs, # int - num_query_heads: tl.constexpr, # int - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - block_table_stride: tl.int64, # int - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int, must be power of 2 - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + output_ptr, # [num_tokens, num_query_heads, head_size] + 
segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + out_scale_inv, # float32 + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, ): query_token_idx = tl.program_id(0) query_head_idx = tl.program_id(1) @@ -624,6 +637,10 @@ def reduce_segments( # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + # write result output_offset = (query_token_idx * output_stride_0 + query_head_idx * output_stride_1 + @@ -649,6 +666,7 @@ def unified_attention( k_descale, v_descale, alibi_slopes=None, + output_scale=None, qq_bias=None, # Optional tensor for sinks sinks=None, @@ -707,6 +725,7 @@ def unified_attention( scale=softmax_scale, k_scale=k_descale, v_scale=v_descale, + out_scale=1 / output_scale if output_scale is not None else 1.0, softcap=softcap, num_query_heads=num_query_heads, num_queries_per_kv=num_queries_per_kv, @@ -736,6 +755,7 @@ def unified_attention( BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, BLOCK_M=BLOCK_M, + USE_FP8=output_scale is not None, ) else: # for initial version, NUM_SEGMENTS = 16 is chosen as a default @@ -819,6 +839,8 @@ def unified_attention( seq_lens_ptr=seqused_k, num_seqs=num_seqs, num_query_heads=num_query_heads, + out_scale_inv=1 / + output_scale if output_scale is not None else 1.0, output_stride_0=out.stride(0), output_stride_1=out.stride(1), block_table_stride=block_table.stride(0), @@ -828,4 +850,5 @@ def unified_attention( query_start_len_ptr=cu_seqlens_q, BLOCK_Q=BLOCK_Q, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + USE_FP8=output_scale is not None, ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3361b65a9b885..3cc0fc3106f5a 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -454,11 +454,12 @@ class VllmBackend: inductor_config = config.inductor_compile_config PASS_KEY = "post_grad_custom_post_pass" if PASS_KEY in inductor_config: - # Config should automatically wrap all inductor passes if isinstance(inductor_config[PASS_KEY], PostGradPassManager): + # PassManager already added to config, make sure it's correct assert (inductor_config[PASS_KEY].uuid() == self.post_grad_pass_manager.uuid()) else: + # Config should automatically wrap all inductor passes assert isinstance(inductor_config[PASS_KEY], InductorPass) self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) inductor_config[PASS_KEY] = self.post_grad_pass_manager diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 43c345695ef4e..e3677b3dd62d8 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -39,6 +39,7 @@ class AttentionQuantPattern(ABC): self, layer: Attention, quant_key: QuantKey, + dtype: torch.dtype, ): 
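        # dtype is the model's activation dtype (torch.bfloat16 or
        # torch.float16 here). Pattern example inputs must be created with
        # exactly this dtype, otherwise the traced graph does not match
        # and the fusion silently does not fire.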
self.layer = layer self.layer_name = layer.layer_name @@ -46,11 +47,16 @@ class AttentionQuantPattern(ABC): self.head_size = layer.head_size self.quant_key = quant_key self.quant_dtype = quant_key.dtype + self.dtype = dtype assert self.quant_key in QUANT_OPS, \ f"unsupported quantization scheme {self.quant_key}" self.QUANT_OP = QUANT_OPS[self.quant_key] + def empty(self, *args, **kwargs): + kwargs = {'dtype': self.dtype, 'device': "cuda", **kwargs} + return torch.empty(*args, **kwargs) + def empty_quant(self, *args, **kwargs): kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs} return torch.empty(*args, **kwargs) @@ -91,12 +97,13 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): def __init__( self, layer: Attention, + dtype: torch.dtype, symmetric: bool = True, ): quant_key = QuantKey(dtype=FP8_DTYPE, scale=kStaticTensorScale, symmetric=symmetric) - super().__init__(layer, quant_key) + super().__init__(layer, quant_key, dtype) def _register(self, pm_pass: PatternMatcherPass): @@ -139,10 +146,14 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size]) inputs = [ - empty_bf16(5, self.num_heads, self.head_size), # q - empty_bf16(5, self.num_heads, self.head_size), # k - empty_bf16(5, self.num_heads, self.head_size), # v - empty_bf16(5, self.num_heads, self.head_size), # attn_output + self.empty(5, self.num_heads, self.head_size, + dtype=self.dtype), # q + self.empty(5, self.num_heads, self.head_size, + dtype=self.dtype), # k + self.empty(5, self.num_heads, self.head_size, + dtype=self.dtype), # v + self.empty(5, self.num_heads, self.head_size, + dtype=self.dtype), # attn_output self.empty_quant(5, self.num_heads * self.head_size), # quant_output empty_fp32(1, 1) # scale @@ -165,8 +176,8 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): will be passed into Attention op as the `output_scale` argument. 
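    (After fusion the block scale is likewise passed to the attention op as
    the `output_block_scale` argument, so no separate quant node remains in
    the graph.)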
""" - def __init__(self, layer: Attention): - super().__init__(layer, kNvfp4Quant) + def __init__(self, layer: Attention, dtype: torch.dtype): + super().__init__(layer, kNvfp4Quant, dtype) def _register(self, pm_pass: PatternMatcherPass): @@ -255,12 +266,14 @@ class AttnFusionPass(VllmInductorPass): attn_layers = get_layers_from_vllm_config(config, Attention) for layer_name, layer in attn_layers.items(): - pattern_fp8 = AttentionFp8StaticQuantPattern(layer) + pattern_fp8 = AttentionFp8StaticQuantPattern( + layer, config.model_config.dtype) pattern_fp8.register_if_supported(self.patterns) if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): - pattern_nvfp4 = AttentionNvfp4QuantPattern(layer) + pattern_nvfp4 = AttentionNvfp4QuantPattern( + layer, config.model_config.dtype) pattern_nvfp4.register_if_supported(self.patterns) if len(attn_layers) == 0: diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 8f6b7f83d47f8..e89a5e643b0e5 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -171,10 +171,12 @@ def flashinfer_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor, bias=bias) -def rocm_per_tensor_w8a8_scaled_mm_impl( - qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype, - scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor) -> torch.Tensor: +def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor) -> torch.Tensor: from vllm.platforms.rocm import on_mi3xx if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0: @@ -190,10 +192,12 @@ def rocm_per_tensor_w8a8_scaled_mm_impl( return output -def rocm_per_tensor_w8a8_scaled_mm_fake( - qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype, - scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor) -> torch.Tensor: +def rocm_per_tensor_w8a8_scaled_mm_fake(qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor) -> torch.Tensor: return qinput.new_empty((*qinput.shape[:-1], weight.shape[1]), dtype=out_dtype) @@ -203,11 +207,10 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, output_shape: list) -> torch.Tensor: output = torch.ops.vllm.rocm_per_tensor_w8a8_scaled_mm_impl( - qinput, weight, out_dtype, scale_a, scale_b, bias, input_2d) - return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + qinput, weight, out_dtype, scale_a, scale_b, bias) + return torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape) direct_register_custom_op( @@ -224,7 +227,6 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, output_shape: list) -> torch.Tensor: output = torch._scaled_mm(qinput, weight, @@ -237,7 +239,7 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, if type(output) is tuple and len(output) == 2: output = output[0] - return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + return 
torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape) def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, @@ -245,7 +247,7 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, output_shape: list, + output_shape: list, **kwargs) -> torch.Tensor: # Note: Callers of this function should check USE_ROWWISE_TORCH_SCALED_MM # when using it. @@ -265,7 +267,7 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor, scale_b=scale_b.t(), bias=bias) - output = torch.narrow(output, 0, 0, input_2d.shape[0]) + output = torch.narrow(output, 0, 0, qinput.shape[0]) output = output.view(*output_shape) return output @@ -275,7 +277,6 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, out_dtype: torch.dtype, scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, - input_2d: torch.Tensor, output_shape: list, **kwargs) -> torch.Tensor: # Use unfused DQ due to limitations with scaled_mm @@ -305,8 +306,8 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor, if type(output) is tuple and len(output) == 2: output = output[0] # Unpad (undo num_token_padding) - output = torch.narrow(output, 0, 0, input_2d.shape[0]) - x_scale = torch.narrow(scale_a, 0, 0, input_2d.shape[0]) + output = torch.narrow(output, 0, 0, qinput.shape[0]) + x_scale = torch.narrow(scale_a, 0, 0, qinput.shape[0]) # DQ # C = sw * sx * (X * W) + bias @@ -430,7 +431,6 @@ class Fp8LinearOp: scale_a=x_scale, scale_b=weight_scale, bias=bias, - input_2d=input_2d, output_shape=output_shape) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index fe2894eaa0751..c294a5a73cbdd 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -15,6 +15,8 @@ from vllm.attention.ops.chunked_prefill_paged_decode import ( from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, kFp8StaticTensorSym) from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import (AttentionCGSupport, @@ -202,6 +204,9 @@ def use_aiter_unified_attention() -> bool: class TritonAttentionImpl(AttentionImpl): + def fused_output_quant_supported(self, quant_key: QuantKey): + return quant_key == kFp8StaticTensorSym + def __init__( self, num_heads: int, @@ -297,9 +302,9 @@ class TritonAttentionImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." 
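        # Fused output quantization: when output_scale is set, the Triton
        # kernels invoked below apply
        #     acc = clamp(acc * (1 / output_scale), fp8_min, fp8_max)
        # in their epilogue and store straight into the fp8 output buffer,
        # so only block-scale (NVFP4) output quant remains unsupported here.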
- if output_scale is not None or output_block_scale is not None: + if output_block_scale is not None: raise NotImplementedError( - "fused output quantization is not yet supported" + "fused block_scale output quantization is not yet supported" " for TritonAttentionImpl") if attn_metadata is None: @@ -394,6 +399,7 @@ class TritonAttentionImpl(AttentionImpl): alibi_slopes=self.alibi_slopes, sliding_window=self.sliding_window[0], sm_scale=self.scale, + output_scale=output_scale, sinks=self.sinks, ) @@ -419,6 +425,6 @@ class TritonAttentionImpl(AttentionImpl): k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), sinks=self.sinks, - ) + output_scale=output_scale) return output From b5e383cd8b62975dec605bed05e22d273c296c7a Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Wed, 10 Sep 2025 14:33:13 -0700 Subject: [PATCH 139/584] [gpt-oss] raise error for flashinfer backend without trtllm (#24482) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- vllm/v1/attention/backends/flashinfer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 51defdd40de18..afa5a7c14d4d0 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.window_left = self.global_hyperparameters.window_left self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap self.has_sinks = self.global_hyperparameters.has_sinks - + if self.has_sinks and not supports_trtllm_attention()[0]: + raise NotImplementedError( + "FlashInfer backend currently does not support attention " + "sinks, please use trtllm on blackwell or flash attention on " + "earlier GPUs.") # Preparing persistent buffers (device-side) self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, dtype=torch.int32, @@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.q_data_type, is_prefill=False, has_sinks=self.has_sinks) - + if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm): + raise NotImplementedError( + "FlashInfer backend currently does not support attention " + "sinks, please use trtllm on blackwell or flash attention on " + "earlier GPUs.") attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, q_data_type=self.q_data_type, From fba785658170e8723ccba762654f30c54b0217a4 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 10 Sep 2025 18:03:17 -0400 Subject: [PATCH 140/584] [Perf] Warmup FlashInfer attention during startup (#23439) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Matthew Bonanni <mbonanni001@gmail.com> --- vllm/model_executor/warmup/kernel_warmup.py | 28 ++++++++++++++++++- vllm/v1/attention/backends/flashinfer.py | 16 ----------- vllm/v1/worker/gpu_model_runner.py | 31 +++++++++++++++++++-- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 761172e4d3616..e42e34ebc77b9 100644 --- 
a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs +from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.platforms import current_platform from vllm.utils.deep_gemm import is_deep_gemm_supported @@ -19,6 +20,8 @@ if TYPE_CHECKING: from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_worker import Worker +logger = init_logger(__name__) + def kernel_warmup(worker: "Worker"): # Deep GEMM warmup @@ -30,10 +33,33 @@ def kernel_warmup(worker: "Worker"): max_tokens = worker.scheduler_config.max_num_batched_tokens deep_gemm_warmup(model, max_tokens) - # FlashInfer autotune for Blackwell (SM 10.0) GPUs + # FlashInfer kernel autotune for Blackwell (SM 10.0) GPUs if has_flashinfer() and current_platform.is_device_capability(100): flashinfer_autotune(worker.model_runner) + # FlashInfer attention warmup + # Only warmup if the model has FlashInfer attention groups + # and is not a pooling model + def _is_flashinfer_backend(backend): + try: + return backend.get_name() == "FLASHINFER_VLLM_V1" + except NotImplementedError: + return False + + if not worker.model_runner.is_pooling_model and all( + _is_flashinfer_backend(group.backend) + for groups in worker.model_runner.attn_groups for group in groups): + logger.info("Warming up FlashInfer attention.") + # Warmup with mixed batch containing both prefill and decode tokens + # This is to warm up both prefill and decode attention kernels + worker.model_runner._dummy_run( + num_tokens=16, + skip_eplb=True, + is_profile=True, + force_attention=True, + create_mixed_batch=True, + ) + def flashinfer_autotune(runner: "GPUModelRunner") -> None: """ diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index afa5a7c14d4d0..9e05cc8ab2f18 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -549,22 +549,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) return attn_metadata - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with FlashInfer. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "FlashInfer only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." - - m.max_query_len = 1 # decode-only - - return self.build(0, m) - def use_cascade_attention(self, *args, **kwargs) -> bool: if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype: # TODO: The cascade wrapper currently does not support setting diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d3822251b5b67..b75756fbdae85 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2578,6 +2578,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): uniform_decode: bool = False, skip_eplb: bool = False, is_profile: bool = False, + create_mixed_batch: bool = False, remove_lora: bool = True, ) -> tuple[torch.Tensor, torch.Tensor]: """ @@ -2596,6 +2597,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): uniform_decode: If True, the batch is a uniform decode batch. 
skip_eplb: If True, skip EPLB state update. is_profile: If True, this is a profile run. + create_mixed_batch: If True, create a mixed batch with both decode + (1 token) and prefill (multiple tokens) requests. remove_lora: If False, dummy LoRAs are not destroyed after the run """ assert cudagraph_runtime_mode in { @@ -2627,7 +2630,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # has num_tokens in total. assert num_tokens <= self.scheduler_config.max_num_batched_tokens max_num_reqs = self.scheduler_config.max_num_seqs - if uniform_decode: + if create_mixed_batch: + assert not uniform_decode + # Create mixed batch: + # first half decode tokens, second half one prefill + num_decode_tokens = num_tokens // 2 + num_prefill_tokens = num_tokens - num_decode_tokens + num_reqs = num_decode_tokens + 1 + + # Create decode requests (1 token each) followed by prefill request + num_scheduled_tokens_list = [1] * num_decode_tokens + [ + num_prefill_tokens + ] + # Note: Overriding max_query_len to be the prefill tokens + max_query_len = num_prefill_tokens + elif uniform_decode: + assert not create_mixed_batch num_reqs = cdiv(num_tokens, max_query_len) assert num_reqs <= max_num_reqs, \ "Do not capture num_reqs > max_num_reqs for uniform batch" @@ -2652,8 +2670,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL: attn_metadata = {} - # Make sure max_model_len is used at the graph capture time. - self.seq_lens.np[:num_reqs] = self.max_model_len + if create_mixed_batch: + # In the mixed batch mode (used for FI warmup), we use + # shorter sequence lengths to run faster. + # TODO(luka) better system for describing dummy batches + seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] + else: + # Make sure max_model_len is used at the graph capture time. 
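+                # seq_lens is a plain int here; the numpy slice assignment
+                # below broadcasts it across all num_reqs entries.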
+ seq_lens = self.max_model_len + self.seq_lens.np[:num_reqs] = seq_lens self.seq_lens.np[num_reqs:] = 0 self.seq_lens.copy_to_gpu() From dcb28a332bfb7c7b1991ab8c6694233956a4eafe Mon Sep 17 00:00:00 2001 From: Hanjie Qiu <50634613+hjjq@users.noreply.github.com> Date: Wed, 10 Sep 2025 18:31:10 -0400 Subject: [PATCH 141/584] [Kernel] Flashinfer MLA (trtllm-gen) decode kernel integration (#21078) Signed-off-by: hjjq <hanjieq@nvidia.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- .buildkite/test-pipeline.yaml | 3 +- .../test_cutlass_mla_decode.py | 0 .../attention/test_flashinfer_mla_decode.py | 123 ++++++++++++++++++ vllm/engine/arg_utils.py | 1 + vllm/platforms/cuda.py | 15 +++ vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/mla/common.py | 3 + .../attention/backends/mla/flashinfer_mla.py | 110 ++++++++++++++++ 8 files changed, 255 insertions(+), 1 deletion(-) rename tests/kernels/{ => attention}/test_cutlass_mla_decode.py (100%) create mode 100644 tests/kernels/attention/test_flashinfer_mla_decode.py create mode 100644 vllm/v1/attention/backends/mla/flashinfer_mla.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 06cb557e2ecf9..51fc9c46e66df 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -729,7 +729,8 @@ steps: # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py similarity index 100% rename from tests/kernels/test_cutlass_mla_decode.py rename to tests/kernels/attention/test_cutlass_mla_decode.py diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py new file mode 100644 index 0000000000000..02225432f77fc --- /dev/null +++ b/tests/kernels/attention/test_flashinfer_mla_decode.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +import torch.nn.functional as F +from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla +from torch import Tensor + +from vllm.platforms import current_platform + +FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024 + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="FlashInfer MLA Requires compute capability of 10 or above.", + allow_module_level=True) + + +def ref_mla( + out: Tensor, # (bs, num_heads, v_head_dim) + query: Tensor, # (bs, num_heads, head_dim) + kv_cache: Tensor, # (num_blocks, block_size, head_dim) + scale: float, + block_tables: Tensor, # (bs, max_num_blocks) + seq_lens: Tensor, # (bs,) +): + bs, num_heads, v_head_dim = out.shape + head_dim = query.shape[2] + + for i in range(bs): + # gather and flatten KV-cache + kv = kv_cache[ + block_tables[i]] # (max_num_blocks, block_size, head_dim) + kv = kv.view(1, -1, + head_dim)[:, 
:seq_lens[i]] # (1, seq_len, head_dim) + v = kv[:, :, :v_head_dim] + + q = query[i].view(num_heads, 1, head_dim) + o = F.scaled_dot_product_attention(q, + kv, + v, + scale=scale, + enable_gqa=True) + out[i] = o.view(num_heads, v_head_dim) + + return out + + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("bs", [1, 2, 4, 16]) +@pytest.mark.parametrize("block_size", [32, 64]) +def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int): + torch.set_default_device('cuda') + torch.manual_seed(42) + + # Deepseek R1 config + num_heads = 128 + kv_lora_rank = 512 + qk_nope_head_dim = 128 + qk_rope_head_dim = 64 + qk_head_dim = kv_lora_rank + qk_rope_head_dim + scale = (qk_nope_head_dim + qk_rope_head_dim)**-0.5 + + MAX_SEQ_LEN = 1024 + + seq_lens = [torch.randint(2, MAX_SEQ_LEN, (1, )).item() for _ in range(bs)] + seq_lens[-1] = MAX_SEQ_LEN + max_seq_len = max(seq_lens) + seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32) + + # Generate block tables with random but unique block IDs + # From https://github.com/flashinfer-ai/flashinfer/pull/1222 + blocks_per_seq = (seq_lens_tensor + block_size - 1) // block_size + max_num_blocks_per_seq = max(blocks_per_seq.max().item(), 4) + total_blocks_needed = sum(blocks_per_seq) + # Get random unique IDs for all blocks + all_block_ids = torch.randperm(total_blocks_needed) + + block_id = 0 + block_tables = torch.zeros( + (bs, max_num_blocks_per_seq), + dtype=torch.int32, + ) + + # Populate block tables and track block assignments + block_id = 0 + for i in range(bs): + num_blocks_needed = blocks_per_seq[i] + block_tables[i, :num_blocks_needed] = all_block_ids[block_id:block_id + + num_blocks_needed] + block_id += num_blocks_needed + + kv_cache = torch.randn(block_tables.numel(), block_size, + qk_head_dim).to(dtype) + q = torch.randn(bs, num_heads, qk_head_dim).to(dtype) + + out_ref = q.new_zeros(bs, num_heads, kv_lora_rank) + ref_mla(out_ref, q, kv_cache, scale, block_tables, seq_lens_tensor) + + workspace_buffer = torch.zeros( + FLASHINFER_WORKSPACE_BUFFER_SIZE, + dtype=torch.uint8, + device=q.device, + ) + # Flashinfer MLA expects the query to be of shape + # (bs, q_len_per_request, num_heads, qk_head_dim), + # where q_len_per_request is the MTP query length (=1 without MTP) + q = q.unsqueeze(1) + + out_ans = trtllm_batch_decode_with_kv_cache_mla( + query=q, + kv_cache=kv_cache.unsqueeze(1), + workspace_buffer=workspace_buffer, + qk_nope_head_dim=qk_nope_head_dim, + kv_lora_rank=kv_lora_rank, + qk_rope_head_dim=qk_rope_head_dim, + block_tables=block_tables, + seq_lens=seq_lens_tensor, + max_seq_len=max_seq_len, + bmm1_scale=scale, + ) + out_ans = out_ans.squeeze(1) + torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3fa1d0922f50c..87d90f5147cd8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1504,6 +1504,7 @@ class EngineArgs: "FLASH_ATTN_MLA", "FLASHINFER", "FLASHINFER_VLLM_V1", + "FLASHINFER_MLA", "ROCM_AITER_MLA", "TORCH_SDPA_VLLM_V1", "FLEX_ATTENTION", diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index f15c5594c27b2..dc94cfcc3ce87 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -228,6 +228,8 @@ class CudaPlatformBase(Platform): use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or ( selected_backend is None and cls.is_device_capability(100) and block_size == 128) + use_flashinfermla = (selected_backend == _Backend.FLASHINFER_MLA + and 
cls.has_device_capability(100)) use_flashmla = selected_backend in [ _Backend.FLASHMLA, _Backend.FLASHMLA_VLLM_V1 ] or (selected_backend is None and is_flashmla_supported()[0]) @@ -252,6 +254,19 @@ class CudaPlatformBase(Platform): else: logger.warning( "Cutlass MLA backend is only supported on V1 engine") + if use_flashinfermla: + if use_v1: + from vllm.v1.attention.backends.utils import ( + set_kv_cache_layout) + set_kv_cache_layout("HND") + logger.info_once( + "Using FlashInfer MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "flashinfer_mla.FlashInferMLABackend") + else: + logger.warning( + "FlashInfer MLA backend is only supported on V1 engine" + ) if use_flashmla: if block_size != 64: logger.warning( diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0cea49eece42e..ab0eaa82ef201 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -51,6 +51,7 @@ class _Backend(enum.Enum): TORCH_SDPA_VLLM_V1 = enum.auto() FLASHINFER = enum.auto() FLASHINFER_VLLM_V1 = enum.auto() + FLASHINFER_MLA = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 TRITON_MLA_VLLM_V1 = enum.auto() CUTLASS_MLA = enum.auto() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 226bc436058d8..440a206eb485b 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -381,6 +381,7 @@ class MLACommonMetadata(Generic[D]): num_reqs: int max_query_len: int + max_seq_len: int num_actual_tokens: int # Number of tokens excluding padding. query_start_loc: torch.Tensor @@ -644,6 +645,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): num_reqs = common_attn_metadata.num_reqs num_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len + max_seq_len = common_attn_metadata.max_seq_len # Note(simon): be careful about the CPU <> GPU memory movement in this # function. 
We should avoid GPU -> CPU sync as much as possible because @@ -830,6 +832,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): attn_metadata = self.metadata_cls( num_reqs=common_attn_metadata.num_reqs, max_query_len=common_attn_metadata.max_query_len, + max_seq_len=max_seq_len, num_actual_tokens=num_tokens, query_start_loc=query_start_loc, slot_mapping=slot_mapping, diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py new file mode 100644 index 0000000000000..71eb9e0ce70e6 --- /dev/null +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional, Union + +import torch +from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla + +from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, + is_quantized_kv_cache) +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import (MLACommonBackend, + MLACommonImpl, + MLACommonMetadata) + +logger = init_logger(__name__) + +FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024 + + +class FlashInferMLABackend(MLACommonBackend): + + @staticmethod + def get_name() -> str: + return "FLASHINFER_MLA" + + @staticmethod + def get_impl_cls() -> type["FlashInferMLAImpl"]: + return FlashInferMLAImpl + + +g_fi_workspace = torch.zeros( + FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, + dtype=torch.uint8, + device="cuda", +) + + +class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float], + attn_type: str, + kv_sharing_target_layer_name: Optional[str], + # MLA Specific Arguments + **mla_args) -> None: + super().__init__(num_heads, head_size, scale, num_kv_heads, + alibi_slopes, sliding_window, kv_cache_dtype, + logits_soft_cap, attn_type, + kv_sharing_target_layer_name, **mla_args) + + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] + if any(unsupported_features): + raise NotImplementedError( + "FlashInferMLAImpl does not support one of the following: " + "alibi_slopes, sliding_window, logits_soft_cap") + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashInferMLAImpl") + + if is_quantized_kv_cache(self.kv_cache_dtype): + raise NotImplementedError( + "FlashInferMLA V1 with FP8 KV cache not yet supported") + + self._workspace_buffer = g_fi_workspace + + def _forward_decode( + self, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: MLACommonMetadata, + layer: AttentionLayer, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert kv_c_and_k_pe_cache.numel() > 0 + assert attn_metadata.decode is not None + + if isinstance(q, tuple): + q_nope, q_pe = q + q = torch.cat([q_nope, q_pe], dim=-1) + + # trtllm API requires extra dimension q_len_per_request for MTP + q = q.unsqueeze(1) + + o = trtllm_batch_decode_with_kv_cache_mla( + query=q, + kv_cache=kv_c_and_k_pe_cache.unsqueeze(1), + workspace_buffer=self._workspace_buffer, + qk_nope_head_dim=self.qk_nope_head_dim, + kv_lora_rank=self.kv_lora_rank, + qk_rope_head_dim=self.qk_rope_head_dim, + 
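+            # NOTE: block_tables/seq_lens come from the decode-only
+            # metadata, and max_seq_len is the field this patch adds to
+            # MLACommonMetadata for the trtllm-gen kernel.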
block_tables=attn_metadata.decode.block_table, + seq_lens=attn_metadata.decode.seq_lens, + max_seq_len=attn_metadata.max_seq_len, + bmm1_scale=self.scale, + ) + + # TODO: Return LSE pending support from Flashinfer API: + # https://github.com/flashinfer-ai/flashinfer/pull/1566 + return o, None From cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d Mon Sep 17 00:00:00 2001 From: Jonathan Berkhahn <jaberkha@us.ibm.com> Date: Wed, 10 Sep 2025 15:41:12 -0700 Subject: [PATCH 142/584] [Misc] Make timeout passable in init_distributed_environment (#24522) Signed-off-by: jberkhahn <jaberkha@us.ibm.com> --- vllm/distributed/parallel_state.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 602bcebc017dd..ef229299b6848 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -29,6 +29,7 @@ import weakref from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass +from datetime import timedelta from multiprocessing import shared_memory from typing import Any, Callable, Optional, Union from unittest.mock import patch @@ -978,13 +979,12 @@ def set_custom_all_reduce(enable: bool): _ENABLE_CUSTOM_ALL_REDUCE = enable -def init_distributed_environment( - world_size: int = -1, - rank: int = -1, - distributed_init_method: str = "env://", - local_rank: int = -1, - backend: str = "nccl", -): +def init_distributed_environment(world_size: int = -1, + rank: int = -1, + distributed_init_method: str = "env://", + local_rank: int = -1, + backend: str = "nccl", + timeout: Optional[timedelta] = None): logger.debug( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, @@ -1020,7 +1020,8 @@ def init_distributed_environment( backend=backend, init_method=distributed_init_method, world_size=world_size, - rank=rank) + rank=rank, + timeout=timeout) # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 From 5931b7e5d9acd4fd9eb42d56086c379fa2e2014e Mon Sep 17 00:00:00 2001 From: Alexandre Marques <almarque@redhat.com> Date: Wed, 10 Sep 2025 22:13:56 -0400 Subject: [PATCH 143/584] [Models][Quantization] Add quantization configuration update in Voxtral model (#24122) Signed-off-by: Alexandre Marques <almarque@redhat.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- vllm/model_executor/models/llama.py | 19 +++++-- vllm/model_executor/models/voxtral.py | 73 +++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 96530562b072c..f8ea2111fed57 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -626,9 +626,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): loaded_weight: torch.Tensor, ) -> tuple[str, torch.Tensor]: - def permute(w: torch.Tensor, n_heads: int): + def permute(w: torch.Tensor, n_heads: int, attn_out: int): attn_in = self.config.head_dim * n_heads - attn_out = self.config.hidden_size return w.view(n_heads, attn_in // n_heads // 2, 2, attn_out).transpose(1, 2).reshape(attn_in, attn_out) @@ -637,12 +636,24 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): modules = 
name.split(".") # rotary embeds should be sliced + # If using quantized model in mistral format, + # quantization scales (qscale_weight) also need to be sliced if "wk" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, - self.config.num_key_value_heads) + self.config.num_key_value_heads, + self.config.hidden_size) + elif "wk" in modules and modules[ + -1] == "qscale_weight" and loaded_weight.numel() > 1: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads, 1) elif "wq" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, - self.config.num_attention_heads) + self.config.num_attention_heads, + self.config.hidden_size) + elif "wq" in modules and modules[ + -1] == "qscale_weight" and loaded_weight.numel() > 1: + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads, 1) num_modules = len(modules) for i in range(num_modules): diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 27e8b6fa55351..1ea317c2f95f9 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -23,6 +23,7 @@ from transformers.tokenization_utils_base import TextInput from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.inputs.data import PromptType from vllm.logger import init_logger +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import SupportsPP from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -327,6 +328,12 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, super().__init__() self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + # update quant config to so that ignored module and target module names + # match the vLLM model names + if hasattr(vllm_config, "quant_config"): + vllm_config.quant_config = self.maybe_update_quant_config( + vllm_config.quant_config) + config = vllm_config.model_config.hf_config self.config = config self.downsample_factor = self.config.audio_config.downsample_factor @@ -558,6 +565,72 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, return loaded_weights + def maybe_update_quant_config( + self, quant_config: QuantizationConfig) -> QuantizationConfig: + """ + Update quant config to so that ignored module and target module names + match the vLLM model names. + Right now this is specific for compressed-tensors format and + load_format mistral. 
+ """ + remapping_rules = [ + (r"output", r"language_model.lm_head"), + (r"layers\.(\d+)\.attention\.wo", + r"language_model.model.layers.\1.self_attn.out_proj"), + (r"layers\.(\d+)\.attention\.w(.*)", + r"language_model.model.layers.\1.self_attn.\2_proj"), + (r"layers\.(\d+)\.feed_forward\.w1", + r"language_model.model.layers.\1.mlp.gate_proj"), + (r"layers\.(\d+)\.feed_forward\.w2", + r"language_model.model.layers.\1.mlp.down_proj"), + (r"layers\.(\d+)\.feed_forward\.w3", + r"language_model.model.layers.\1.mlp.up_proj"), + (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo", + r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj" + ), + (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)", + r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj" + ), + (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)", + r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"), + (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0", + r"whisper_encoder.whisper_encoder.conv1"), + (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1", + r"whisper_encoder.whisper_encoder.conv2"), + (r"mm_whisper_embeddings\.audio_language_projection\.0", + r"audio_language_adapter.w_in"), + (r"mm_whisper_embeddings\.audio_language_projection\.2", + r"audio_language_adapter.w_out"), + ] + + # Update ignore list + if hasattr(quant_config, "ignore"): + mistral_ignore = [] + for name in quant_config.ignore: + mistral_name = name + for pattern, repl in remapping_rules: + if re.fullmatch(pattern, name): + mistral_name = re.sub(pattern, repl, name) + mistral_ignore.append(mistral_name) + quant_config.ignore = mistral_ignore + + # Update target list + if hasattr(quant_config, "config_groups"): + config_groups = quant_config.config_groups + for group_name in config_groups: + if "targets" in config_groups[group_name]: + targets = [] + for name in config_groups[group_name]["targets"]: + mistral_name = name + for pattern, repl in remapping_rules: + if re.fullmatch(pattern, name): + mistral_name = re.sub(pattern, repl, name) + targets.append(mistral_name) + config_groups[group_name]["targets"] = targets + quant_config.config_groups = config_groups + + return quant_config + class AudioLanguageAdapter(nn.Module): From 8c5a747246515995aff188a3e6320b0643083d5c Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Thu, 11 Sep 2025 11:09:38 +0800 Subject: [PATCH 144/584] [distributed] update known issues (#24624) Signed-off-by: youkaichao <youkaichao@gmail.com> --- docs/usage/troubleshooting.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index a82d97ea222f7..6e700d1faaa9c 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -324,3 +324,4 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). - To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. 
External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations. +- In some PCIe machines (e.g. machines without NVLink), if you see an error like `transport/shm.cc:590 NCCL WARN Cuda failure 217 'peer access is not supported between these two devices'`, it's likely caused by a driver bug. See [this issue](https://github.com/NVIDIA/nccl/issues/1838) for more details. In that case, you can try to set `NCCL_CUMEM_HOST_ENABLE=0` to disable the feature, or upgrade your driver to the latest version. From 55b823ba0fb56a8f65d4005175e28c81c3386844 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Thu, 11 Sep 2025 12:23:04 +0800 Subject: [PATCH 145/584] Add @chaunceyjiang to codeowner for reasoning Reasoning and Tool parser (#24406) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f627ff46447e1..61c59fe89005b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,8 +16,8 @@ /vllm/v1/sample @22quinn @houseroad /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee -/vllm/reasoning @aarnphm -/vllm/entrypoints @aarnphm +/vllm/reasoning @aarnphm @chaunceyjiang +/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg CMakeLists.txt @tlrmchlsmth @LucasWilkinson From 6c8deacd7211ce4f536d22938e26c9ab931bb1f0 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Wed, 10 Sep 2025 21:23:18 -0700 Subject: [PATCH 146/584] [Bug] [Spec Decode] Fix model_initialization test and mismatch in aux_hidden_layers (#24613) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- tests/models/registry.py | 23 ++++++++++++++++------- tests/models/test_initialization.py | 5 ++++- tests/models/utils.py | 15 ++++++++++++--- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index c80f045d98743..34b6923b726bc 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -97,6 +97,12 @@ class _HfExamplesInfo: max_num_seqs: Optional[int] = None """Maximum number of sequences to be processed in a single iteration.""" + use_original_num_layers: bool = False + """ + If True, use the original number of layers from the model config + instead of minimal layers for testing. 
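+    Enabled below for the EAGLE3 speculative-decoding entries, since
+    their aux_hidden_layers indices assume the original model depth.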
+ """ + def check_transformers_version( self, *, @@ -597,18 +603,21 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random", speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501 trust_remote_code=True), - "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B", + "EagleLlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B-Instruct", # noqa: E501 trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", - tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 - "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 + tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 + "Eagle3LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.1-8B-Instruct", # noqa: E501 trust_remote_code=True, - speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", - tokenizer="meta-llama/Llama-3.1-8B-Instruct"), - "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 + tokenizer="meta-llama/Llama-3.1-8B-Instruct", + use_original_num_layers=True, + max_model_len=10240), + "LlamaForCausalLMEagle3": _HfExamplesInfo("Qwen/Qwen3-8B", # noqa: E501 trust_remote_code=True, speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501 - tokenizer="Qwen/Qwen3-8B"), + tokenizer="Qwen/Qwen3-8B", + use_original_num_layers=True), "EagleLlama4ForCausalLM": _HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 792b93fbcd0f3..06bbc4cea834f 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -36,7 +36,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, hf_overrides_fn = partial(dummy_hf_overrides, model_arch=model_arch, - exist_overrides=model_info.hf_overrides) + exist_overrides=model_info.hf_overrides, + use_original_num_layers=getattr( + model_info, 'use_original_num_layers', + False)) # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: diff --git a/tests/models/utils.py b/tests/models/utils.py index 44e9bf539bc17..76c6e4823a12c 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -396,6 +396,7 @@ def dummy_hf_overrides( *, model_arch: str = "", exist_overrides: Optional[dict[str, Any]] = None, + use_original_num_layers: bool = False, ) -> PretrainedConfig: """ Dummy HF overrides function used to create dummy model @@ -412,10 +413,18 @@ def dummy_hf_overrides( # we use three layers for Gemma-3n to check # both normal layer and kv_shared_layer - num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration" - else 1) + if use_original_num_layers: + # Use the original number of layers from the config + num_layers = getattr(text_config, 'num_layers', 1) + num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1) + else: + # Use minimal layers for testing + num_layers = 1 + num_hidden_layers = (3 if model_arch + == "Gemma3nForConditionalGeneration" else 1) + text_config.update({ - "num_layers": 1, + "num_layers": num_layers, "num_hidden_layers": num_hidden_layers, "num_experts": num_experts, "num_experts_per_tok": 2, From f17a6aa4ec3462ad812331259c527388eb09eb0d Mon Sep 17 00:00:00 2001 From: Peter Salas <peter@fixie.ai> Date: Wed, 10 Sep 2025 22:25:34 -0700 Subject: [PATCH 147/584] [Ultravox] Fix Gemma 
instantiation, support quantization via --hf-overrides (#24131) Signed-off-by: Peter Salas <peter@fixie.ai> --- vllm/config/__init__.py | 12 ++-- vllm/model_executor/models/ultravox.py | 2 +- vllm/transformers_utils/configs/ultravox.py | 79 ++++++++++++--------- 3 files changed, 53 insertions(+), 40 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 538d44d5337cc..3f63bd2dcf416 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1091,11 +1091,11 @@ class ModelConfig: assert_never(runner_type) - def _parse_quant_hf_config(self): - quant_cfg = getattr(self.hf_config, "quantization_config", None) + def _parse_quant_hf_config(self, hf_config: PretrainedConfig): + quant_cfg = getattr(hf_config, "quantization_config", None) if quant_cfg is None: # compressed-tensors uses a "compression_config" key - quant_cfg = getattr(self.hf_config, "compression_config", None) + quant_cfg = getattr(hf_config, "compression_config", None) else: # Set quant_method for ModelOpt models. @@ -1136,7 +1136,11 @@ class ModelConfig: self.quantization) # Parse quantization method from the HF model config, if available. - quant_cfg = self._parse_quant_hf_config() + quant_cfg = self._parse_quant_hf_config(self.hf_config) + if quant_cfg is None and (text_config := getattr( + self.hf_config, "text_config", None)): + # Check the text config as well for multi-modal models. + quant_cfg = self._parse_quant_hf_config(text_config) if quant_cfg is not None: # Use the community standard 'quant_method' diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c883065805279..9885309035e6c 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -276,7 +276,7 @@ class UltravoxProjector(nn.Module): else: self.act = get_act_fn(config.projector_act) - dim_out = config.text_hidden_size + dim_out = config.text_config.hidden_size self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False) # Ultravox v0.4.1 and below use layer_norm after the second linear layer diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 87064cc12deda..71266b9327369 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -20,10 +20,13 @@ class UltravoxConfig(transformers.PretrainedConfig): Args: audio_config (`Union[AutoConfig, dict]`, *optional*): - Custom audio config or dict + Custom audio config or dict. text_config (`Union[AutoConfig, dict]`, *optional*): - The config object of the text backbone. Can be any of `LlamaConfig` - or `MistralConfig`. + The config object of the text backbone. + audio_model_id (`str`, *optional*): + The model ID of the audio backbone. + text_model_id (`str`, *optional*): + The model ID of the text backbone. ignore_index (`int`, *optional*, defaults to -100): The ignore index for the loss function. 
audio_token_index (`int`, *optional*, defaults to 32000): @@ -60,15 +63,10 @@ class UltravoxConfig(transformers.PretrainedConfig): stack_factor: int = 8, norm_init: float = 0.4, projector_act: str = "swiglu", - text_model_lora_config: Optional[dict[str, Any]] = None, - audio_model_lora_config: Optional[dict[str, Any]] = None, projector_ln_mid: bool = False, **kwargs, ): self.ignore_index = ignore_index - - self.audio_model_id = audio_model_id - self.text_model_id = text_model_id self.audio_token_index = audio_token_index self.hidden_size = hidden_size @@ -77,36 +75,47 @@ class UltravoxConfig(transformers.PretrainedConfig): self.projector_act = projector_act self.projector_ln_mid = projector_ln_mid - if text_model_id is not None: - # Avoid circular import - from vllm.transformers_utils.config import get_config - - text_config_obj = get_config(text_model_id, - trust_remote_code=False) - else: + # N.B. May set the wrapped_model_config below. + self.text_model_id = text_model_id + if text_model_id is None: text_config = text_config or {} - text_config_obj = transformers.CONFIG_MAPPING[text_config.get( - "model_type", "llama")](**text_config) + self.wrapped_model_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama")](**text_config) - inner_text_config = text_config_obj.get_text_config() - - if audio_model_id is not None: - # Avoid circular import - from vllm.transformers_utils.config import get_config - - audio_config = get_config(audio_model_id, trust_remote_code=False) - else: + # N.B. May set the audio_config below. + self.audio_model_id = audio_model_id + if audio_model_id is None: + self.audio_model_id = None audio_config = audio_config or {} - audio_config = transformers.CONFIG_MAPPING[audio_config.get( + self.audio_config = transformers.CONFIG_MAPPING[audio_config.get( "model_type", "whisper")](**audio_config) - self.text_config = text_config_obj - self.audio_config = audio_config - self.text_model_lora_config = text_model_lora_config or {} - self.audio_model_lora_config = audio_model_lora_config or {} - - self.vocab_size = inner_text_config.vocab_size - self.initializer_range = inner_text_config.initializer_range - self.text_hidden_size = inner_text_config.hidden_size - super().__init__(**kwargs) + + def __setattr__(self, key, value): + # Since --hf-overrides are applied _after_ the UltravoxConfig is + # instantiated, load the configs implicitly when assigning text_model_id + # or audio_model_id. This allows: + # + # --hf-overrides.text_model_id=<quantized variant> + # + # to behave as intended. + if key == "text_model_id" and value is not None: + from vllm.transformers_utils.config import get_config + + self.wrapped_model_config = get_config(value, + trust_remote_code=False) + elif key == "audio_model_id" and value is not None: + from vllm.transformers_utils.config import get_config + + self.audio_config = get_config(value, trust_remote_code=False) + + return super().__setattr__(key, value) + + @property + def text_config(self) -> Optional[transformers.PretrainedConfig]: + # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate + # the full model, but the text config is the text config of the inner + # model. 
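+        # wrapped_model_config is populated either in __init__ (from the
+        # plain text_config dict) or by __setattr__ when text_model_id is
+        # assigned, e.g. via --hf-overrides.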
+ return (self.wrapped_model_config.get_text_config() + if self.wrapped_model_config else None) From 29799ddacc9234591ce33e0e8e818fae20565a10 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Thu, 11 Sep 2025 13:28:41 +0800 Subject: [PATCH 148/584] [Bugfix] Add missing VIT backend dispatch on CPU (#24623) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/attention/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index be4dc3eb3c0da..bb05b468fd102 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -399,7 +399,8 @@ class MultiHeadAttention(nn.Module): key, value, scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: + elif (self.attn_backend == _Backend.TORCH_SDPA + or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1): query, key, value = (x.transpose(1, 2) for x in (query, key, value)) out = F.scaled_dot_product_attention(query, From e2d8c27f68687476cb6c45723c44193b517308ea Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Wed, 10 Sep 2025 23:05:30 -0700 Subject: [PATCH 149/584] [BugFix] Fix pipeline parallel (#24621) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/executor/uniproc_executor.py | 4 ++++ vllm/v1/worker/gpu_model_runner.py | 1 - vllm/v1/worker/kv_connector_model_runner_mixin.py | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index aabc9ed9b80a2..f45a94f3151b6 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -71,6 +71,10 @@ class UniProcExecutor(ExecutorBase): self.shutdown() return + def shutdown(self) -> None: + if worker := self.driver_worker: + worker.shutdown() + UniProcExecutorAsync = UniProcExecutor diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b75756fbdae85..ce53154896bae 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2070,7 +2070,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): sampler_output = self._sample(logits, spec_decode_metadata) with record_function_or_nullcontext("Bookkeep"): - assert isinstance(hidden_states, torch.Tensor) ( num_nans_in_logits, logprobs_lists, diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 67bb967d2edfa..3eb9f26e9f5b6 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -45,7 +45,8 @@ class KVConnectorModelRunnerMixin: @staticmethod def ensure_kv_transfer_shutdown() -> None: - if has_kv_transfer_group(): + # has_kv_transfer_group can be None during interpreter shutdown. 
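+        # (CPython may replace module globals with None during teardown,
+        # so check that the name is still truthy before calling it.)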
+ if has_kv_transfer_group and has_kv_transfer_group(): ensure_kv_transfer_shutdown() @staticmethod From 8a894084d21a1ae3bd22743e285c79bda735f910 Mon Sep 17 00:00:00 2001 From: Guy Stone <guys@spotify.com> Date: Thu, 11 Sep 2025 02:05:42 -0400 Subject: [PATCH 150/584] [Engine][Chore] use local variable and remove output var assignment (#24554) Signed-off-by: Guy Stone <guys@spotify.com> --- vllm/engine/protocol.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index b0b11a33a4443..94eacfbdfb301 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -78,6 +78,7 @@ class EngineClient(ABC): preprocessor = await self.get_input_preprocessor() tokenizer_group = preprocessor.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async() + eos_token_id = tokenizer.eos_token_id if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError @@ -104,7 +105,7 @@ class EngineClient(ABC): tokenized_length = len(prompt_token_ids) sort_beams_key = create_sort_beams_key_function( - tokenizer.eos_token_id, length_penalty) + eos_token_id, length_penalty) beam_search_params = SamplingParams( logprobs=2 * beam_width, @@ -154,7 +155,7 @@ class EngineClient(ABC): if result.outputs[0].logprobs is not None: logprobs = result.outputs[0].logprobs[0] for token_id, logprob_obj in logprobs.items(): - if token_id == tokenizer.eos_token_id and \ + if token_id == eos_token_id and \ not ignore_eos: completed.append( BeamSearchSequence( @@ -166,7 +167,7 @@ class EngineClient(ABC): cum_logprob=current_beam.cum_logprob + logprob_obj.logprob, finish_reason="stop", - stop_reason=tokenizer.eos_token_id)) + stop_reason=eos_token_id)) else: new_beams.append( BeamSearchSequence( @@ -189,14 +190,14 @@ class EngineClient(ABC): best_beams = sorted_completed[:beam_width] for beam in best_beams: - if (beam.tokens[-1] == tokenizer.eos_token_id and not ignore_eos): + if (beam.tokens[-1] == eos_token_id and not ignore_eos): # Skip the eos token in the text. tokens = beam.tokens[tokenized_length:-1] else: tokens = beam.tokens[tokenized_length:] beam.text = tokenizer.decode(tokens) - beam_search_output = RequestOutput( + yield RequestOutput( request_id=request_id, prompt=prompt_text, outputs=[ @@ -214,8 +215,6 @@ class EngineClient(ABC): prompt_token_ids=prompt_token_ids, prompt_logprobs=None) - yield beam_search_output - @abstractmethod def encode( self, From 3d1393f6fc730f34e6f57a8bed40c455164f5ac4 Mon Sep 17 00:00:00 2001 From: "Saman A. 
Pour" <samanamp@outlook.com> Date: Wed, 10 Sep 2025 23:06:16 -0700 Subject: [PATCH 151/584] Kimi K2 Fused MoE kernels Optimization configs (#24597) Signed-off-by: Saman Keon <samanamp@outlook.com> --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 5 files changed, 730 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b4e736bec9b65 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bb71005a72bc5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..ac53df14ce846 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 
0000000000000..f1ed617d6308f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e72282dc5bcd9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} From ee0bc5e1b42c98de83b6364ee32f1a21422bfae7 Mon Sep 17 00:00:00 2001 From: Tomas Ruiz <tomas.ruiz.te@gmail.com> Date: Thu, 11 Sep 2025 08:06:19 +0200 Subject: [PATCH 152/584] Enable --profile in 'vllm bench throughput' (#24575) Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com> --- vllm/benchmarks/throughput.py | 38 +++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index f022a55e625f5..96e39fd92eba0 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -37,6 +37,7 @@ def run_vllm( requests: list[SampleRequest], n: int, engine_args: EngineArgs, + do_profile: bool, disable_detokenize: bool = False, ) -> tuple[float, Optional[list[RequestOutput]]]: from vllm import LLM, SamplingParams @@ -75,10 +76,14 @@ def run_vllm( outputs = None if not use_beam_search: start = time.perf_counter() + if do_profile: + llm.start_profile() outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests, use_tqdm=True) + if do_profile: + llm.stop_profile() end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" @@ -88,6 +93,8 @@ def run_vllm( for request in requests: assert request.expected_output_len == output_len start = time.perf_counter() + if do_profile: 
+ llm.start_profile() llm.beam_search( prompts, BeamSearchParams( @@ -95,6 +102,8 @@ def run_vllm( max_tokens=output_len, ignore_eos=True, )) + if do_profile: + llm.stop_profile() end = time.perf_counter() return end - start, outputs @@ -103,6 +112,7 @@ def run_vllm_chat( requests: list[SampleRequest], n: int, engine_args: EngineArgs, + do_profile: bool, disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: """ Run vLLM chat benchmark. This function is recommended ONLY for benchmarking @@ -133,7 +143,11 @@ def run_vllm_chat( detokenize=not disable_detokenize, )) start = time.perf_counter() + if do_profile: + llm.start_profile() outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + if do_profile: + llm.stop_profile() end = time.perf_counter() return end - start, outputs @@ -142,6 +156,7 @@ async def run_vllm_async( requests: list[SampleRequest], n: int, engine_args: AsyncEngineArgs, + do_profile: bool, disable_frontend_multiprocessing: bool = False, disable_detokenize: bool = False, ) -> float: @@ -185,6 +200,8 @@ async def run_vllm_async( generators = [] start = time.perf_counter() + if do_profile: + await llm.start_profile() for i, (prompt, sp, lr) in enumerate(zip(prompts, sampling_params, lora_requests)): generator = llm.generate(prompt, @@ -195,6 +212,8 @@ async def run_vllm_async( all_gens = merge_async_iterators(*generators) async for i, res in all_gens: pass + if do_profile: + await llm.stop_profile() end = time.perf_counter() return end - start @@ -543,6 +562,12 @@ def add_cli_args(parser: argparse.ArgumentParser): type=str, default=None, help="Split of the HF dataset.") + parser.add_argument( + "--profile", + action="store_true", + default=False, + help="Use Torch Profiler. The env variable " + "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.") # prefix repetition dataset prefix_repetition_group = parser.add_argument_group( @@ -600,22 +625,27 @@ def main(args: argparse.Namespace): requests, args.n, AsyncEngineArgs.from_cli_args(args), - args.disable_frontend_multiprocessing, - args.disable_detokenize, + disable_frontend_multiprocessing=args.disable_frontend_multiprocessing, + disable_detokenize=args.disable_detokenize, + do_profile=args.profile, )) else: elapsed_time, request_outputs = run_vllm( requests, args.n, EngineArgs.from_cli_args(args), - args.disable_detokenize) + disable_detokenize=args.disable_detokenize, + do_profile=args.profile) elif args.backend == "hf": assert args.tensor_parallel_size == 1 + if args.profile: + raise NotImplementedError( + "Profiling not implemented yet for backend='hf'.") elapsed_time = run_hf(requests, args.model, tokenizer, args.n, args.hf_max_batch_size, args.trust_remote_code, args.disable_detokenize) elif args.backend == "vllm-chat": elapsed_time, request_outputs = run_vllm_chat( requests, args.n, EngineArgs.from_cli_args(args), - args.disable_detokenize) + disable_detokenize=args.disable_detokenize, do_profile=args.profile) else: raise ValueError(f"Unknown backend: {args.backend}") From 41329a0ff95e916670e1a559e82d0d0d45d0883b Mon Sep 17 00:00:00 2001 From: shengshiqi-google <160179165+shengshiqi-google@users.noreply.github.com> Date: Thu, 11 Sep 2025 06:10:01 +0000 Subject: [PATCH 153/584] [Core] feat: Add --safetensors-load-strategy flag for faster safetensors loading from Lustre (#24469) Signed-off-by: Shiqi Sheng <shengshiqi@google.com> Signed-off-by: shengshiqi-google <160179165+shengshiqi-google@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] 
<176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/config/load.py | 9 ++++++++ vllm/engine/arg_utils.py | 5 +++++ .../model_loader/default_loader.py | 1 + .../model_loader/weight_utils.py | 22 ++++++++++++++----- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/vllm/config/load.py b/vllm/config/load.py index e4999e36b49bf..68253359fc567 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -51,6 +51,15 @@ class LoadConfig: download_dir: Optional[str] = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" + safetensors_load_strategy: Optional[str] = "lazy" + """Specifies the loading strategy for safetensors weights. + - "lazy" (default): Weights are memory-mapped from the file. This enables + on-demand loading and is highly efficient for models on local storage. + - "eager": The entire file is read into CPU memory upfront before loading. + This is recommended for models on network filesystems (e.g., Lustre, NFS) + as it avoids inefficient random reads, significantly speeding up model + initialization. However, it uses more CPU RAM. + """ model_loader_extra_config: Union[dict, TensorizerConfig] = field( default_factory=dict) """Extra config for model loader. This will be passed to the model loader diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 87d90f5147cd8..d9a29511eb529 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -289,6 +289,8 @@ class EngineArgs: trust_remote_code: bool = ModelConfig.trust_remote_code allowed_local_media_path: str = ModelConfig.allowed_local_media_path download_dir: Optional[str] = LoadConfig.download_dir + safetensors_load_strategy: Optional[ + str] = LoadConfig.safetensors_load_strategy load_format: Union[str, LoadFormats] = LoadConfig.load_format config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype @@ -587,6 +589,8 @@ class EngineArgs: load_group.add_argument("--load-format", **load_kwargs["load_format"]) load_group.add_argument("--download-dir", **load_kwargs["download_dir"]) + load_group.add_argument("--safetensors-load-strategy", + **load_kwargs["safetensors_load_strategy"]) load_group.add_argument("--model-loader-extra-config", **load_kwargs["model_loader_extra_config"]) load_group.add_argument("--ignore-patterns", @@ -1023,6 +1027,7 @@ class EngineArgs: return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, + safetensors_load_strategy=self.safetensors_load_strategy, device="cpu" if is_online_quantization(self.quantization) else None, model_loader_extra_config=self.model_loader_extra_config, diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index f883e1e739102..d1bdec21fd974 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -189,6 +189,7 @@ class DefaultModelLoader(BaseModelLoader): weights_iterator = safetensors_weights_iterator( hf_weights_files, self.load_config.use_tqdm_on_load, + self.load_config.safetensors_load_strategy, ) else: if extra_config.get("enable_multithread_load"): diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0de8dbbca9c7f..c6ca9cd48d009 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -19,7 +19,7 @@ import huggingface_hub.constants import numpy as np import 
torch from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download -from safetensors.torch import load_file, safe_open, save_file +from safetensors.torch import load, load_file, safe_open, save_file from tqdm.auto import tqdm from vllm import envs @@ -519,18 +519,28 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( hf_weights_files: list[str], use_tqdm_on_load: bool, + safetensors_load_strategy: Optional[str] = "lazy", ) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" + loading_desc = "Loading safetensors checkpoint shards" + if safetensors_load_strategy == "eager": + loading_desc += " (eager)" + for st_file in tqdm( hf_weights_files, - desc="Loading safetensors checkpoint shards", + desc=loading_desc, disable=not enable_tqdm(use_tqdm_on_load), bar_format=_BAR_FORMAT, ): - with safe_open(st_file, framework="pt") as f: - for name in f.keys(): # noqa: SIM118 - param = f.get_tensor(name) - yield name, param + if safetensors_load_strategy == "eager": + with open(st_file, "rb") as f: + state_dict = load(f.read()) + yield from state_dict.items() + else: + with safe_open(st_file, framework="pt") as f: + for name in f.keys(): # noqa: SIM118 + param = f.get_tensor(name) + yield name, param def multi_thread_safetensors_weights_iterator( From e2b1f863aa0ca94532624fd2add48d61f8446c2d Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:19:28 +0200 Subject: [PATCH 154/584] [Doc]: fixing doc typos (#24635) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- vllm/config/__init__.py | 2 +- vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py | 2 +- vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py | 2 +- vllm/model_executor/models/arcee.py | 2 +- vllm/model_executor/models/llava_onevision.py | 2 +- vllm/model_executor/models/phi4_multimodal.py | 2 +- vllm/model_executor/models/phi4mm_audio.py | 2 +- vllm/model_executor/models/qwen2_5_omni_thinker.py | 4 ++-- vllm/v1/attention/backends/mla/common.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 3f63bd2dcf416..f62e4468ef177 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3881,7 +3881,7 @@ def contains_object_print(text): Check if the text looks like a printed Python object, e.g. contains any substring matching the pattern: "at 0xFFFFFFF>" We match against 0x followed by 2-16 hex chars (there's - a max of 16 on a 64 bit system). + a max of 16 on a 64-bit system). Args: text (str): The text to check diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 63af3ff7f4d1f..37c360145b04a 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -60,7 +60,7 @@ class Internlm2ToolParser(ToolParser): if '<|action_start|>' not in current_text: self.position = len(current_text) return DeltaMessage(content=delta_text) - # if the tool call is sended, return an empty delta message + # if the tool call is sent, return an empty delta message # to make sure the finish_reason will be sent correctly. 
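        # (an empty DeltaMessage keeps the stream alive without emitting
        #  new text, so the final chunk can still carry a finish_reason
        #  such as "tool_calls"; descriptive note added for clarity)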
if self.current_tool_id > 0: return DeltaMessage(content='') diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index ad58a9918f03c..a7b3c814859ce 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -502,7 +502,7 @@ def _chunk_state_varlen_kernel( dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk - # If HAS_INITSTATES==True need to consider two possiblties + # If HAS_INITSTATES==True need to consider two possibilities # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs # - if state_idx >= pid * chunk_size, then we need to insert initstates if ((start_idx < pid_c * chunk_size) # first chunk diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 13ed4da0602ad..be82c2fd59644 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -342,7 +342,7 @@ class ArceeModel(nn.Module): class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): """Arcee Model for causal language modeling, integrated with vLLM runtime.""" - # Map fused module names to their sub-module components + # Map fused module names to their submodule components # (for quantization and LoRA) packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index bc340a9e2d8f8..46d54452a52d8 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -835,7 +835,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 6d973a964de04..ab63649b43561 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -1350,7 +1350,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index a2481375f4fb8..a1c452053ddd2 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -100,7 +100,7 @@ class ConformerEncoderLayer(nn.Module): activation function for glu used in the multihead attention, default "swish". activation_checkpointing: str, optional - a dictionarry of {"module","interval","offload"}, where + a dictionary of {"module","interval","offload"}, where "module": str accept ["transformer", "attention"] to select which module should do activation checkpointing. 
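For reference, the `activation_checkpointing` dictionary documented in the
phi4mm_audio docstring above could look as follows. The keys come from that
docstring; the value semantics and the concrete values are assumptions, not
part of the patch:

    activation_checkpointing = {
        "module": "transformer",  # or "attention", per the docstring
        "interval": 1,            # assumed: checkpoint every layer
        "offload": False,         # assumed: keep activations on device
    }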
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e79428d17a70a..f8a943d4cab33 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -846,7 +846,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration( return [] # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary @@ -873,7 +873,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration( if multimodal_embeddings is not None \ and len(multimodal_embeddings) != 0: - # TODO (ywang96): support overlapping modalitiy embeddings so that + # TODO (ywang96): support overlapping modality embeddings so that # `use_audio_in_video` will work on V1. inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [ diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 440a206eb485b..5308a1113a1ad 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -463,7 +463,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.dcp_world_size = 1 self.dcp_rank = 0 - # Dont try to access the runner on AMD + # Don't try to access the runner on AMD if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size From 9bd831f5012b0ad1b4e3b7c1af511b19558d7626 Mon Sep 17 00:00:00 2001 From: TaehyunKim <73943231+ca1207@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:29:40 +0900 Subject: [PATCH 155/584] [Model] New model support for Motif-1-Tiny (#23414) Signed-off-by: ca1207 <ca1207zzz@gmail.com> Signed-off-by: TaehyunKim <73943231+ca1207@users.noreply.github.com> Co-authored-by: WyldeCat <skan1543@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- benchmarks/kernels/benchmark_polynorm.py | 155 ++++++++ csrc/layernorm_kernels.cu | 251 +++++++++++++ csrc/ops.h | 5 +- csrc/torch_bindings.cpp | 6 + docs/models/supported_models.md | 1 + tests/kernels/core/test_layernorm.py | 33 +- tests/models/registry.py | 3 + tests/models/test_initialization.py | 5 +- vllm/_custom_ops.py | 8 + .../backends/differential_flash_attn.py | 3 + vllm/model_executor/layers/layernorm.py | 59 +++ vllm/model_executor/models/motif.py | 345 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 13 files changed, 871 insertions(+), 4 deletions(-) create mode 100644 benchmarks/kernels/benchmark_polynorm.py create mode 100644 vllm/model_executor/models/motif.py diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py new file mode 100644 index 0000000000000..9ac8f5e6594e4 --- /dev/null +++ b/benchmarks/kernels/benchmark_polynorm.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools + +import torch + +from vllm import _custom_ops as vllm_ops +from vllm.triton_utils import triton + + +def polynorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + def norm(x, eps: float): + return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) + + x = x.float() + return ( + ( + weight[0] * norm(x**3, eps) + + 
weight[1] * norm(x**2, eps) + + weight[2] * norm(x, eps) + + bias + ) + .to(weight.dtype) + .view(orig_shape) + ) + + +def polynorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + out = torch.empty_like(x) + vllm_ops.poly_norm(out, x, weight, bias, eps) + output = out + + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_dim): + dtype = torch.bfloat16 + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + output_naive = polynorm_naive(x, weight, bias) + output_vllm = polynorm_vllm(x, weight, bias) + + if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +dim_range = [2048, 4096] +configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) + + +def get_benchmark(): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["dim", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "vllm"], + line_names=["Naive", "vLLM"], + styles=[("blue", "-"), ("red", "-")], + ylabel="us", + plot_name="polynorm-perf", + args={}, + ) + ) + def benchmark(dim, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_dim = dim * 4 + + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + quantiles = [0.5, 0.2, 0.8] + + if provider == "naive": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_naive(x, weight, bias), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_vllm(x, weight, bias), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-dim", + type=int, + default=8192, + help="Intermediate size of MLP", + ) + parser.add_argument( + "--save-path", + type=str, + default="./configs/polnorm/", + help="Path to save polnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_dim=args.hidden_dim, + ) + + benchmark = get_benchmark() + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index f051eb0702228..05be023de0f28 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -140,6 +140,211 @@ fused_add_rms_norm_kernel( } } +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. + + _f16VecPN struct extends _f16Vec to add operations specifically required for + polynomial normalization (poly norm). 
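+   (Concretely: sum_pows() returns per-thread partial sums of x^2, x^4
+   and x^6, which the kernel block-reduces into the three RMS statistics
+   that poly_norm_inplace() then applies together with the weights.)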
+ The original _f16Vec does not include the sum-of-powers computation or + in-place polynomial normalization logic. */ +template <typename scalar_t, int width> +struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> { + using Base = _f16Vec<scalar_t, width>; + using Converter = typename Base::Converter; + using T1 = typename Base::T1; + using T2 = typename Base::T2; + using Base::data; + + __device__ auto sum_pows() const { + float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f; + +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + float x2 = z.x * z.x; + float x4 = x2 * x2; + float x6 = x4 * x2; + + float y2 = z.y * z.y; + float y4 = y2 * y2; + float y6 = y4 * y2; + + s2 += x2 + y2; + s4 += x4 + y4; + s6 += x6 + y6; + } + return std::make_tuple(s2, s4, s6); + } + + __device__ void poly_norm_inplace(const float w2_inv_std, + const float w1_inv_std2, + const float w0_inv_std3, const float bias) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + + float x2 = z.x * z.x; + float x3 = x2 * z.x; + z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias; + + float y2 = z.y * z.y; + float y3 = y2 * z.y; + z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias; + + auto out = Converter::convert(z); + data[i] = out.x; + data[i + 1] = out.y; + } + } +}; + +template <typename scalar_t, int width> +__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists> +poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [3] + const scalar_t* __restrict__ bias, // [1] + const float epsilon, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>); + static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width); + + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. 
Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input); + const int vec_hidden_size = hidden_size / width; + float variance = 0.0f; + float variance2 = 0.0f; + float variance3 = 0.0f; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16VecPN<scalar_t, width> temp = input_v[id]; + auto [x2, x4, x6] = temp.sum_pows(); + + variance += x2; + variance2 += x4; + variance3 += x6; + } + + float3 thread_variances = make_float3(variance, variance2, variance3); + + struct SumOp { + __device__ float3 operator()(const float3& a, const float3& b) const { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + }; + + using BlockReduce = cub::BlockReduce<float3, 1024>; + __shared__ typename BlockReduce::TempStorage reduceStore; + float3 block_variances = + BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x); + + variance = block_variances.x; + variance2 = block_variances.y; + variance3 = block_variances.z; + + __shared__ float s_w2_inv_std; + __shared__ float s_w1_inv_std2; + __shared__ float s_w0_inv_std3; + __shared__ float s_bias; + + if (threadIdx.x == 0) { + float w0 = (float)weight[0]; + float w1 = (float)weight[1]; + float w2 = (float)weight[2]; + s_bias = (float)bias[0]; + + s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon); + s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon); + s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon); + } + __syncthreads(); + + auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16VecPN<scalar_t, width> temp = input_v[id]; + temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias); + out_v[id] = temp; + } +} + +/* Generic poly_norm_kernel + The width field is not used here but necessary for other specializations. 
+ */ +template <typename scalar_t, int width> +__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists> +poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [3] + const scalar_t* __restrict__ bias, // [1] + const float epsilon, const int hidden_size) { + float variance = 0.0f; + float variance2 = 0.0f; + float variance3 = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float x2 = x * x; + float x4 = x2 * x2; + float x6 = x4 * x2; + + variance += x2; + variance2 += x4; + variance3 += x6; + } + + float3 thread_variances = make_float3(variance, variance2, variance3); + + struct SumOp { + __device__ float3 operator()(const float3& a, const float3& b) const { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + }; + + using BlockReduce = cub::BlockReduce<float3, 1024>; + __shared__ typename BlockReduce::TempStorage reduceStore; + float3 block_variances = + BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x); + + variance = block_variances.x; + variance2 = block_variances.y; + variance3 = block_variances.z; + + __shared__ float s_w2_inv_std; + __shared__ float s_w1_inv_std2; + __shared__ float s_w0_inv_std3; + __shared__ float s_bias; + + if (threadIdx.x == 0) { + float w0 = (float)weight[0]; + float w1 = (float)weight[1]; + float w2 = (float)weight[2]; + s_bias = (float)bias[0]; + + s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon); + s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon); + s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float x2 = x * x; + float x3 = x2 * x; + + out[blockIdx.x * hidden_size + idx] = + (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 + + s_bias); + } +} + } // namespace vllm void rms_norm(torch::Tensor& out, // [..., hidden_size] @@ -219,3 +424,49 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] LAUNCH_FUSED_ADD_RMS_NORM(0); } } + +#define LAUNCH_FUSED_POLY_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \ + vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \ + out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \ + weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \ + hidden_size); \ + }); + +void poly_norm(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [3] + torch::Tensor& bias, // [1] + double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.data_ptr() != input.data_ptr()); + + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 
1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr()); + auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr()); + bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_POLY_NORM(8); + } else { + LAUNCH_FUSED_POLY_NORM(0); + } +} diff --git a/csrc/ops.h b/csrc/ops.h index a288112e21000..2ee5df4cac54c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + torch::Tensor& bias, double epsilon); + void apply_repetition_penalties_(torch::Tensor& logits, const torch::Tensor& prompt_mask, const torch::Tensor& output_mask, @@ -353,4 +356,4 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles); void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half = false); int64_t qr_max_size(); -#endif \ No newline at end of file +#endif diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index d3f50d1076cb0..9e4586e8f0450 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -168,6 +168,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Polynomial Normalization. + ops.def( + "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float " + "epsilon) -> ()"); + ops.impl("poly_norm", torch::kCUDA, &poly_norm); + // Apply repetition penalties to logits in-place ops.def( "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, " diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ec65bfca10096..851b1b58d4bae 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -383,6 +383,7 @@ th { | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | | | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. 
| ✅︎ | ✅︎ | ✅︎ |
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index 02316ceaac735..53e6d793cf2f9 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -6,7 +6,7 @@ import torch
 
 from tests.kernels.quant_utils import FP8_DTYPE
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
 from vllm.platforms import current_platform
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -70,6 +70,37 @@ def test_rms_norm(
             (out, x, layer.weight.data, layer.variance_epsilon))
 
 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_poly_norm(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    layer = PolyNorm().to(dtype=dtype)
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    layer.bias.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x *= scale
+
+    ref_out = layer.forward_native(x)
+    out = layer(x)
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    opcheck(
+        torch.ops._C.poly_norm,
+        (out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon))
+
+
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 34b6923b726bc..dfeaee1acf4e4 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -291,6 +291,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
     "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1",  # noqa: E501
                                           {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
+    "MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
+                                        trust_remote_code=True,
+                                        v0_only=True),
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 06bbc4cea834f..3b13a12276f5d 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -63,8 +63,9 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                    _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
             m.setenv("VLLM_USE_V1", "0")
-        if model_arch == "Phi4FlashForCausalLM":
-            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
+            # Phi4FlashForCausalLM and MotifForCausalLM
+            # only support the DIFFERENTIAL_FLASH_ATTN backend
             m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
         if model_arch == "GptOssForCausalLM":
             # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
diff --git a/vllm/_custom_ops.py
b/vllm/_custom_ops.py index 6e9a8df0a56a2..3b2e859b76af7 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -280,6 +280,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) +def poly_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, + bias: torch.Tensor, epsilon: float) -> None: + # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input + input_contiguous = input.contiguous() + torch.ops._C.poly_norm(out, input_contiguous, weight, bias, epsilon) + + def apply_repetition_penalties_torch( logits: torch.Tensor, prompt_mask: torch.Tensor, output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: @@ -710,6 +717,7 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability) + def cutlass_sparse_compress(a: torch.Tensor) \ -> tuple[torch.Tensor, torch.Tensor]: """ diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index caa02530d2fd6..a7d0e3afb517f 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -734,6 +734,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl): window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, ) assert prefill_output.shape == output[: num_prefill_tokens].shape @@ -755,6 +756,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl): window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, ).squeeze(1) except Exception as e: logger.error("Error in PagedAttention.forward_decode: %s", @@ -787,6 +789,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl): window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, ).squeeze(1) return output diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 0488eab1e03fb..f875f712ba9c9 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -43,6 +43,20 @@ def fused_add_rms_norm( return x, residual +def poly_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + from vllm import _custom_ops as ops + out = torch.empty_like(x) + ops.poly_norm( + out, + x, + weight, + bias, + variance_epsilon, + ) + return out + + def rocm_aiter_rms_norm_impl(x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float) -> torch.Tensor: import aiter as rocm_aiter @@ -320,3 +334,48 @@ class GemmaRMSNorm(CustomOp): self.forward_static) self._is_compiled = True return self.forward_native(x, residual) + + +@CustomOp.register("poly_norm") +class PolyNorm(CustomOp): + """Polynomial normalization. + + Computes x -> w_0 * RMSNorm(x^3) + w_1 * RMSNorm(x^2) + w_2 * RMSNorm(x) + b + where w_n is the learned weight and b is the bias. 
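+
+    A minimal reference sketch of the computation (illustrative only; it
+    mirrors forward_native below, with `rms` as a local helper):
+
+        rms = lambda t: t / torch.sqrt(t.pow(2).mean(-1, keepdim=True) + eps)
+        out = w[0] * rms(x**3) + w[1] * rms(x**2) + w[2] * rms(x) + b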
+ Refer to https://arxiv.org/html/2411.03884v1 + """ + + def __init__( + self, + eps: float = 1e-6, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(torch.ones(3) / 3) + self.bias = torch.nn.Parameter(torch.zeros(1)) + self.variance_epsilon = eps + + def _norm(self, x): + return x / torch.sqrt( + x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon) + + def forward_native( + self, + x: torch.Tensor, + ) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward(). + + Refer to https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md + """ + + orig_dtype = x.dtype + x_float = x.to(torch.float32) + output = (self.weight[0] * self._norm(x_float**3) + + self.weight[1] * self._norm(x_float**2) + + self.weight[2] * self._norm(x_float) + self.bias) + return output.to(orig_dtype) + + def forward_cuda( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return poly_norm(x, self.weight, self.bias, self.variance_epsilon) diff --git a/vllm/model_executor/models/motif.py b/vllm/model_executor/models/motif.py new file mode 100644 index 0000000000000..153f36dcf1f55 --- /dev/null +++ b/vllm/model_executor/models/motif.py @@ -0,0 +1,345 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/modeling_motif.py +# Copyright (c) Alibaba Cloud. +# LICENSE: https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/LICENSE +"""Inference-only Motif model compatible with HuggingFace weights.""" +import math +from typing import Any, Optional + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention, AttentionType +from vllm.attention.selector import _Backend +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .adapters import as_seq_cls_model +from .interfaces import SupportsV0Only +from .utils import extract_layer_index + + +class MotifMLP(nn.Module): + """MLP for the language component of the Motif model, which contains a + MergedColumnParallelLinear merging 2 outputs via PolyNorm activation.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str = "poly_norm", + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + reduce_results: bool = True, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "poly_norm": + raise NotImplementedError(f"Unsupported activation: {hidden_act}. 
" + "Only poly_norm is supported for now.") + self.act_fn = PolyNorm() + self.intermediate_size = intermediate_size + tp_size = get_tensor_model_parallel_world_size() + if hidden_act == "poly_norm" and tp_size > 1: + raise NotImplementedError( + "Tensor parallelism for poly_norm is not supported yet. " + "Support will be added in the future.") + + def forward(self, x): + x, _ = self.gate_up_proj(x) + x = self.act_fn( + x[..., :self.intermediate_size]) * x[..., self.intermediate_size:] + x, _ = self.down_proj(x) + return x + + +class MotifAttention(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = self.hidden_size // self.total_num_heads + self.head_dim = head_dim + # Phi models introduced a partial_rotary_factor parameter in the config + self.partial_rotary_factor = getattr(config, "partial_rotary_factor", + 1) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + assert self.num_heads % 2 == 0, 'num_heads should be even' + assert self.num_kv_heads % 2 == 0, 'num_heads should be even' + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias_o_proj, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self._init_rotary_emb(config, + rope_scaling=rope_scaling, + quant_config=quant_config) + sliding_window = None + + self.lambda_init = self.lambda_init_fn(layer_idx) + self.lambda_q1 = nn.Parameter( + torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, + std=0.1)) + self.lambda_k1 = nn.Parameter( + torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, + std=0.1)) + self.lambda_q2 = nn.Parameter( + torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, + std=0.1)) + self.lambda_k2 = nn.Parameter( + torch.zeros(self.head_dim, 
dtype=torch.float32).normal_(mean=0, + std=0.1)) + self.subln = RMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps) + + params = { + 'differential_flash_attention_config': { + 'lambda_init': self.lambda_init, + 'lambda_q1': self.lambda_q1, + 'lambda_k1': self.lambda_k1, + 'lambda_q2': self.lambda_q2, + 'lambda_k2': self.lambda_k2, + "subln": self.subln, + } + } + + diff_attn_err_msg = ( + 'Set VLLM_ATTENTION_BACKEND="DIFFERENTIAL_FLASH_ATTN" ' + 'to enable Differential Flash Attention.') + try: + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=attn_type, + prefix=f"{prefix}.attn", + **params, + ) + except TypeError as e: + raise ValueError(diff_attn_err_msg) from e + assert (self.attn.backend == _Backend.DIFFERENTIAL_FLASH_ATTN + ), diff_attn_err_msg + + def lambda_init_fn(self, depth): + return 0.8 - 0.6 * math.exp(-0.3 * (depth - 1)) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + def _init_rotary_emb(self, config: PretrainedConfig, + rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + is_neox_style = True + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor, + ) + + +class MotifDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "use_bias", False) + bias_o_proj = attention_bias + if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + + # By default, Motif uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. 
parasail-ai/GritLM-7B-vllm)
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        self.self_attn = MotifAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            bias_o_proj=bias_o_proj,
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
+        )
+        self.mlp = MotifMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            bias=getattr(config, "use_bias", False),
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions,
+                                       hidden_states=hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+# Motif model uses differential attention
+# Only supported in v0 (no chunked prefill support)
+class MotifForCausalLM(LlamaForCausalLM, SupportsV0Only):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 layer_type: type[nn.Module] = MotifDecoderLayer):
+
+        # Prefix caching and chunked prefill are not supported for this model.
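+        # A hypothetical offline-inference sketch that satisfies these
+        # constraints (the env vars mirror the tests above; every other
+        # value is illustrative, not part of this patch):
+        #
+        #   os.environ["VLLM_USE_V1"] = "0"
+        #   os.environ["VLLM_ATTENTION_BACKEND"] = "DIFFERENTIAL_FLASH_ATTN"
+        #   llm = LLM(model="Motif-Technologies/Motif-2.6B",
+        #             trust_remote_code=True,
+        #             enable_prefix_caching=False,
+        #             enable_chunked_prefill=False)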
+ assert not vllm_config.cache_config.enable_prefix_caching, \ + "Motif currently does not support prefix caching" + assert not vllm_config.scheduler_config.chunked_prefill_enabled, \ + "Motif currently does not support chunked prefill" + + super().__init__(vllm_config=vllm_config, + prefix=prefix, + layer_type=layer_type) + + +MotifForSequenceClassification = as_seq_cls_model(MotifForCausalLM) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fbc369c870c57..8fe8402c15d3d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -110,6 +110,7 @@ _TEXT_GENERATION_MODELS = { "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + "MotifForCausalLM": ("motif", "MotifForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), From d13360183a429562ff7d62235534ef37e712409c Mon Sep 17 00:00:00 2001 From: Chenxi Yang <cxyang@cs.utexas.edu> Date: Wed, 10 Sep 2025 23:45:07 -0700 Subject: [PATCH 156/584] Remove redundant all gather + split (#23441) Co-authored-by: Chenxi Yang <cxyang@meta.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> --- vllm/model_executor/models/glm4_1v.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 055cab901361f..846b4b800dd59 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -272,23 +272,10 @@ class Glm4vVisionAttention(nn.Module): def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: # [s, b, 3 * head * head_dim] seq_len, bs, _ = qkv.shape - if self.tp_size > 1: - qkv = all_gather_interleave(qkv, self.qkv.hidden_size, - self.tp_size) # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] q, k, v = qkv.chunk(3, dim=2) - # 3 * [s, b, head * head_dim] - if self.tp_size > 1: - splitter = partial( - dist_utils.split_tensor_along_last_dim, - num_partitions=self.tp_size, - ) - q = splitter(q)[self.tp_rank] - k = splitter(k)[self.tp_rank] - v = splitter(v)[self.tp_rank] - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] new_shape = ( seq_len, From 2048c4e37909a42847cd2f51c7e0cf92e3b63466 Mon Sep 17 00:00:00 2001 From: Jerry Zhang <jerryzh168@gmail.com> Date: Wed, 10 Sep 2025 23:53:24 -0700 Subject: [PATCH 157/584] [torchao] Support quantization configs using module swap (#21982) Signed-off-by: Jerry Zhang <jerryzh168@gmail.com> --- .buildkite/test-pipeline.yaml | 4 ++++ tests/quantization/test_torchao.py | 20 +++++++++++++++++++ .../layers/quantization/torchao.py | 16 ++++++++------- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 51fc9c46e66df..35a849d70c313 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -507,6 +507,10 @@ steps: commands: # temporary install here since we need nightly, will move to requirements/test.in # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved - pip install --pre torchao==0.13.0.dev20250814 --index-url 
https://download.pytorch.org/whl/nightly/cu128 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index eef3568efea12..8e68f6a2e019f 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner): print(output) +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +@pytest.mark.skip( + reason="since torchao nightly is only compatible with torch nightly" + "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " + "torchao tests that requires newer versions (0.14.0.dev+) for now") +def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner): + torch._dynamo.reset() + model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2" + "-0.14.0.dev") + with vllm_runner(model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0") as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + + assert output + print(output) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 63b2ab6bab063..3498d2994c2ab 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -152,18 +152,20 @@ def torchao_quantize_param_data(param: torch.Tensor, from torchao.quantization import quantize_ assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}" - """ - Avoid real weight allocation for faster load, since we will + """ + Avoid real weight allocation for faster load, since we will end up setting it to param. """ with torch.device("meta"): - dummy_linear = torch.nn.Linear(param.shape[1], - param.shape[0], - bias=False) + # linear can't be top level module since quantize_ is inplace + # while some of our configs need to do module swap, and only non-top + # level modules support module swap + dummy_linear = torch.nn.Sequential( + torch.nn.Linear(param.shape[1], param.shape[0], bias=False)) - dummy_linear.weight = param + dummy_linear[0].weight = param quantize_(dummy_linear, torchao_config) - return dummy_linear.weight + return dummy_linear[0].weight class TorchAOLinearMethod(LinearMethodBase): From e93f4cc9e37484009f74e15d3111a1f335c532a5 Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Thu, 11 Sep 2025 15:32:09 +0800 Subject: [PATCH 158/584] Add the support for the qwen3 next model (a hybrid attention model). 
(#24526) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- .yapfignore | 1 + docs/models/supported_models.md | 1 + pyproject.toml | 1 + tests/models/registry.py | 6 +- vllm/config/__init__.py | 56 +- vllm/config/compilation.py | 1 + .../layers/fla/ops/chunk_delta_h.py | 5 +- vllm/model_executor/layers/fla/ops/chunk_o.py | 9 +- .../layers/fla/ops/chunk_scaled_dot_kkt.py | 10 +- .../layers/fla/ops/fused_recurrent.py | 4 +- vllm/model_executor/layers/fla/ops/l2norm.py | 2 +- vllm/model_executor/layers/fla/ops/op.py | 5 - .../layers/mamba/mamba_utils.py | 37 + .../layers/mamba/ops/causal_conv1d.py | 48 +- vllm/model_executor/models/config.py | 3 +- vllm/model_executor/models/qwen3_next.py | 1294 +++++++++++++++++ vllm/model_executor/models/qwen3_next_mtp.py | 285 ++++ vllm/model_executor/models/registry.py | 2 + vllm/transformers_utils/config.py | 2 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/qwen3_next.py | 275 ++++ vllm/v1/attention/backends/gdn_attn.py | 319 ++++ vllm/v1/core/single_type_kv_cache_manager.py | 44 +- vllm/v1/kv_cache_interface.py | 1 + vllm/v1/spec_decode/eagle.py | 10 +- vllm/v1/worker/block_table.py | 20 +- vllm/v1/worker/gpu_input_batch.py | 17 + vllm/v1/worker/gpu_model_runner.py | 74 +- vllm/worker/worker.py | 3 +- 29 files changed, 2476 insertions(+), 61 deletions(-) create mode 100644 vllm/model_executor/models/qwen3_next.py create mode 100644 vllm/model_executor/models/qwen3_next_mtp.py create mode 100644 vllm/transformers_utils/configs/qwen3_next.py create mode 100644 vllm/v1/attention/backends/gdn_attn.py diff --git a/.yapfignore b/.yapfignore index 2d6dcf8380cac..38158259032a6 100644 --- a/.yapfignore +++ b/.yapfignore @@ -1 +1,2 @@ collect_env.py +vllm/model_executor/layers/fla/ops/*.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 851b1b58d4bae..a3cc1cda5f866 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -403,6 +403,7 @@ th { | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen3NextForCausalLM` | Qwen3.5MoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. 
| | ✅︎ | ✅︎ | diff --git a/pyproject.toml b/pyproject.toml index e63f8aeae2787..7aa0371f38dc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -228,6 +228,7 @@ fo = "fo" ba = "ba" [tool.typos.type.py.extend-words] +ba = "ba" [tool.typos.type.cpp] extend-glob = ["*.cu"] diff --git a/tests/models/registry.py b/tests/models/registry.py index dfeaee1acf4e4..0c77ec5ef13c3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -326,6 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), + "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct", + min_transformers_version="4.56.2"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501 trust_remote_code=True, @@ -640,7 +642,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { is_available_online=False), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, - speculative_model="XiaomiMiMo/MiMo-7B-RL") + speculative_model="XiaomiMiMo/MiMo-7B-RL"), + "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct", + min_transformers_version="4.56.2"), } _TRANSFORMERS_BACKEND_MODELS = { diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index f62e4468ef177..587cfab35515f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1508,7 +1508,8 @@ class ModelConfig: if (self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" or self.hf_config.model_type == "glm4_moe_mtp" - or self.hf_config.model_type == "ernie_mtp"): + or self.hf_config.model_type == "ernie_mtp" + or self.hf_config.model_type == "qwen3_next_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ -1571,15 +1572,28 @@ class ModelConfig: if attn_type_list: return sum(t == 1 for t in attn_type_list[start:end]) - if layers_block_type_value is None and attn_type_list is None: + # Hybrid model Qwen3Next + layer_types_value = getattr(self.hf_config, "layer_types", None) + if layer_types_value is not None: + if getattr(block_type, "value", block_type) == "attention": + return sum(t == "full_attention" + for t in layer_types_value[start:end]) + elif getattr(block_type, "value", + block_type) == "linear_attention": + return sum(t == "linear_attention" + for t in layer_types_value[start:end]) + else: + return sum(t == getattr(block_type, "value", block_type) + for t in layer_types_value[start:end]) + + if (layers_block_type_value is None and attn_type_list is None + and layer_types_value is None): raise ValueError( "The model is an hybrid without a" - "layers_block_type or an attn_type_list in the hf_config," - "cannot determine the num of " + "layers_block_type or an attn_type_list, or a layer_types " + "in the hf_config, cannot determine the num of " f"{block_type.value} layers") - return sum(t == 1 for t in attn_type_list[start:end]) - def get_mamba_chunk_size(self) -> Optional[int]: """ Returns the mamba chunk size if it exists @@ -1866,7 +1880,7 @@ class DeviceConfig: SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp", - "ernie_mtp"] + "ernie_mtp", "qwen3_next_mtp"] @config @@ -2007,7 +2021,15 @@ class SpeculativeConfig: "n_predict": n_predict, "architectures": ["ErnieMTPModel"] }) - return 
hf_config + + if hf_config.model_type == "qwen3_next": + hf_config.model_type = "qwen3_next_mtp" + if hf_config.model_type == "qwen3_next_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["Qwen3NextMTP"] + }) return hf_config @@ -2028,9 +2050,13 @@ class SpeculativeConfig: (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or self.target_model_config.hf_text_config.model_type in - ("mimo","ernie4_5_moe")): + ("mimo","ernie4_5_moe", "qwen3_next")): # use the draft model from the same model: self.model = self.target_model_config.model + # Align the quantization of draft model for cases such as + # --quantization fp8 with a bf16 checkpoint. + if not self.quantization: + self.quantization = self.target_model_config.quantization elif self.method in ("ngram", "[ngram]"): self.model = "ngram" else: @@ -2140,6 +2166,15 @@ class SpeculativeConfig: "one layer. Might need some code changes " \ "to support multiple layers." ) + elif (self.draft_model_config.hf_config.model_type == + "qwen3_next_mtp"): + self.method = "qwen3_next_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Qwen3Next MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) else: self.method = "draft_model" raise NotImplementedError( @@ -2355,7 +2390,8 @@ class SpeculativeConfig: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp") + return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp", + "qwen3_next_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 09600e96a1c62..f8ccc20222615 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -341,6 +341,7 @@ class CompilationConfig: "vllm.short_conv", "vllm.linear_attention", "vllm.plamo2_mamba_mixer", + "vllm.gdn_attention", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py index eac56ef352e70..34006f87f457b 100644 --- a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py +++ b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py @@ -14,7 +14,7 @@ import torch from vllm.triton_utils import tl, triton from .index import prepare_chunk_indices, prepare_chunk_offsets -from .op import exp, safe_exp +from .op import exp from .utils import is_nvidia_hopper, use_cuda_graph NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16] @@ -175,12 +175,13 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( boundary_check=(0, 1)) if USE_G: + m_t = (i_t * BT + tl.arange(0, BT)) < T last_idx = min((i_t + 1) * BT, T) - 1 b_g_last = tl.load(g + bos * H + last_idx * H + i_h) p_g = tl.make_block_ptr(g + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, )) b_g = tl.load(p_g, boundary_check=(0, )) - b_v_new = b_v_new * safe_exp(b_g_last - b_g)[:, None] + b_v_new = b_v_new * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None] b_g_last = exp(b_g_last) b_h1 = b_h1 * b_g_last if K > 64: diff --git a/vllm/model_executor/layers/fla/ops/chunk_o.py b/vllm/model_executor/layers/fla/ops/chunk_o.py index 5a36d313320fa..332751a1860a9 100644 --- a/vllm/model_executor/layers/fla/ops/chunk_o.py +++ b/vllm/model_executor/layers/fla/ops/chunk_o.py @@ -16,7 +16,7 @@ import torch from vllm.triton_utils import tl, triton from .index import 
prepare_chunk_indices -from .op import exp, safe_exp +from .op import exp from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper BKV_LIST = [64, 128] if check_shared_mem() else [32, 64] @@ -112,10 +112,11 @@ def chunk_fwd_kernel_o( p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, )) b_g = tl.load(p_g, boundary_check=(0, )) b_o = b_o * exp(b_g)[:, None] - b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :]) + b_A = b_A * exp(b_g[:, None] - b_g[None, :]) - o_i = tl.arange(0, BT) - m_A = o_i[:, None] >= o_i[None, :] + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t) b_A = tl.where(m_A, b_A, 0) p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py index 9938eae52db75..d1adc6978f245 100644 --- a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py @@ -14,7 +14,7 @@ import torch from vllm.triton_utils import tl, triton from .index import prepare_chunk_indices -from .op import safe_exp +from .op import exp @triton.heuristics({ @@ -56,7 +56,8 @@ def chunk_scaled_dot_kkt_fwd_kernel( T = eos - bos else: bos, eos = i_b * T, i_b * T + T - o_t = tl.arange(0, BT) + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, )) @@ -76,9 +77,10 @@ def chunk_scaled_dot_kkt_fwd_kernel( (i_t * BT, ), (BT, ), (0, )) b_g = tl.load(p_g, boundary_check=(0, )) b_g_diff = b_g[:, None] - b_g[None, :] - b_A = b_A * safe_exp(b_g_diff) + b_A = b_A * exp(b_g_diff) - b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0) + m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t) + b_A = tl.where(m_A, b_A, 0) p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0)) tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py index 25a615fe12441..b278e37415748 100644 --- a/vllm/model_executor/layers/fla/ops/fused_recurrent.py +++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py @@ -116,8 +116,8 @@ def fused_recurrent_gated_delta_rule_fwd_kernel( b_g = tl.load(p_g).to(tl.float32) if USE_QK_L2NORM_IN_KERNEL: - b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6) - b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6) + b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6) + b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6) b_q = b_q * scale # [BK, BV] b_h *= exp(b_g) diff --git a/vllm/model_executor/layers/fla/ops/l2norm.py b/vllm/model_executor/layers/fla/ops/l2norm.py index b89c67871d071..ef9788ceaf20e 100644 --- a/vllm/model_executor/layers/fla/ops/l2norm.py +++ b/vllm/model_executor/layers/fla/ops/l2norm.py @@ -78,7 +78,7 @@ def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr): row_idx = xoffset + tl.arange(0, MBLOCK)[:, None] xmask = row_idx < M rindex = tl.arange(0, N)[None, :] - xs = tl.load(X + (rindex + N * row_idx), None).to(tl.float32) + xs = tl.load(X + (rindex + N * row_idx), xmask).to(tl.float32) square = tl.broadcast_to(xs * xs, [MBLOCK, N]) square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None] rsqrt = tl.rsqrt(square_sum + eps) diff --git a/vllm/model_executor/layers/fla/ops/op.py b/vllm/model_executor/layers/fla/ops/op.py index 
05c424b437f48..8c29434ca106a 100644 --- a/vllm/model_executor/layers/fla/ops/op.py +++ b/vllm/model_executor/layers/fla/ops/op.py @@ -28,11 +28,6 @@ else: log2 = tl.log2 -@triton.jit -def safe_exp(x): - return exp(tl.where(x <= 0, x, float('-inf'))) - - if not hasattr(tl, 'gather'): @triton.jit diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 1dc46639640b0..a6c1af91de421 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -70,6 +70,15 @@ class MambaStateDtypeCalculator: model_dtype) return (conv_state_dtype, ) + @classmethod + def gated_delta_net_state_dtype( + cls, + model_dtype: Union[ModelDType, torch.dtype], + mamba_cache_dtype: MambaDType, + ) -> tuple[torch.dtype, torch.dtype]: + state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) + return (state_dtype, state_dtype) + class MambaStateShapeCalculator: @@ -163,3 +172,31 @@ class MambaStateShapeCalculator: # for n_groups == 1, this is exactly tp_size - n_groups return tp_size - ngroups + + @classmethod + def gated_delta_net_state_shape( + cls, + tp_world_size: int, + num_k_heads: int, + num_v_heads: int, + head_k_dim: int, + head_v_dim: int, + conv_kernel_size: int, + num_spec: int = 0, + use_v1: bool = True, + ): + conv_dim = (head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads) + conv_state_shape = ( + divide(conv_dim, tp_world_size), + conv_kernel_size - 1 + num_spec, + ) + + # In V0, the conv_state shape was swapped during allocation in + # MambaCacheManager, but in V1 it needs to be determined here at the + # calculation level + if use_v1: + conv_state_shape = conv_state_shape[1], conv_state_shape[0] + + temporal_state_shape = (divide(num_v_heads, + tp_world_size), head_k_dim, head_v_dim) + return conv_state_shape, temporal_state_shape diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index b8d4bbc37105d..709794429c7b6 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -464,7 +464,9 @@ def causal_conv1d_fn( # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx] # 4. 
computation can be skipped if cache_indices[idx] == pad_slot_id num_cache_lines = conv_states.size(0) - assert (num_cache_lines, dim, width - 1) == conv_states.shape + assert (num_cache_lines == conv_states.shape[0] + and dim == conv_states.shape[1] + and width - 1 <= conv_states.shape[2]) stride_istate_seq = conv_states.stride(0) stride_istate_dim = conv_states.stride(1) stride_istate_token = conv_states.stride(2) @@ -623,6 +625,7 @@ def _causal_conv1d_update_kernel( conv_state_ptr, cache_seqlens_ptr, # circular buffer conv_state_indices_ptr, + num_accepted_tokens_ptr, o_ptr, # (batch, dim, seqlen) # Matrix dimensions batch: int, @@ -639,6 +642,7 @@ def _causal_conv1d_update_kernel( stride_conv_state_seq: tl.constexpr, stride_conv_state_dim: tl.constexpr, stride_conv_state_tok: tl.constexpr, + stride_state_indices: tl.constexpr, stride_o_seq: tl.constexpr, stride_o_dim: tl.constexpr, stride_o_token: tl.constexpr, @@ -649,6 +653,7 @@ def _causal_conv1d_update_kernel( KERNEL_WIDTH: tl.constexpr, SILU_ACTIVATION: tl.constexpr, IS_CONTINUOUS_BATCHING: tl.constexpr, + IS_SPEC_DECODING: tl.constexpr, NP2_STATELEN: tl.constexpr, USE_PAD_SLOT: tl.constexpr, BLOCK_N: tl.constexpr, @@ -663,8 +668,9 @@ def _causal_conv1d_update_kernel( if IS_CONTINUOUS_BATCHING: # mask = idx_seq < batch - conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to( - tl.int64) + conv_state_batch_coord = tl.load(conv_state_indices_ptr + + idx_seq * stride_state_indices).to( + tl.int64) else: conv_state_batch_coord = idx_seq if USE_PAD_SLOT: # noqa @@ -672,13 +678,32 @@ def _causal_conv1d_update_kernel( # not processing as this is not the actual sequence return + if IS_SPEC_DECODING: + # The rolling of the conv state: + # + # Before forward, the conv_state is: + # [history1, history2, ..., historyM]. + # + # After forward, the conv_state becomes: + # [history2, ..., historyM, draft1, draft2, ..., draftN]. + # + # After acceptance, it becomes: + # + # - accept 1 token: [history2, ..., historyM, draft1] + # - accept 2 tokens: [history3, ..., historyM, draft1, draft2] + # - and so on. + conv_state_token_offset = (tl.load(num_accepted_tokens_ptr + idx_seq) - + 1) + else: + conv_state_token_offset = 0 + # STEP 1: READ init_state data conv_states_base = (conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) + (idx_feats * stride_conv_state_dim)) mask_w = idx_feats < dim - prior_tokens = conv_states_base + prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok if KERNEL_WIDTH >= 2: conv_states_ptrs = prior_tokens # [BLOCK_N] col0 = tl.load(conv_states_ptrs, mask_w, 0.0) @@ -695,10 +720,14 @@ def _causal_conv1d_update_kernel( # STEP 2: assume state_len > seqlen idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + # The conv_state update works in a sliding-window manner: + # at each forward pass, the tokens are shifted by 1, so we + # load starting from idx_tokens + 1. 
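+ # (Editor's sketch, not part of the upstream change) An eager-mode
+ # reference for the sliding window described above, assuming a
+ # conv_state of shape (dim, state_len) and seqlen new tokens x:
+ #   rolled = torch.cat([conv_state[:, seqlen:], x], dim=-1)
+ # i.e. the oldest entries fall out and the drafts are appended; the
+ # loads below realize the per-step variant of this shift via the
+ # (idx_tokens + 1) offset.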
conv_state_ptrs_source = ( conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) + + conv_state_token_offset * stride_conv_state_tok + (idx_feats * stride_conv_state_dim)[None, :] + - ((idx_tokens + seqlen) * stride_conv_state_tok)[:, None] + ((idx_tokens + 1) * stride_conv_state_tok)[:, None] ) # [BLOCK_M, BLOCK_N] mask = ((conv_state_batch_coord < num_cache_lines) & ((idx_tokens + seqlen) < state_len)[:, None] @@ -820,6 +849,7 @@ def causal_conv1d_update( activation: Union[bool, str, None] = None, cache_seqlens: Optional[torch.Tensor] = None, conv_state_indices: Optional[torch.Tensor] = None, + num_accepted_tokens: Optional[torch.Tensor] = None, pad_slot_id: int = PAD_SLOT_ID, metadata=None, validate_data=False, @@ -890,10 +920,11 @@ def causal_conv1d_update( ) # X (batch, dim, seqlen) stride_o_seq, stride_o_dim, stride_o_token = out.stride() - stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride( ) - state_len = width - 1 + stride_state_indices = conv_state_indices.stride( + 0) if conv_state_indices is not None else 0 + state_len = width - 1 + (seqlen - 1) # effective state_len needed np2_statelen = triton.next_power_of_2(state_len) def grid(META): @@ -910,6 +941,7 @@ def causal_conv1d_update( conv_state, cache_seqlens, conv_state_indices, + num_accepted_tokens, out, # Matrix dimensions batch, @@ -926,6 +958,7 @@ def causal_conv1d_update( stride_istate_seq, stride_istate_dim, stride_istate_token, + stride_state_indices, stride_o_seq, stride_o_dim, stride_o_token, @@ -936,6 +969,7 @@ def causal_conv1d_update( KERNEL_WIDTH=width, SILU_ACTIVATION=activation in ["silu", "swish"], IS_CONTINUOUS_BATCHING=conv_state_indices is not None, + IS_SPEC_DECODING=num_accepted_tokens is not None, NP2_STATELEN=np2_statelen, USE_PAD_SLOT=pad_slot_id is not None, BLOCK_N=256, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index f38e7fc202209..687af7a189cea 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -312,7 +312,8 @@ class MambaModelConfig(VerifyAndUpdateConfig): # TODO(tdoublep): remove as full cuda graph support is added FCG_NOT_SUPPORTED_MODELS = [ - "Lfm2ForCausalLM", "MiniMaxText01ForCausalLM" + "Lfm2ForCausalLM", + "MiniMaxText01ForCausalLM", ] if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py new file mode 100644 index 0000000000000..12db3d9461ef8 --- /dev/null +++ b/vllm/model_executor/models/qwen3_next.py @@ -0,0 +1,1294 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only Qwen3Next model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn +from transformers.activations import ACT2FN + +from vllm import envs +from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig, + VllmConfig, get_current_vllm_config) +from vllm.distributed import (divide, get_ep_group, get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.layers.fla.ops import ( + RMSNormGated, 
chunk_gated_delta_rule, fused_recurrent_gated_delta_rule) +from vllm.model_executor.layers.fused_moe import FusedMoE +# yapf conflicts with isort for this block +# yapf: disable +from vllm.model_executor.layers.layernorm import ( + GemmaRMSNorm as Qwen3NextRMSNorm) +# yapf: enable +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_mixer2 import ( + mamba_v2_sharded_weight_loader) +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, MambaStateShapeCalculator) +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.mamba_cache import MambaCacheParams +from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import Qwen3NextConfig +from vllm.triton_utils import tl, triton +from vllm.utils import direct_register_custom_op +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata + +from .interfaces import (HasInnerState, IsHybrid, MixtureOfExperts, + SupportsLoRA, SupportsPP) +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + +KVCache = tuple[torch.Tensor, torch.Tensor] + + +class Qwen3NextSparseMoeBlock(nn.Module): + + def __init__( + self, + config: Qwen3NextConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + enable_eplb: bool = False, + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + + if self.tp_size > config.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.num_experts}.") + + # Load balancing settings. 
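+ # (Editor's note, illustrative values only) With hypothetical
+ # n_routed_experts=512, num_redundant_experts=16 and ep_size=8, the
+ # fields computed below come out to:
+ #   n_physical_experts = 512 + 16 = 528
+ #   n_local_physical_experts = 528 // 8 = 66
+ #   physical_expert_start = ep_rank * 66 (e.g. 198 on rank 3)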
+ vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + + self.experts = FusedMoE(num_experts=self.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) + + self.gate = ReplicatedLinear( + config.hidden_size, + config.num_experts, + bias=False, + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=f"{prefix}.gate") + + if config.shared_expert_intermediate_size > 0: + self.shared_expert = Qwen3NextMLP( + hidden_size=config.hidden_size, + intermediate_size=config.shared_expert_intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), + ) + else: + self.shared_expert = None + self.shared_expert_gate = torch.nn.Linear(config.hidden_size, + 1, + bias=False) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid gate quantization. + # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # NOTE: hidden_states can have either 1D or 2D shape. 
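+ # (Editor's note) e.g. a hypothetical (batch=4, seq=7, hidden=16)
+ # input is viewed as (28, 16) below so the routed and shared experts
+ # see one flat token axis; the caller's layout is restored at the
+ # end via final_hidden_states.view(orig_shape).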
+ orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + + shared_output = None + if self.shared_expert is not None: + shared_output = self.shared_expert(hidden_states) + if self.shared_expert_gate is not None: + shared_output = F.sigmoid( + self.shared_expert_gate(hidden_states)) * shared_output + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts(hidden_states=hidden_states, + router_logits=router_logits) + + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + if self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 + final_hidden_states) + + return final_hidden_states.view(orig_shape) + + +class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): + + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend + return GDNAttentionBackend + + def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + self.model_config.dtype, self.cache_config.mamba_cache_dtype) + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.gated_delta_net_state_shape( + self.tp_size, + self.num_k_heads, + self.num_v_heads, + self.head_k_dim, + self.head_v_dim, + self.conv_kernel_size, + self.num_spec, + use_v1=True) + + def __init__( + self, + config: Qwen3NextConfig, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = config.hidden_size + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.key_dim = self.head_k_dim * self.num_k_heads + self.value_dim = self.head_v_dim * self.num_v_heads + + self.conv_kernel_size = config.linear_conv_kernel_dim + self.layer_idx = extract_layer_index(prefix) + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.layer_norm_epsilon = config.rms_norm_eps + self.prefix = prefix + + self.config = config + self.model_config = model_config + self.cache_config = cache_config + self.quant_config = quant_config + self.speculative_config = speculative_config + self.num_spec = (self.speculative_config.num_speculative_tokens + if self.speculative_config else 0) + + # QKV + self.conv_dim = self.key_dim * 2 + self.value_dim + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.conv_dim, + bias=False, + prefix=f"{prefix}.conv1d", + ) + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + # projection of the input hidden states + self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2 + self.projection_size_ba = self.num_v_heads * 2 + self.in_proj = MergedColumnParallelLinear( + input_size=self.hidden_size, + output_sizes=[self.projection_size_qkvz, self.projection_size_ba], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj", 
+ ) + + query_key_settings = (self.key_dim, 0, False) + value_settings = (self.value_dim, 0, False) + + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, { + "weight_loader": + mamba_v2_sharded_weight_loader([ + query_key_settings, + query_key_settings, + value_settings, + ], self.tp_size, self.tp_rank) + }) + + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter( + torch.ones(self.num_v_heads // self.tp_size), ) + self.A_log = nn.Parameter( + torch.empty( + divide(self.num_v_heads, self.tp_size), + dtype=torch.float32, + )) + + set_weight_attrs(self.A_log, + {"weight_loader": sharded_weight_loader(0)}) + set_weight_attrs(self.dt_bias, + {"weight_loader": sharded_weight_loader(0)}) + + self.norm = RMSNormGated( + self.head_v_dim, + eps=self.layer_norm_epsilon, + group_size=None, + norm_before_gate=True, + device=torch.cuda.current_device(), + dtype=config.torch_dtype, + ) + + self.out_proj = RowParallelLinear(self.value_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + quant_config=quant_config, + prefix=f"{prefix}.out_proj") + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + def fix_query_key_value_ordering( + self, + mixed_qkvz, + mixed_ba, + ): + """ + Derives `query`, `key` and `value` tensors from `mixed_qkvzba`. + """ + new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + ( + self.num_k_heads // self.tp_size, + (self.head_k_dim + self.head_k_dim + + (self.head_v_dim + self.head_v_dim) * self.num_v_heads // + self.num_k_heads), + ) + new_tensor_shape_ba = mixed_qkvz.size()[:-1] + ( + self.num_k_heads // self.tp_size, + 2 * self.num_v_heads // self.num_k_heads, + ) + + mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz) + mixed_ba = mixed_ba.view(*new_tensor_shape_ba) + + split_arg_list_qkvz = [ + self.head_k_dim, + self.head_k_dim, + (self.num_v_heads // self.num_k_heads * self.head_v_dim), + (self.num_v_heads // self.num_k_heads * self.head_v_dim), + ] + split_arg_list_ba = [ + self.num_v_heads // self.num_k_heads, + self.num_v_heads // self.num_k_heads + ] + + # [b, sq, ng, (hn + hn + np/ng * hn + np/ng + np/ng)] + # --> [b, sq, ng, hn], [b, sq, ng, hn], [b, sq, ng, np/ng * hn], + # [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng], [b, sq, ng, np/ng] + (query, key, value, z) = torch.split(mixed_qkvz, + split_arg_list_qkvz, + dim=2) + (b, a) = torch.split(mixed_ba, split_arg_list_ba, dim=2) + + # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn] + value = value.reshape(value.size(0), -1, self.head_v_dim) + z = z.reshape(z.size(0), -1, self.head_v_dim) + b = b.reshape(b.size(0), self.num_v_heads // self.tp_size) + a = a.reshape(a.size(0), self.num_v_heads // self.tp_size) + + return query, key, value, z, b, a + + def rearrange_mixed_qkv(self, mixed_qkv): + if mixed_qkv is None: + return None, None, None + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim // self.tp_size, + self.key_dim // self.tp_size, + self.value_dim // self.tp_size, + ], + dim=-1, + ) + query, key = map( + lambda x: rearrange(x, 'l (h d) -> 1 l h d', d=self.head_k_dim), + (query, key)) + value = rearrange(value, 'l (h d) -> 1 l h d', d=self.head_v_dim) + return query, key, value + + def forward( + self, + 
hidden_states: torch.Tensor, + output: torch.Tensor, + cache_params: Optional[MambaCacheParams] = None, + ): + return torch.ops.vllm.gdn_attention( + hidden_states, + output, + self.prefix, + ) + + def _forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ): + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + + if attn_metadata is None: + # V1 profile run + return + + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, GDNAttentionMetadata) + has_initial_state = attn_metadata.has_initial_state + spec_query_start_loc = attn_metadata.spec_query_start_loc + non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc + spec_sequence_masks = attn_metadata.spec_sequence_masks + spec_token_masks = attn_metadata.spec_token_masks + spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 + non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + num_actual_tokens = (attn_metadata.num_prefill_tokens + + attn_metadata.num_decode_tokens + + attn_metadata.num_spec_decode_tokens) + num_accepted_tokens = attn_metadata.num_accepted_tokens + + # 1. Set up dimensions for reshapes later + projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens]) + if spec_token_masks is not None: + spec_token_masks = spec_token_masks[:num_actual_tokens] + projected_states_qkvz, projected_states_ba = torch.split( + projected_states, + [ + self.projection_size_qkvz // self.tp_size, + self.projection_size_ba // self.tp_size + ], + dim=-1, + ) + query, key, value, z, b, a = self.fix_query_key_value_ordering( + projected_states_qkvz, projected_states_ba) + query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'), + (query, key, value)) + mixed_qkv = torch.cat((query, key, value), dim=-1) + + # 2. 
Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if spec_sequence_masks is not None: + if (attn_metadata.num_prefills == 0 + and attn_metadata.num_decodes == 0): + mixed_qkv_spec = mixed_qkv + mixed_qkv_non_spec = None + else: + mixed_qkv_spec = mixed_qkv[spec_token_masks] + mixed_qkv_non_spec = mixed_qkv[~spec_token_masks] + else: + mixed_qkv_spec = None + mixed_qkv_non_spec = mixed_qkv + + # 2.1: process the multi-query part + if spec_sequence_masks is not None: + mixed_qkv_spec = mixed_qkv_spec.view( + attn_metadata.num_spec_decodes, -1, mixed_qkv_spec.size(-1)) + mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b l d -> b d l') + mixed_qkv_spec = causal_conv1d_update( + mixed_qkv_spec, + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=spec_state_indices_tensor[:, 0] + [:attn_metadata.num_spec_decodes], + num_accepted_tokens=num_accepted_tokens, + validate_data=False, + ) + mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b d l -> (b l) d') + + # 2.2: process the remaining part + if attn_metadata.num_prefills > 0: + # - "cache_indices" updates the conv_state cache in positions + # pointed to by "mamba_cache_params.state_indices_tensor" + mixed_qkv_non_spec = causal_conv1d_fn( + mixed_qkv_non_spec.transpose(0, 1), + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=conv_state, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + ).transpose(0, 1) + elif attn_metadata.num_decodes > 0: + mixed_qkv_non_spec = causal_conv1d_update( + mixed_qkv_non_spec, + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=non_spec_state_indices_tensor[:attn_metadata + .num_decodes], + validate_data=True, + ) + else: + mixed_qkv_non_spec = None + + query_spec, key_spec, value_spec = self.rearrange_mixed_qkv( + mixed_qkv_spec) + query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv( + mixed_qkv_non_spec) + + beta = b.sigmoid() + # g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) + g = fused_gdn_gating(self.A_log, a, self.dt_bias) + g, beta = map(lambda x: rearrange(x, 'l d -> 1 l d'), (g, beta)) + + if spec_sequence_masks is not None: + if (attn_metadata.num_prefills == 0 + and attn_metadata.num_decodes == 0): + g_spec = g + beta_spec = beta + g_non_spec = None + beta_non_spec = None + else: + g_spec = g[:, spec_token_masks] + beta_spec = beta[:, spec_token_masks] + g_non_spec = g[:, ~spec_token_masks] + beta_non_spec = beta[:, ~spec_token_masks] + else: + g_spec = None + beta_spec = None + g_non_spec = g + beta_non_spec = beta + + # 3. Recurrent attention + + # 3.1: process the multi-query part + if spec_sequence_masks is not None: + core_attn_out_spec, last_recurrent_state = ( + fused_recurrent_gated_delta_rule( + q=query_spec, + k=key_spec, + v=value_spec, + g=g_spec, + beta=beta_spec, + initial_state=ssm_state, + inplace_final_state=True, + cu_seqlens=spec_query_start_loc[:attn_metadata. + num_spec_decodes + 1], + ssm_state_indices=spec_state_indices_tensor, + num_accepted_tokens=num_accepted_tokens, + use_qk_l2norm_in_kernel=True, + )) + else: + core_attn_out_spec, last_recurrent_state = None, None + + # 3.2: process the remaining part + if attn_metadata.num_prefills > 0: + initial_state = ssm_state[ + non_spec_state_indices_tensor].contiguous() + initial_state[~has_initial_state, ...] 
= 0 + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = chunk_gated_delta_rule( + q=query_non_spec, + k=key_non_spec, + v=value_non_spec, + g=g_non_spec, + beta=beta_non_spec, + initial_state=initial_state, + output_final_state=True, + cu_seqlens=non_spec_query_start_loc, + head_first=False, + use_qk_l2norm_in_kernel=True, + ) + # Init cache + ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to( + ssm_state.dtype) + elif attn_metadata.num_decodes > 0: + core_attn_out_non_spec, last_recurrent_state = ( + fused_recurrent_gated_delta_rule( + q=query_non_spec, + k=key_non_spec, + v=value_non_spec, + g=g_non_spec, + beta=beta_non_spec, + initial_state=ssm_state, + inplace_final_state=True, + cu_seqlens=non_spec_query_start_loc[:attn_metadata. + num_decodes + 1], + ssm_state_indices=non_spec_state_indices_tensor, + use_qk_l2norm_in_kernel=True, + )) + else: + core_attn_out_non_spec, last_recurrent_state = None, None + + # Merge core attention output + if (spec_sequence_masks is not None + and core_attn_out_non_spec is not None): + core_attn_out = torch.empty( + (1, num_actual_tokens, *core_attn_out_spec.shape[2:]), + dtype=core_attn_out_non_spec.dtype, + device=core_attn_out_non_spec.device, + ) + core_attn_out[:, spec_token_masks] = core_attn_out_spec + core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec + elif spec_sequence_masks is not None: + core_attn_out = core_attn_out_spec + else: + core_attn_out = core_attn_out_non_spec + + z_shape_og = z.shape + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1]) + z = z.reshape(-1, z.shape[-1]) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(z_shape_og) + core_attn_out = rearrange(core_attn_out, '... h d -> ... (h d)') + + output[:num_actual_tokens], _ = self.out_proj(core_attn_out) + + +class Qwen3NextAttention(nn.Module): + + def __init__( + self, + config: Qwen3NextConfig, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
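+ # (Editor's note) e.g. with hypothetical total_num_kv_heads=2 and
+ # tp_size=8, the assert below requires 8 % 2 == 0 and each rank
+ # keeps max(1, 2 // 8) = 1 replicated KV head; with
+ # total_num_kv_heads=8 and tp_size=4 the branch above applies and
+ # each rank gets 8 // 4 = 2 partitioned KV heads.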
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.head_dim or (self.hidden_size // self.num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.dual_chunk_attention_config = getattr( + config, "dual_chunk_attention_config", None) + self.attn_output_gate = getattr(config, "attn_output_gate", True) + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads * (1 + self.attn_output_gate), + self.total_num_kv_heads, + bias=getattr(config, "qkv_bias", False), + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position=config.max_position_embeddings, + base=config.rope_theta, + rope_scaling=config.rope_scaling, + partial_rotary_factor=config.partial_rotary_factor, + dual_chunk_attention_config=self.dual_chunk_attention_config, + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": + self.dual_chunk_attention_config, + } if self.dual_chunk_attention_config else {}, + ) + + self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + output: torch.Tensor, + hidden_states: torch.Tensor, + ): + qkv, _ = self.qkv_proj(hidden_states) + + if self.attn_output_gate: + q_gate, k, v = qkv.split( + [self.q_size * 2, self.kv_size, self.kv_size], dim=-1) + orig_shape = q_gate.shape[:-1] + q_gate = q_gate.view(*orig_shape, self.num_heads, -1) + q, gate = torch.chunk(q_gate, 2, dim=-1) + q = q.reshape(*orig_shape, -1) + gate = gate.reshape(*orig_shape, -1) + else: + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], + dim=-1) + + q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view( + -1, self.num_heads * self.head_dim) + k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)).view( + -1, self.num_kv_heads * self.head_dim) + + q, k = self.rotary_emb(positions, q, k) + + attn_output = self.attn(q, k, v) + + if self.attn_output_gate: + gate = torch.sigmoid(gate) + attn_output = attn_output * gate + + output[:], _ = self.o_proj(attn_output) + + +class Qwen3NextDecoderLayer(nn.Module): + + def __init__( + self, + config: Qwen3NextConfig, + layer_type: str, + model_config: Optional[ModelConfig] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prefix: str = "", + enable_eplb: bool = False, + ) -> None: + super().__init__() + self.config = config + + self.layer_type = layer_type + self.layer_idx = extract_layer_index(prefix) + + if self.layer_type == "linear_attention": + self.linear_attn = Qwen3NextGatedDeltaNet( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + speculative_config=speculative_config, + prefix=f'{prefix}.linear_attn') + elif self.layer_type == 
"full_attention": + self.self_attn = Qwen3NextAttention( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f'{prefix}.self_attn', + ) + else: + raise ValueError(f"Invalid layer_type {self.layer_type}") + + mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else + config.mlp_only_layers) + if (self.layer_idx not in mlp_only_layers) and ( + config.num_experts > 0 and + (self.layer_idx + 1) % config.decoder_sparse_step == 0): + self.mlp = Qwen3NextSparseMoeBlock( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, + ) + else: + self.mlp = Qwen3NextMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + ) + + self.input_layernorm = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen3NextRMSNorm( + config.hidden_size, eps=config.rms_norm_eps) + + self.layer_scale = getattr(config, "layer_scale", False) + if self.layer_scale: + self.attn_layer_scale = torch.nn.Parameter( + torch.zeros( + 1, + 1, + self.config.hidden_size, + dtype=config.torch_dtype, + ), ) + self.ffn_layer_scale = torch.nn.Parameter( + torch.zeros( + 1, + 1, + self.config.hidden_size, + dtype=config.torch_dtype, + ), ) + + def forward( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + positions: torch.Tensor = None, + **kwargs: object, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + self_attention_output = torch.empty_like(hidden_states) + if self.layer_type == "linear_attention": + self.linear_attn( + hidden_states=hidden_states, + output=self_attention_output, + ) + elif self.layer_type == "full_attention": + self.self_attn( + hidden_states=hidden_states, + output=self_attention_output, + positions=positions, + ) + else: + raise ValueError("Invalid layer_type") + hidden_states = self_attention_output + + if self.layer_scale: + if len(hidden_states.shape) == 2: + hidden_states = hidden_states * ( + self.attn_layer_scale.to(hidden_states.dtype)[0] + 1) + else: + hidden_states = hidden_states * ( + self.attn_layer_scale.to(hidden_states.dtype) + 1) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + if self.layer_scale: + if len(hidden_states.shape) == 2: + hidden_states = hidden_states * ( + self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1) + else: + assert len(hidden_states.shape) == len( + self.ffn_layer_scale.shape + ), f'shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}' # noqa: E501 + hidden_states = hidden_states * ( + self.ffn_layer_scale.to(hidden_states.dtype) + 1) + + return hidden_states, residual + + +@support_torch_compile +class Qwen3NextModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config: Qwen3NextConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + lora_config = vllm_config.lora_config + speculative_config = vllm_config.speculative_config + enable_eplb = parallel_config.enable_eplb + eplb_config = 
parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts + + self.config = config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + def get_layer(prefix: str): + return Qwen3NextDecoderLayer( + config, + layer_type=config.layer_types[extract_layer_index(prefix)], + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + speculative_config=speculative_config, + prefix=prefix, + enable_eplb=enable_eplb, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + self.norm = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers: + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + num_redundant_experts=self.num_redundant_experts) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("in_proj", "in_proj_qkvz", 0), + ("in_proj", "in_proj_ba", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if name.startswith("mtp."): + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + if "mlp.experts" in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. 
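+ # (Editor's note) e.g. a hypothetical checkpoint key
+ # "model.layers.3.linear_attn.in_proj_qkvz.weight" is renamed by
+ # the ("in_proj", "in_proj_qkvz", 0) entry above to
+ # "model.layers.3.linear_attn.in_proj.weight" before lookup, and
+ # is then loaded as shard 0 of the merged in_proj parameter.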
+ if is_pp_missing_parameter(name, self): + continue + # name = apply_attn_prefix(name, params_dict) + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + MixtureOfExperts, IsHybrid): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["up_proj", "down_proj"], + "in_proj": ["in_proj_qkvz", "in_proj_ba"], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config + assert not cache_config.enable_prefix_caching, \ + "Qwen3Next currently does not support prefix caching" + assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1" + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = Qwen3NextModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + # Set MoE hyperparameters + self.expert_weights = [] + + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Qwen3NextDecoderLayer) + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = 
example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ): + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + + return hidden_states + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + tp_size = parallel_config.tensor_parallel_size + num_spec = (vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config else 0) + return MambaStateShapeCalculator.gated_delta_net_state_shape( + tp_size, + hf_config.linear_num_key_heads, + hf_config.linear_num_value_heads, + hf_config.linear_key_head_dim, + hf_config.linear_value_head_dim, + hf_config.linear_conv_kernel_dim, + num_spec, + use_v1=True) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=["mtp."], + ) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def gdn_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = 
forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, output=output) + + +def gdn_attention_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="gdn_attention", + op_func=gdn_attention, + mutates_args=["output"], + fake_impl=gdn_attention_fake, + dispatch_key=current_platform.dispatch_key, +) + + +# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) +@triton.jit +def fused_gdn_gating_kernel( + g, + A_log, + a, + dt_bias, + seq_len, + NUM_HEADS: tl.constexpr, + beta: tl.constexpr, + threshold: tl.constexpr, + BLK_HEADS: tl.constexpr, +): + i_b, i_s, i_d = tl.program_id(0), tl.program_id(1), tl.program_id(2) + head_off = i_d * BLK_HEADS + tl.arange(0, BLK_HEADS) + off = i_b * seq_len * NUM_HEADS + i_s * NUM_HEADS + head_off + mask = head_off < NUM_HEADS + blk_A_log = tl.load(A_log + head_off, mask=mask) + blk_a = tl.load(a + off, mask=mask) + blk_bias = tl.load(dt_bias + head_off, mask=mask) + # If the model is loaded in fp16, without the .float() here, A might be -inf + x = blk_a.to(tl.float32) + blk_bias.to(tl.float32) + softplus_x = tl.where(beta * x <= threshold, + (1 / beta) * tl.log(1 + tl.exp(beta * x)), x) + blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x + tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask) + + +def fused_gdn_gating( + A_log: torch.Tensor, + a: torch.Tensor, + dt_bias: torch.Tensor, + beta: float = 1.0, + threshold: float = 20.0, +) -> torch.Tensor: + batch, num_heads = a.shape + seq_len = 1 + grid = (batch, seq_len, triton.cdiv(num_heads, 8)) + g = torch.empty_like(a, dtype=torch.float32) + fused_gdn_gating_kernel[grid](g, + A_log, + a, + dt_bias, + seq_len, + num_heads, + beta, + threshold, + 8, + num_warps=1) + return g diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py new file mode 100644 index 0000000000000..e7aff377e9aec --- /dev/null +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only Qwen3Next MTP model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +from torch import nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import ColumnParallelLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer, + Qwen3NextRMSNorm) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import Qwen3NextConfig + +from .interfaces import SupportsPP +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, maybe_prefix) + +logger = init_logger(__name__) + +KVCache = tuple[torch.Tensor, torch.Tensor] + + +@support_torch_compile +class Qwen3NextMultiTokenPredictor(nn.Module): + + def 
__init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + config: Qwen3NextConfig = model_config.hf_config + + self.config = config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1) + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + self.fc = ColumnParallelLinear(self.config.hidden_size * 2, + self.config.hidden_size, + gather_output=True, + bias=False, + return_bias=False) + + self.layers = torch.nn.ModuleList( + Qwen3NextDecoderLayer( + config, + layer_type="full_attention", + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f'{prefix}.layers.{self.mtp_start_layer_idx + idx}', + ) for idx in range(self.num_mtp_layers)) + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + self.norm = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_fc_norm_hidden = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_fc_norm_embedding = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) + assert hidden_states.shape[-1] == inputs_embeds.shape[-1] + inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds) + hidden_states = self.pre_fc_norm_hidden(hidden_states) + hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1) + hidden_states = self.fc(hidden_states) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + current_step_idx = (spec_step_idx % self.num_mtp_layers) + hidden_states, residual = self.layers[current_step_idx]( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + 
ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts)
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@support_torch_compile
+class Qwen3NextMTP(nn.Module, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        # gate_up_proj fuses the gate and up projections (see
+        # stacked_params_mapping above); down_proj is a standalone layer.
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        cache_config = vllm_config.cache_config
+        assert not cache_config.enable_prefix_caching, \
+            "Qwen3NextMTP currently does not support prefix caching"
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config,
+                                                  prefix=maybe_prefix(
+                                                      prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        self.lm_head = ParallelLMHead(self.unpadded_vocab_size,
+                                      config.hidden_size,
+                                      org_num_embeddings=config.vocab_size,
+                                      padding_size=DEFAULT_VOCAB_PADDING_SIZE)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+        hidden_states = self.model(input_ids, positions, hidden_states,
+                                   intermediate_tensors, inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        spec_step_idx: int = 0,
+    ) -> Optional[torch.Tensor]:
+        
return self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + shared_weight_names = ["embed_tokens", "lm_head"] + + def remap_weight_names(weights): + for name, weight in weights: + if name.startswith("mtp."): + name = name.replace("mtp.", "model.") + elif not any(key in name for key in shared_weight_names): + continue + yield name, weight + + loader = AutoWeightsLoader(self) + return loader.load_weights(remap_weight_names(weights)) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8fe8402c15d3d..7d7654e846e1c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -74,6 +74,7 @@ _TEXT_GENERATION_MODELS = { "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"), "Gemma3nForCausalLM": ("gemma3n", "Gemma3nForCausalLM"), + "Qwen3NextForCausalLM": ("qwen3_next", "Qwen3NextForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), @@ -285,6 +286,7 @@ _SPECULATIVE_DECODING_MODELS = { "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), + "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"), # Temporarily disabled. # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d6ebcdf805252..2852d16ec53f4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -79,7 +79,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( ultravox="UltravoxConfig", step3_vl="Step3VLConfig", step3_text="Step3TextConfig", -) + qwen3_next="Qwen3NextConfig") _CONFIG_ATTRS_MAPPING: dict[str, str] = { "llm_config": "text_config", diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f651ecb078b95..cdae59ccc24e0 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.ovis import OvisConfig +from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, @@ -50,4 +51,5 @@ __all__ = [ "Step3VLConfig", "Step3VisionEncoderConfig", "Step3TextConfig", + "Qwen3NextConfig", ] diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py new file mode 100644 index 0000000000000..c7af26acd1b9f --- /dev/null +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -0,0 +1,275 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. +# All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3-Next model configuration"""
+
+from transformers.configuration_utils import (PretrainedConfig,
+                                              layer_type_validation)
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3NextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
+    Qwen3-Next model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `input_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
+            type and you expect the model to work on longer `max_position_embeddings`, we recommend you update this
+            value accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+ linear_num_key_heads (`int`, *optional*, defaults to 16): + Number of key heads used in linear attention layers. + linear_num_value_heads (`int`, *optional*, defaults to 32): + Number of value heads used in linear attention layers. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 512): + Intermediate size of the routed expert. + shared_expert_intermediate_size (`int`, *optional*, defaults to 512): + Intermediate size of the shared expert. + num_experts_per_tok (`int`, *optional*, defaults to 10): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 512): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `True`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`list[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. + layer_types (`list[str]`, *optional*): + Types of each layer (attention or linear). + + ```python + >>> from transformers import Qwen3NextModel, Qwen3NextConfig + + >>> # Initializing a Qwen3Next style configuration + >>> configuration = Qwen3NextConfig() + + >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration + >>> model = Qwen3NextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ # noqa: E501 + + model_type = "qwen3_next" + keys_to_ignore_at_inference = ["past_key_values"] + + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.experts.*.gate_proj": "colwise", + "layers.*.mlp.experts.*.up_proj": "colwise", + "layers.*.mlp.experts.*.down_proj": "rowwise", + "layers.*.mlp.shared_experts.gate_proj": "colwise", + "layers.*.mlp.shared_experts.up_proj": "colwise", + "layers.*.mlp.shared_experts.down_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=151936, + hidden_size=2048, + intermediate_size=5632, + num_hidden_layers=48, + num_attention_heads=16, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + partial_rotary_factor=0.25, + attention_bias=False, + attention_dropout=0.0, + head_dim=256, + linear_conv_kernel_dim=4, + linear_key_head_dim=128, + linear_value_head_dim=128, + linear_num_key_heads=16, + linear_num_value_heads=32, + decoder_sparse_step=1, + moe_intermediate_size=512, + 
shared_expert_intermediate_size=512, + num_experts_per_tok=10, + num_experts=512, + norm_topk_prob=True, + output_router_logits=False, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + layer_types=None, + **kwargs, + ): + if mlp_only_layers is None: + mlp_only_layers = [] + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.partial_rotary_factor = partial_rotary_factor + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.head_dim = head_dim + rope_config_validation(self) + + self.layer_types = layer_types + if self.layer_types is None: + self.layer_types = [ + "linear_attention" if bool((i + 1) % 4) else "full_attention" + for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types) + + # linear attention part + self.linear_conv_kernel_dim = linear_conv_kernel_dim + self.linear_key_head_dim = linear_key_head_dim + self.linear_value_head_dim = linear_value_head_dim + self.linear_num_key_heads = linear_num_key_heads + self.linear_num_value_heads = linear_num_value_heads + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.shared_expert_intermediate_size = shared_expert_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.mlp_only_layers = mlp_only_layers + + +__all__ = ["Qwen3NextConfig"] diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py new file mode 100644 index 0000000000000..12233af057b04 --- /dev/null +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -0,0 +1,319 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Backend for GatedDeltaNet attention.""" +from dataclasses import dataclass +from typing import ClassVar, Optional + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, + split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + + +class GDNAttentionBackend(AttentionBackend): + + @staticmethod + def get_builder_cls() -> type["GDNAttentionMetadataBuilder"]: + return GDNAttentionMetadataBuilder + + +@dataclass +class GDNAttentionMetadata: + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int + num_spec_decodes: int + num_spec_decode_tokens: int + + has_initial_state: Optional[torch.Tensor] = None + + spec_query_start_loc: Optional[ + torch.Tensor] = None # shape: [num_spec_decodes + 1,] + non_spec_query_start_loc: Optional[ + torch.Tensor] = None # shape: [batch - num_spec_decodes + 1,] + + 
spec_state_indices_tensor: Optional[ + torch.Tensor] = None # shape: [batch, num_spec] + non_spec_state_indices_tensor: Optional[ + torch.Tensor] = None # shape: [batch - num_spec_decodes,] + spec_sequence_masks: Optional[torch.Tensor] = None # shape: [batch,] + spec_token_masks: Optional[ + torch. + Tensor] = None # shape: [num_prefill_tokens + num_decode_tokens,] + num_accepted_tokens: Optional[torch.Tensor] = None # shape: [batch,] + + +class GDNAttentionMetadataBuilder( + AttentionMetadataBuilder[GDNAttentionMetadata]): + + cudagraph_support = AttentionCGSupport.UNIFORM_BATCH + + reorder_batch_threshold: ClassVar[int] = 1 + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + self.speculative_config = vllm_config.speculative_config + self.kv_cache_spec = kv_cache_spec + if self.speculative_config: + self.num_spec = self.speculative_config.num_speculative_tokens # noqa: E501 + else: + self.num_spec = 0 + self.use_spec_decode = self.num_spec > 0 + self.reorder_batch_threshold = self.num_spec + 1 # type: ignore[misc] + + self.use_full_cuda_graph = \ + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + + self.spec_state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, self.num_spec + 1), + dtype=torch.int32, + device=device, + ) + self.non_spec_state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + self.spec_sequence_masks = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.bool, + device=device, + ) + self.spec_token_masks = torch.empty( + (self.decode_cudagraph_max_bs * (self.num_spec + 1), ), + dtype=torch.bool, + device=device, + ) + self.spec_query_start_loc = torch.empty( + (self.decode_cudagraph_max_bs + 1, ), + dtype=torch.int32, + device=device, + ) + self.non_spec_query_start_loc = torch.empty( + (self.decode_cudagraph_max_bs + 1, ), + dtype=torch.int32, + device=device, + ) + self.num_accepted_tokens = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + + def build( # type: ignore[override] + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + num_accepted_tokens: Optional[torch.Tensor] = None, + num_draft_tokens: Optional[torch.Tensor] = None, + fast_build: bool = False, + ) -> GDNAttentionMetadata: + m = common_attn_metadata + + query_start_loc = m.query_start_loc + context_lens = m.num_computed_tokens_cpu + context_lens_tensor = context_lens.to(query_start_loc.device) + seq_lens_tensor = m.seq_lens + + if (not self.use_spec_decode or num_draft_tokens is None + or num_draft_tokens.sum().item() == 0): + spec_sequence_masks = None + else: + spec_sequence_masks = (num_draft_tokens > 0) & ( + context_lens_tensor + + (num_draft_tokens + 1) == seq_lens_tensor) + if spec_sequence_masks.sum().item() == 0: + spec_sequence_masks = None + + if spec_sequence_masks is None: + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(m, decode_threshold=1)) + num_spec_decodes = 0 + num_spec_decode_tokens = 0 + spec_token_masks = None + spec_state_indices_tensor = None + non_spec_state_indices_tensor = m.block_table_tensor[:, 0] + 
spec_query_start_loc = None + non_spec_query_start_loc = query_start_loc + num_accepted_tokens = None + else: + num_spec_decodes = spec_sequence_masks.sum().item() + query_lens = query_start_loc[1:] - query_start_loc[:-1] + + non_spec_query_lens = query_lens[~spec_sequence_masks] + num_decodes = (non_spec_query_lens == 1).sum().item() + num_prefills = non_spec_query_lens.size(0) - num_decodes + num_decode_tokens = num_decodes + num_prefill_tokens = non_spec_query_lens.sum().item( + ) - num_decode_tokens + + if num_prefills == 0 and num_decodes == 0: + spec_token_masks = torch.ones( + (min(num_spec_decodes * + (self.num_spec + 1), query_start_loc[-1].item())), + dtype=torch.bool, + device=query_start_loc.device) + spec_state_indices_tensor = m.block_table_tensor[:, :self. + num_spec + 1] + non_spec_state_indices_tensor = None + spec_query_start_loc = query_start_loc + non_spec_query_start_loc = None + else: + spec_token_masks = torch.repeat_interleave( + spec_sequence_masks, query_lens) + spec_state_indices_tensor = m.block_table_tensor[ + spec_sequence_masks, :self.num_spec + 1] + non_spec_state_indices_tensor = \ + m.block_table_tensor[~spec_sequence_masks, 0] + + spec_query_start_loc = torch.zeros( + num_spec_decodes + 1, + dtype=torch.int32, + device=query_start_loc.device) + torch.cumsum(query_lens[spec_sequence_masks], + dim=0, + out=spec_query_start_loc[1:]) + non_spec_query_start_loc = torch.zeros( + query_lens.size(0) - num_spec_decodes + 1, + dtype=torch.int32, + device=query_start_loc.device) + torch.cumsum(query_lens[~spec_sequence_masks], + dim=0, + out=non_spec_query_start_loc[1:]) + + num_spec_decode_tokens = min( + num_spec_decodes * (self.num_spec + 1), + spec_token_masks.size(0)) + assert num_accepted_tokens is not None + num_accepted_tokens = num_accepted_tokens[spec_sequence_masks] + + if num_prefills > 0: + has_initial_state = context_lens_tensor > 0 + if spec_sequence_masks is not None: + has_initial_state = has_initial_state[~spec_sequence_masks] + else: + has_initial_state = None + + # prepare tensors for cudagraph + if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0 + and num_spec_decodes <= self.decode_cudagraph_max_bs): + num_total_tokens = self.vllm_config.pad_for_cudagraph( + m.num_actual_tokens) + batch_size = num_total_tokens // (self.num_spec + 1) + + self.spec_state_indices_tensor[:num_spec_decodes].copy_( + spec_state_indices_tensor, non_blocking=True) + spec_state_indices_tensor = self.spec_state_indices_tensor[: + batch_size] + spec_state_indices_tensor[num_spec_decodes:].fill_(PAD_SLOT_ID) + + self.spec_sequence_masks[:num_spec_decodes].copy_( + spec_sequence_masks, non_blocking=True) + spec_sequence_masks = self.spec_sequence_masks[:batch_size] + spec_sequence_masks[num_spec_decodes:].fill_(False) + + assert spec_token_masks is not None + self.spec_token_masks[:spec_token_masks.size(0)].copy_( + spec_token_masks, non_blocking=True) + spec_token_masks = self.spec_token_masks[:m.num_actual_tokens] + spec_token_masks[spec_token_masks.size(0):].fill_(False) + + self.spec_query_start_loc[:num_spec_decodes + 1].copy_( + spec_query_start_loc, non_blocking=True) + spec_num_query_tokens = spec_query_start_loc[ + -1] # type: ignore[index] + spec_query_start_loc = self.spec_query_start_loc[:batch_size + 1] + spec_query_start_loc[num_spec_decodes + + 1:].fill_(spec_num_query_tokens) + + self.num_accepted_tokens[:num_spec_decodes].copy_( + num_accepted_tokens, non_blocking=True) + num_accepted_tokens = self.num_accepted_tokens[:batch_size] + 
num_accepted_tokens[num_spec_decodes:].fill_(1) + + if (self.use_full_cuda_graph and num_prefills == 0 + and num_spec_decodes == 0 + and num_decodes <= self.decode_cudagraph_max_bs): + num_total_tokens = self.vllm_config.pad_for_cudagraph( + m.num_actual_tokens) + batch_size = num_total_tokens + + self.non_spec_state_indices_tensor[:num_decodes].copy_( + non_spec_state_indices_tensor, non_blocking=True) + non_spec_state_indices_tensor = \ + self.non_spec_state_indices_tensor[:batch_size] + non_spec_state_indices_tensor[num_decodes:].fill_(PAD_SLOT_ID) + + self.non_spec_query_start_loc[:num_decodes + 1].copy_( + non_spec_query_start_loc, non_blocking=True) + non_spec_num_query_tokens = non_spec_query_start_loc[ + -1] # type: ignore[index] + non_spec_query_start_loc = \ + self.non_spec_query_start_loc[:batch_size + 1] + non_spec_query_start_loc[num_decodes + + 1:].fill_(non_spec_num_query_tokens) + + attn_metadata = GDNAttentionMetadata( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + num_spec_decodes=num_spec_decodes, + num_spec_decode_tokens=num_spec_decode_tokens, + has_initial_state=has_initial_state, + spec_query_start_loc=spec_query_start_loc, + non_spec_query_start_loc=non_spec_query_start_loc, + spec_state_indices_tensor=spec_state_indices_tensor, + non_spec_state_indices_tensor=non_spec_state_indices_tensor, + spec_sequence_masks=spec_sequence_masks, + spec_token_masks=spec_token_masks, + num_accepted_tokens=num_accepted_tokens, + ) + return attn_metadata + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert (m.num_reqs * (self.num_spec + 1) <= m.num_actual_tokens + and ((m.num_reqs + 1) * (self.num_spec + 1) + >= m.num_actual_tokens)), \ + "GDN only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + num_accepted_tokens = torch.full((m.num_reqs, ), + m.max_query_len, + dtype=torch.int32, + device=m.query_start_loc.device) + num_drafted_tokens = torch.full((m.num_reqs, ), + self.num_spec, + dtype=torch.int32, + device=m.query_start_loc.device) + + # Fixes query-start loc for spec-sequence-indices. + m.query_start_loc = torch.arange(0, + m.num_actual_tokens + 1, + step=m.max_query_len, + device=m.query_start_loc.device, + dtype=torch.int32) + m.num_computed_tokens_cpu = (m.seq_lens_cpu - torch.full( + (m.num_reqs, ), m.max_query_len, dtype=torch.int32, device='cpu')) + + return self.build(0, m, num_accepted_tokens, num_drafted_tokens) diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 8159349e46758..d27239164b0db 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -559,12 +559,48 @@ class MambaManager(SingleTypeKVCacheManager): num_running_requests: int) -> int: return 0 + def get_num_blocks_to_allocate( + self, request_id: str, num_tokens: int, + new_computed_blocks: list[KVCacheBlock]) -> int: + """ + Get the number of blocks needed to be allocated for the request. + + Args: + request_id: The request ID. + num_tokens: The total number of tokens that need a slot (including + tokens that are already allocated). + new_computed_blocks: The new computed blocks just hitting the + prefix caching. 
+ + Returns: + The number of blocks + """ + + assert isinstance(self.kv_cache_spec, MambaSpec) + if self.kv_cache_spec.num_speculative_blocks > 0: + num_tokens += (self.kv_cache_spec.block_size * + self.kv_cache_spec.num_speculative_blocks) + num_required_blocks = cdiv(num_tokens, self.block_size) + num_new_blocks = (num_required_blocks - len(new_computed_blocks) - + len(self.req_to_blocks[request_id])) + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it will be changed from a free block + # to a computed block when the request is allocated, so we also count + # it as needed to be allocated. + num_evictable_computed_blocks = sum( + blk.ref_cnt == 0 and not blk.is_null + for blk in new_computed_blocks) + return num_new_blocks + num_evictable_computed_blocks + def allocate_new_blocks(self, request_id: str, num_tokens: int) -> list[KVCacheBlock]: - new_blocks = super().allocate_new_blocks(request_id, num_tokens) - assert len(self.req_to_blocks[request_id]) == 1, ( - "MambaManager should only allocate 1 block for each request.") - return new_blocks + # Allocate extra `num_speculative_blocks` blocks for + # speculative decoding (MTP/EAGLE) with linear attention. + assert isinstance(self.kv_cache_spec, MambaSpec) + if self.kv_cache_spec.num_speculative_blocks > 0: + num_tokens += (self.kv_cache_spec.block_size * + self.kv_cache_spec.num_speculative_blocks) + return super().allocate_new_blocks(request_id, num_tokens) class CrossAttentionManager(SingleTypeKVCacheManager): diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 6467fcfe40aef..6e8f569fff0e3 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -194,6 +194,7 @@ class MambaSpec(KVCacheSpec): dtypes: tuple[torch.dtype] page_size_padded: Optional[int] = None mamba_type: str = "mamba2" + num_speculative_blocks: int = 0 @property def page_size_bytes(self) -> int: diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index bf25c91d8390c..7132d507c722c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -218,7 +218,7 @@ class EagleProposer: hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) - if self.method in ("deepseek_mtp", "ernie_mtp"): + if self.method in ("deepseek_mtp", "ernie_mtp", "qwen3_next_mtp"): last_hidden_states = ret_hidden_states hidden_states = last_hidden_states else: @@ -322,12 +322,18 @@ class EagleProposer: with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=input_batch_size): - last_hidden_states, hidden_states = self.model( + ret_hidden_states = self.model( input_ids=input_ids, positions=self.positions[:input_batch_size], hidden_states=self.hidden_states[:input_batch_size], inputs_embeds=inputs_embeds, ) + if self.method in ("deepseek_mtp", "ernie_mtp", + "qwen3_next_mtp"): + last_hidden_states = ret_hidden_states + hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], None) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 0e509b7453b9d..1901de6d2e5b3 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -156,9 +156,14 @@ class BlockTable: class MultiGroupBlockTable: """The BlockTables for each KV cache group.""" - def __init__(self, max_num_reqs: int, max_model_len: int, - 
max_num_batched_tokens: int, pin_memory: bool, - device: torch.device, block_sizes: list[int]) -> None: + def __init__(self, + max_num_reqs: int, + max_model_len: int, + max_num_batched_tokens: int, + pin_memory: bool, + device: torch.device, + block_sizes: list[int], + num_speculative_tokens: int = 0) -> None: # Note(hc): each dcp rank only store # (max_model_len//dcp_world_size) tokens in kvcache, # so the block_size which used for calc max_num_blocks_per_req @@ -170,10 +175,11 @@ class MultiGroupBlockTable: dcp_world_size = 1 self.block_tables = [ - BlockTable(block_size, max_num_reqs, - cdiv(max_model_len, block_size * dcp_world_size), - max_num_batched_tokens, pin_memory, device) - for block_size in block_sizes + BlockTable( + block_size, max_num_reqs, + max(cdiv(max_model_len, block_size * dcp_world_size), + 1 + num_speculative_tokens), max_num_batched_tokens, + pin_memory, device) for block_size in block_sizes ] def append_row(self, block_ids: tuple[list[int], ...], diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bf9b16575e609..1cf56656d7adf 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -83,6 +83,7 @@ class InputBatch: logitsprocs: Optional[LogitsProcessors] = None, is_spec_decode: bool = False, is_pooling_model: bool = False, + num_speculative_tokens: int = 0, ): self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode @@ -127,6 +128,7 @@ class InputBatch: pin_memory=pin_memory, device=device, block_sizes=block_sizes, + num_speculative_tokens=num_speculative_tokens, ) # Sampling-related. @@ -202,6 +204,14 @@ class InputBatch: self.repetition_penalties_cpu_tensor.numpy() self.repetition_penalties_reqs: set[str] = set() + # Speculative decoding + self.num_accepted_tokens_cpu_tensor = torch.ones((max_num_reqs, ), + dtype=torch.int64, + device="cpu", + pin_memory=pin_memory) + self.num_accepted_tokens_cpu = \ + self.num_accepted_tokens_cpu_tensor.numpy() + # lora related self.request_lora_mapping = np.zeros((self.max_num_reqs, ), dtype=np.int32) @@ -394,6 +404,9 @@ class InputBatch: else: raise NotImplementedError("Unrecognized request type") + # Speculative decoding: by default 1 token is generated. 
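+        # (Clarifying comment, added here for context:) the per-request
+        # counter starts at 1 — the single token sampled this step — and is
+        # refreshed by _update_states_after_model_execute() once draft-token
+        # acceptance for the request is known.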
+        self.num_accepted_tokens_cpu[req_index] = 1
+
         # Add request lora ID
         if request.lora_request:
             lora_id = request.lora_request.lora_int_id
@@ -515,6 +528,8 @@
             self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
         self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] = \
             self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
+        self.num_accepted_tokens_cpu[i1], self.num_accepted_tokens_cpu[i2] =\
+            self.num_accepted_tokens_cpu[i2], self.num_accepted_tokens_cpu[i1]
 
         swap_dict_values(self.generators, i1, i2)
         swap_dict_values(self.bad_words_token_ids, i1, i2)
@@ -609,6 +624,8 @@
                 empty_index] = self.presence_penalties_cpu[last_req_index]
             self.repetition_penalties_cpu[
                 empty_index] = self.repetition_penalties_cpu[last_req_index]
+            self.num_accepted_tokens_cpu[
+                empty_index] = self.num_accepted_tokens_cpu[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ce53154896bae..1b785af96a9ab 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -53,9 +53,9 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, cdiv, check_use_alibi,
-                        get_dtype_size, is_pin_memory_available, round_up,
-                        supports_dynamo)
+                        GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size,
+                        is_pin_memory_available, round_up, supports_dynamo)
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
@@ -324,6 +324,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                  self.hidden_size,
                                                  dtype=self.dtype,
                                                  numpy=False)
+        self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
+                                                  dtype=torch.int32)
+        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
+                                                     dtype=torch.int64)
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
@@ -663,6 +667,31 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Refresh batch metadata with any pending updates.
         self.input_batch.refresh_metadata()
 
+    def _update_states_after_model_execute(
+            self, output_token_ids: torch.Tensor) -> None:
+        """Update the cached states after model execution.
+
+        This is used for MTP/EAGLE with hybrid models, since in linear
+        attention only the last token's state is kept. With MTP/EAGLE, the
+        states for draft tokens are kept until we decide how many tokens are
+        accepted for each sequence, and a shift is done during the next
+        iteration based on the number of accepted tokens.
+        """
+        if not self.model_config.is_hybrid or not self.speculative_config:
+            return
+
+        # Find the number of accepted tokens for each sequence.
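+        # A small worked example of the sentinel trick below (values are
+        # made up): for output_token_ids = [[5, 7, -1]], appending a -1
+        # column gives [[5, 7, -1, -1]]; "== -1" followed by argmax(-1)
+        # returns the index of the first rejected slot, i.e. 2 accepted
+        # tokens for that sequence.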
+ num_accepted_tokens = (torch.cat( + [ + output_token_ids, + torch.full((output_token_ids.size(0), 1), + -1, + device=output_token_ids.device), + ], + dim=1) == -1).int().argmax(-1).cpu().numpy() + for i, num_tokens in enumerate(num_accepted_tokens): + self.input_batch.num_accepted_tokens_cpu[i] = num_tokens + def _init_mrope_positions(self, req_state: CachedRequestState): image_grid_thw = [] video_grid_thw = [] @@ -936,6 +965,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # We will ignore the sampled tokens from the partial requests. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 + num_draft_tokens = None spec_decode_metadata = None else: # Get the number of draft tokens for each request. @@ -950,6 +980,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): spec_decode_metadata = self._calc_spec_decode_metadata( num_draft_tokens, cu_num_tokens) logits_indices = spec_decode_metadata.logits_indices + self.num_draft_tokens.np[:num_reqs] = num_draft_tokens + self.num_draft_tokens.np[num_reqs:].fill(0) + self.num_draft_tokens.copy_to_gpu() logits_indices_padded = None if self.cache_config.kv_sharing_fast_prefill: @@ -964,6 +997,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_computed_tokens_cpu = ( self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs]) spec_decode_common_attn_metadata = None + if use_spec_decode: + self.num_accepted_tokens.np[:num_reqs] = ( + self.input_batch.num_accepted_tokens_cpu[:num_reqs]) + self.num_accepted_tokens.np[num_reqs:].fill(1) + self.num_accepted_tokens.copy_to_gpu() # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. @@ -1034,10 +1072,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): builder, ) + extra_attn_metadata_args = {} + if use_spec_decode and isinstance(builder, + GDNAttentionMetadataBuilder): + extra_attn_metadata_args = dict( + num_accepted_tokens=self.num_accepted_tokens. 
+ gpu[:num_reqs], + num_draft_tokens=self.num_draft_tokens.gpu[:num_reqs], + ) + attn_metadata_i = builder.build( common_prefix_len=common_prefix_len, common_attn_metadata=common_attn_metadata, - ) + **extra_attn_metadata_args) for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -1814,6 +1861,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): sampling_metadata, ) sampler_output.sampled_token_ids = output_token_ids + self._update_states_after_model_execute(output_token_ids) return sampler_output @@ -2644,13 +2692,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Note: Overriding max_query_len to be the prefill tokens max_query_len = num_prefill_tokens elif uniform_decode: - assert not create_mixed_batch - num_reqs = cdiv(num_tokens, max_query_len) + num_reqs = num_tokens // max_query_len assert num_reqs <= max_num_reqs, \ "Do not capture num_reqs > max_num_reqs for uniform batch" num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: - num_scheduled_tokens_list[-1] = num_tokens % max_query_len + num_scheduled_tokens_list[-1] += num_tokens % max_query_len else: num_reqs = min(num_tokens, max_num_reqs) min_tokens_per_req = num_tokens // num_reqs @@ -3297,6 +3344,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): is_spec_decode=bool(self.vllm_config.speculative_config), logitsprocs=self.input_batch.logitsprocs, is_pooling_model=self.is_pooling_model, + num_speculative_tokens=( + self.vllm_config.speculative_config.num_speculative_tokens + if self.vllm_config.speculative_config else 0), ) def _allocate_kv_cache_tensors( @@ -3647,7 +3697,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase) if len(mamba_layers) > 0: - if self.vllm_config.speculative_config is not None: + if (self.vllm_config.speculative_config is not None + and self.vllm_config.model_config.hf_config.model_type + not in ["qwen3_next"]): raise NotImplementedError( "Mamba with speculative decoding is not supported yet.") if self.vllm_config.cache_config.enable_prefix_caching: @@ -3666,7 +3718,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtypes=mamba_module.get_state_dtype(), block_size=max_model_len, page_size_padded=page_size_padded, - mamba_type=mamba_module.mamba_type) + mamba_type=mamba_module.mamba_type, + num_speculative_blocks=( + self.speculative_config.num_speculative_tokens + if self.speculative_config else 0), + ) return kv_cache_spec diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b4a67e2899d0d..142b1afce8c3f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -78,7 +78,8 @@ class Worker(LocalOrDistributedWorkerBase): "deepseek_mtp", "glm4_moe_mtp", "mimo_mtp", - "ernie_mtp")) \ + "ernie_mtp", + "qwen3_next_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From 6aeb1dab4a3585fb0a16f76b7aa2779fffdd5fc8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Thu, 11 Sep 2025 16:48:25 +0800 Subject: [PATCH 159/584] [Bugfix] Fix incorrect import of CacheConfig (#24631) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/attention/layers/cross_attention.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index 
5f814b23888b3..c24fa4e15f679 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -6,14 +6,13 @@ from typing import Optional import numpy as np import torch -from transformers import CacheConfig from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend -from vllm.config import VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils import cdiv From 85df8afdae72866f5142d674a916ec6a0879b9e8 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Thu, 11 Sep 2025 16:50:05 +0800 Subject: [PATCH 160/584] [Docs] Revise frameworks/anything-llm.md (#24489) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/deployment/frameworks/anything-llm.md | 56 +++++++++++++--------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/docs/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md index 0b41e73b030cc..40a463a8a596c 100644 --- a/docs/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -1,41 +1,53 @@ -# Anything LLM +# AnythingLLM -[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. +[AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. ## Prerequisites -- Setup vLLM environment +Set up the vLLM environment: + +```bash +pip install vllm +``` ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with a supported chat-completion model, for example: -```bash -vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 -``` + ```bash + vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 + ``` -- Download and install [Anything LLM desktop](https://anythingllm.com/desktop). +1. Download and install [AnythingLLM Desktop](https://anythingllm.com/desktop). -- On the bottom left of open settings, AI Providers --> LLM: - - LLM Provider: Generic OpenAI - - Base URL: http://{vllm server host}:{vllm server port}/v1 - - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` +1. Configure the AI provider: -![](../../assets/deployment/anything-llm-provider.png) + - At the bottom, click the 🔧 wrench icon -> **Open settings** -> **AI Providers** -> **LLM**. + - Enter the following values: + - LLM Provider: Generic OpenAI + - Base URL: `http://{vllm server host}:{vllm server port}/v1` + - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` -- Back to home page, New Workspace --> create `vllm` workspace, and start to chat: + ![set AI providers](../../assets/deployment/anything-llm-provider.png) -![](../../assets/deployment/anything-llm-chat-without-doc.png) +1. Create a workspace: -- Click the upload button: - - upload the doc - - select the doc and move to the workspace - - save and embed + 1. At the bottom, click the ↺ back icon and back to workspaces. + 1. 
Create a workspace (e.g., `vllm`) and start chatting. -![](../../assets/deployment/anything-llm-upload-doc.png) + ![create a workspace](../../assets/deployment/anything-llm-chat-without-doc.png) -- Chat again: +1. Add a document. -![](../../assets/deployment/anything-llm-chat-with-doc.png) + 1. Click the 📎 attachment icon. + 1. Upload a document. + 1. Select and move the document into your workspace. + 1. Save and embed it. + + ![add a document](../../assets/deployment/anything-llm-upload-doc.png) + +1. Chat using your document as context. + + ![chat with your context](../../assets/deployment/anything-llm-chat-with-doc.png) From ba6011027dd68ce3bff9de9d8001354f095cee5c Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Thu, 11 Sep 2025 04:50:08 -0400 Subject: [PATCH 161/584] [Docs] Update V1 doc to reflect whisper support (#24606) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- docs/models/supported_models.md | 2 +- docs/usage/v1_guide.md | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a3cc1cda5f866..de28ca37396eb 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -766,7 +766,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | +| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 525f740d12a7f..d404c87e8f5a7 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the | Model Type | Status | |-----------------------------|------------------------------------------------------------------------------------| | **Decoder-only Models** | <nobr>🚀 Optimized</nobr> | -| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> | +| **Encoder-Decoder Models** | <nobr>🟢 Whisper only</nobr> | | **Embedding Models** | <nobr>🟢 Functional</nobr> | | **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> | | **Multimodal Models** | <nobr>🟢 Functional</nobr> | @@ -118,8 +118,9 @@ Please note that prefix caching is not yet supported for any of the above models #### Encoder-Decoder Models -Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`) -are not yet supported. +Whisper is supported. Other models requiring cross-attention between separate +encoder and decoder (e.g., `BartForConditionalGeneration`, +`MllamaForConditionalGeneration`) are not yet supported. 
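+
+For example, a Whisper model such as `openai/whisper-large-v3-turbo` (one of
+the checkpoints named in the supported models list) can be served in the
+usual way:
+
+```bash
+vllm serve openai/whisper-large-v3-turbo
+```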
### Features From d14c4ebf08c9a6b6c4131eb3021d50da6fb0c212 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Thu, 11 Sep 2025 16:50:12 +0800 Subject: [PATCH 162/584] [Docs] Use 1-2-3 list for deploy steps in deployment/frameworks/ (#24633) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- docs/deployment/frameworks/autogen.md | 16 ++--- docs/deployment/frameworks/chatbox.md | 24 ++++--- docs/deployment/frameworks/dify.md | 50 ++++++++------- docs/deployment/frameworks/haystack.md | 12 ++-- docs/deployment/frameworks/litellm.md | 22 +++---- .../retrieval_augmented_generation.md | 64 +++++++++---------- 6 files changed, 98 insertions(+), 90 deletions(-) diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md index c255a85d38401..7517ee771c097 100644 --- a/docs/deployment/frameworks/autogen.md +++ b/docs/deployment/frameworks/autogen.md @@ -4,9 +4,7 @@ ## Prerequisites -- Setup vLLM environment - -- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment +Set up the vLLM and [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment: ```bash pip install vllm @@ -18,14 +16,14 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]" ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -python -m vllm.entrypoints.openai.api_server \ - --model mistralai/Mistral-7B-Instruct-v0.2 -``` + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-Instruct-v0.2 + ``` -- Call it with AutoGen: +1. Call it with AutoGen: ??? code diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index cbca6e6282fc6..002935da56009 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -6,27 +6,31 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac ## Prerequisites -- Setup vLLM environment +Set up the vLLM environment: + +```bash +pip install vllm +``` ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve qwen/Qwen1.5-0.5B-Chat -``` + ```bash + vllm serve qwen/Qwen1.5-0.5B-Chat + ``` -- Download and install [Chatbox desktop](https://chatboxai.app/en#download). +1. Download and install [Chatbox desktop](https://chatboxai.app/en#download). -- On the bottom left of settings, Add Custom Provider +1. On the bottom left of settings, Add Custom Provider - API Mode: `OpenAI API Compatible` - Name: vllm - API Host: `http://{vllm server host}:{vllm server port}/v1` - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` -![](../../assets/deployment/chatbox-settings.png) + ![](../../assets/deployment/chatbox-settings.png) -- Go to `Just chat`, and start to chat: +1. Go to `Just chat`, and start to chat: -![](../../assets/deployment/chatbox-chat.png) + ![](../../assets/deployment/chatbox-chat.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 35f02c33cb02b..820ef0cbed9fa 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -8,44 +8,50 @@ This guide walks you through deploying Dify using a vLLM backend. 
## Prerequisites -- Setup vLLM environment -- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) +Set up the vLLM environment: + +```bash +pip install vllm +``` + +And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/). ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve Qwen/Qwen1.5-7B-Chat -``` + ```bash + vllm serve Qwen/Qwen1.5-7B-Chat + ``` -- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): +1. Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): -```bash -git clone https://github.com/langgenius/dify.git -cd dify -cd docker -cp .env.example .env -docker compose up -d -``` + ```bash + git clone https://github.com/langgenius/dify.git + cd dify + cd docker + cp .env.example .env + docker compose up -d + ``` -- Open the browser to access `http://localhost/install`, config the basic login information and login. +1. Open the browser to access `http://localhost/install`, config the basic login information and login. -- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. +1. In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. + +1. Fill in the model provider details as follows: -- Fill in the model provider details as follows: - **Model Type**: `LLM` - **Model Name**: `Qwen/Qwen1.5-7B-Chat` - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` -![](../../assets/deployment/dify-settings.png) + ![](../../assets/deployment/dify-settings.png) -- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: +1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: -![](../../assets/deployment/dify-create-chatbot.png) + ![](../../assets/deployment/dify-create-chatbot.png) -- Click the chatbot you just created to open the chat interface and start interacting with the model: +1. Click the chatbot you just created to open the chat interface and start interacting with the model: -![](../../assets/deployment/dify-chat.png) + ![](../../assets/deployment/dify-chat.png) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 70b4b48d4543e..836305cf15c42 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -6,7 +6,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac ## Prerequisites -- Setup vLLM and Haystack environment +Set up the vLLM and Haystack environment: ```bash pip install vllm haystack-ai @@ -14,13 +14,13 @@ pip install vllm haystack-ai ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. 
-```bash -vllm serve mistralai/Mistral-7B-Instruct-v0.1 -``` + ```bash + vllm serve mistralai/Mistral-7B-Instruct-v0.1 + ``` -- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. +1. Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. ??? code diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index c7e514f2276e0..0d6c3729911ad 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -13,7 +13,7 @@ And LiteLLM supports all models on VLLM. ## Prerequisites -- Setup vLLM and litellm environment +Set up the vLLM and litellm environment: ```bash pip install vllm litellm @@ -23,13 +23,13 @@ pip install vllm litellm ### Chat completion -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve qwen/Qwen1.5-0.5B-Chat -``` + ```bash + vllm serve qwen/Qwen1.5-0.5B-Chat + ``` -- Call it with litellm: +1. Call it with litellm: ??? code @@ -51,13 +51,13 @@ vllm serve qwen/Qwen1.5-0.5B-Chat ### Embeddings -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. -```bash -vllm serve BAAI/bge-base-en-v1.5 -``` + ```bash + vllm serve BAAI/bge-base-en-v1.5 + ``` -- Call it with litellm: +1. Call it with litellm: ```python from litellm import embedding diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index d5f2ec302b6cd..d86ab1600f126 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -11,7 +11,7 @@ Here are the integrations: ### Prerequisites -- Setup vLLM and langchain environment +Set up the vLLM and langchain environment: ```bash pip install -U vllm \ @@ -22,33 +22,33 @@ pip install -U vllm \ ### Deploy -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. -```bash -# Start embedding service (port 8000) -vllm serve ssmits/Qwen2-7B-Instruct-embed-base -``` + ```bash + # Start embedding service (port 8000) + vllm serve ssmits/Qwen2-7B-Instruct-embed-base + ``` -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -# Start chat service (port 8001) -vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 -``` + ```bash + # Start chat service (port 8001) + vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 + ``` -- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py> +1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py> -- Run the script +1. Run the script -```python -python retrieval_augmented_generation_with_langchain.py -``` + ```python + python retrieval_augmented_generation_with_langchain.py + ``` ## vLLM + llamaindex ### Prerequisites -- Setup vLLM and llamaindex environment +Set up the vLLM and llamaindex environment: ```bash pip install vllm \ @@ -60,24 +60,24 @@ pip install vllm \ ### Deploy -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. 
-```bash -# Start embedding service (port 8000) -vllm serve ssmits/Qwen2-7B-Instruct-embed-base -``` + ```bash + # Start embedding service (port 8000) + vllm serve ssmits/Qwen2-7B-Instruct-embed-base + ``` -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -# Start chat service (port 8001) -vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 -``` + ```bash + # Start chat service (port 8001) + vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 + ``` -- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py> +1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py> -- Run the script +1. Run the script: -```python -python retrieval_augmented_generation_with_llamaindex.py -``` + ```python + python retrieval_augmented_generation_with_llamaindex.py + ``` From 0fc36463e0faed724b20cb45561382cbf2a44972 Mon Sep 17 00:00:00 2001 From: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Date: Thu, 11 Sep 2025 01:52:10 -0700 Subject: [PATCH 163/584] [CI]Add transformers_utils to Async Engine, Inputs, Utils, Worker Test (#24615) Signed-off-by: Xingyu Liu <charlotteliu12x@gmail.com> --- .buildkite/test-pipeline.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 35a849d70c313..0dbde21f36d84 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -54,6 +54,7 @@ steps: - tests/utils_ - tests/worker - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine @@ -63,6 +64,7 @@ steps: - pytest -v -s multimodal - pytest -v -s utils_ # Utils - pytest -v -s worker # Worker + - pytest -v -s transformers_utils # transformers_utils - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -822,8 +824,8 @@ steps: # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' # test sequence parallel - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. 
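The `_synced_weight_loader` fix in the next patch comes down to a classic wrapper pitfall: the wrapper invoked the wrapped loader but discarded its return value, which breaks any weight loader that returns something to its caller. A minimal runnable sketch of the corrected pattern follows; `_sync_if_needed` is a hypothetical stand-in for the `torch._sync` device-synchronization call and is not vLLM's actual implementation.

```python
from typing import Any, Callable


def _sync_if_needed(param: Any) -> None:
    """Hypothetical stand-in for a device synchronization hook
    (torch._sync in the real code); a no-op in this sketch."""


def make_synced_loader(loader: Callable[..., Any]) -> Callable[..., Any]:
    """Wrap a weight loader so it synchronizes after loading, while
    still forwarding the wrapped loader's return value."""

    def synced_loader(param: Any, *args: Any, **kwargs: Any) -> Any:
        out = loader(param, *args, **kwargs)  # keep the result instead of dropping it
        _sync_if_needed(param)
        return out  # forward it so callers that rely on it keep working

    return synced_loader


# Usage: the wrapped loader behaves like the original one.
def toy_loader(param: list, value: int) -> list:
    param.append(value)
    return param


assert make_synced_loader(toy_loader)([], 7) == [7]
```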
From ed5ae4aaceacc28461f7e324c8f26be6fadd59df Mon Sep 17 00:00:00 2001 From: Kyuyeun Kim <62023335+kyuyeunk@users.noreply.github.com> Date: Thu, 11 Sep 2025 01:52:33 -0700 Subject: [PATCH 164/584] [Bugfix] Fix _synced_weight_loader (#24565) Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com> --- vllm/model_executor/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 41ed0b09c5a2a..65436786f82ac 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -52,10 +52,11 @@ def set_weight_attrs( def _make_synced_weight_loader(original_weight_loader): def _synced_weight_loader(param, *args, **kwargs): - original_weight_loader(param, *args, **kwargs) + out = original_weight_loader(param, *args, **kwargs) # torch._sync doesn't support, is not needed for CPU tensors. if param.device != torch.device("cpu"): torch._sync(param) + return out return _synced_weight_loader From a8b0361c9229c2584eea0a035e650e55e2d52f4e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Thu, 11 Sep 2025 16:53:09 +0800 Subject: [PATCH 165/584] [CI] Split pooling from entrypoints Test (#24632) Signed-off-by: wang.yuqi <noooop@126.com> --- .buildkite/test-pipeline.yaml | 15 ++++++++++++++- tests/entrypoints/pooling/__init__.py | 0 tests/entrypoints/pooling/correctness/__init__.py | 0 .../correctness/test_mteb_embed.py | 0 .../correctness/test_mteb_score.py | 0 tests/entrypoints/pooling/llm/__init__.py | 0 .../{ => pooling}/llm/test_classify.py | 3 +-- .../{ => pooling}/llm/test_embedding.py | 0 .../entrypoints/{ => pooling}/llm/test_encode.py | 0 .../entrypoints/{ => pooling}/llm/test_reward.py | 3 +-- tests/entrypoints/{ => pooling}/llm/test_score.py | 3 +-- tests/entrypoints/pooling/openai/__init__.py | 0 .../{ => pooling}/openai/test_classification.py | 3 +-- .../{ => pooling}/openai/test_embedding.py | 9 ++++----- .../openai/test_embedding_dimensions.py | 11 +++++------ .../openai/test_embedding_long_text.py | 3 +-- .../{ => pooling}/openai/test_pooling.py | 3 +-- .../{ => pooling}/openai/test_rerank.py | 3 +-- .../{ => pooling}/openai/test_score.py | 3 +-- .../{ => pooling}/openai/test_truncation.py | 0 .../{ => pooling}/openai/test_vision_embedding.py | 3 +-- 21 files changed, 32 insertions(+), 30 deletions(-) create mode 100644 tests/entrypoints/pooling/__init__.py create mode 100644 tests/entrypoints/pooling/correctness/__init__.py rename tests/entrypoints/{openai => pooling}/correctness/test_mteb_embed.py (100%) rename tests/entrypoints/{openai => pooling}/correctness/test_mteb_score.py (100%) create mode 100644 tests/entrypoints/pooling/llm/__init__.py rename tests/entrypoints/{ => pooling}/llm/test_classify.py (98%) rename tests/entrypoints/{ => pooling}/llm/test_embedding.py (100%) rename tests/entrypoints/{ => pooling}/llm/test_encode.py (100%) rename tests/entrypoints/{ => pooling}/llm/test_reward.py (97%) rename tests/entrypoints/{ => pooling}/llm/test_score.py (97%) create mode 100644 tests/entrypoints/pooling/openai/__init__.py rename tests/entrypoints/{ => pooling}/openai/test_classification.py (99%) rename tests/entrypoints/{ => pooling}/openai/test_embedding.py (98%) rename tests/entrypoints/{ => pooling}/openai/test_embedding_dimensions.py (95%) rename tests/entrypoints/{ => pooling}/openai/test_embedding_long_text.py (99%) rename tests/entrypoints/{ => pooling}/openai/test_pooling.py (99%) rename tests/entrypoints/{ => pooling}/openai/test_rerank.py (99%) rename tests/entrypoints/{ 
=> pooling}/openai/test_score.py (99%) rename tests/entrypoints/{ => pooling}/openai/test_truncation.py (100%) rename tests/entrypoints/{ => pooling}/openai/test_vision_embedding.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0dbde21f36d84..3f6b67e45de17 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -113,7 +113,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -148,6 +148,19 @@ steps: - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + - label: Distributed Tests (4 GPUs) # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] diff --git a/tests/entrypoints/pooling/__init__.py b/tests/entrypoints/pooling/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/pooling/correctness/__init__.py b/tests/entrypoints/pooling/correctness/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py similarity index 100% rename from tests/entrypoints/openai/correctness/test_mteb_embed.py rename to tests/entrypoints/pooling/correctness/test_mteb_embed.py diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py similarity index 100% rename from tests/entrypoints/openai/correctness/test_mteb_score.py rename to tests/entrypoints/pooling/correctness/test_mteb_score.py diff --git a/tests/entrypoints/pooling/llm/__init__.py b/tests/entrypoints/pooling/llm/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py similarity index 98% rename from tests/entrypoints/llm/test_classify.py rename to tests/entrypoints/pooling/llm/test_classify.py index 6c0c9cd015801..ff5cea11a9182 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -6,11 +6,10 @@ import weakref import pytest import torch +from tests.models.utils import softmax from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory -from ...models.utils import softmax - MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" prompts = ["The chef prepared a delicious meal."] diff --git 
a/tests/entrypoints/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py similarity index 100% rename from tests/entrypoints/llm/test_embedding.py rename to tests/entrypoints/pooling/llm/test_embedding.py diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py similarity index 100% rename from tests/entrypoints/llm/test_encode.py rename to tests/entrypoints/pooling/llm/test_encode.py diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py similarity index 97% rename from tests/entrypoints/llm/test_reward.py rename to tests/entrypoints/pooling/llm/test_reward.py index 2cee3c8d94e36..11d164c978a92 100644 --- a/tests/entrypoints/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -6,11 +6,10 @@ import weakref import pytest import torch +from tests.models.utils import softmax from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory -from ...models.utils import softmax - MODEL_NAME = "internlm/internlm2-1_8b-reward" prompts = ["The chef prepared a delicious meal."] diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py similarity index 97% rename from tests/entrypoints/llm/test_score.py rename to tests/entrypoints/pooling/llm/test_score.py index f715dacacb8ff..447378f989d09 100644 --- a/tests/entrypoints/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -6,11 +6,10 @@ import weakref import pytest import torch +from tests.models.utils import softmax from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory -from ...models.utils import softmax - MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" diff --git a/tests/entrypoints/pooling/openai/__init__.py b/tests/entrypoints/pooling/openai/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py similarity index 99% rename from tests/entrypoints/openai/test_classification.py rename to tests/entrypoints/pooling/openai/test_classification.py index 36c96d76c2e5f..26c2c8e6af17d 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -6,10 +6,9 @@ import requests import torch import torch.nn.functional as F +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ClassificationResponse -from ...utils import RemoteOpenAIServer - MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py similarity index 98% rename from tests/entrypoints/openai/test_embedding.py rename to tests/entrypoints/pooling/openai/test_embedding.py index d46ab304ba6d5..37a10e79d4fc7 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -11,14 +11,13 @@ import requests import torch import torch.nn.functional as F +from tests.models.language.pooling.embed_utils import ( + run_embedding_correctness_test) +from tests.models.utils import check_embeddings_close +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.language.pooling.embed_utils import ( - 
run_embedding_correctness_test) -from ...models.utils import check_embeddings_close -from ...utils import RemoteOpenAIServer - MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py similarity index 95% rename from tests/entrypoints/openai/test_embedding_dimensions.py rename to tests/entrypoints/pooling/openai/test_embedding_dimensions.py index 91e91699b92ca..3c7e88daa8ff3 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py @@ -9,13 +9,12 @@ from typing import Optional import openai import pytest -from vllm.entrypoints.openai.protocol import EmbeddingResponse - -from ...conftest import HfRunner -from ...models.language.pooling.embed_utils import ( +from tests.conftest import HfRunner +from tests.models.language.pooling.embed_utils import ( run_embedding_correctness_test) -from ...models.utils import EmbedModelInfo -from ...utils import RemoteOpenAIServer +from tests.models.utils import EmbedModelInfo +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.openai.protocol import EmbeddingResponse MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), diff --git a/tests/entrypoints/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py similarity index 99% rename from tests/entrypoints/openai/test_embedding_long_text.py rename to tests/entrypoints/pooling/openai/test_embedding_long_text.py index 86bd34abb97e0..2d3da238d245e 100644 --- a/tests/entrypoints/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -14,10 +14,9 @@ import openai import pytest import pytest_asyncio +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse -from ...utils import RemoteOpenAIServer - def _generate_random_text(word_count: int) -> str: """Generate random text with approximately the specified word count.""" diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/pooling/openai/test_pooling.py similarity index 99% rename from tests/entrypoints/openai/test_pooling.py rename to tests/entrypoints/pooling/openai/test_pooling.py index 63f4205e0a42b..9f58955cfb40b 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/pooling/openai/test_pooling.py @@ -8,11 +8,10 @@ import pytest import requests from tests.models.utils import check_embeddings_close +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import PoolingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...utils import RemoteOpenAIServer - MODEL_NAME = "internlm/internlm2-1_8b-reward" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py similarity index 99% rename from tests/entrypoints/openai/test_rerank.py rename to tests/entrypoints/pooling/openai/test_rerank.py index ce4d6c5f5d337..992cb5147ef0d 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -6,10 +6,9 @@ import requests import 
torch import torch.nn.functional as F +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import RerankResponse -from ...utils import RemoteOpenAIServer - MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py similarity index 99% rename from tests/entrypoints/openai/test_score.py rename to tests/entrypoints/pooling/openai/test_score.py index 4fafcfb45fa22..d676ecccbc87c 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/pooling/openai/test_score.py @@ -8,10 +8,9 @@ import torch import torch.nn.functional as F from torch import tensor +from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ScoreResponse -from ...utils import RemoteOpenAIServer - MODELS = [ { "name": "BAAI/bge-reranker-v2-m3", diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/pooling/openai/test_truncation.py similarity index 100% rename from tests/entrypoints/openai/test_truncation.py rename to tests/entrypoints/pooling/openai/test_truncation.py diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py similarity index 98% rename from tests/entrypoints/openai/test_vision_embedding.py rename to tests/entrypoints/pooling/openai/test_vision_embedding.py index dbd403fb7a7b5..48434e36eb265 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -7,11 +7,10 @@ import pytest import requests from transformers import AutoProcessor +from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image -from ...utils import VLLM_PATH, RemoteOpenAIServer - MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 From a1213fae5f8fe624c4b954768d23a86e384abdb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Thu, 11 Sep 2025 11:18:09 +0200 Subject: [PATCH 166/584] [Misc] Add @NickLucche to codeowners (#24647) Signed-off-by: NickLucche <nlucches@redhat.com> --- .github/CODEOWNERS | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 61c59fe89005b..846b68054c0a1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,17 +8,18 @@ /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn -/vllm/multimodal @DarkLight1337 @ywang96 +/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche /vllm/v1/sample @22quinn @houseroad /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg +/vllm/distributed/kv_transfer @NickLucche CMakeLists.txt 
@tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, @@ -39,10 +40,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multimodal @DarkLight1337 @ywang96 +/tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 @@ -52,6 +53,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep +/tests/v1/kv_connector/nixl_integration @NickLucche # Docs /docs @hmellor @@ -94,3 +96,9 @@ mkdocs.yaml @hmellor /vllm/v1/attention/backends/mla/rocm*.py @gshtras /vllm/attention/ops/rocm*.py @gshtras /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras + +# TPU +/vllm/v1/worker/tpu* @NickLucche +/vllm/platforms/tpu.py @NickLucche +/vllm/v1/sample/tpu @NickLucche +/vllm/tests/v1/tpu @NickLucche \ No newline at end of file From 25bb9e8c65424e3bf24d2eab259743f9a97b7a3c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Thu, 11 Sep 2025 18:31:23 +0800 Subject: [PATCH 167/584] [CI Failure] fix models/language/pooling/test_auto_prefix_cache_support.py (#24636) Signed-off-by: wang.yuqi <noooop@126.com> --- vllm/config/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 587cfab35515f..2ead8f5a3741d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3558,6 +3558,10 @@ class VllmConfig: disable_chunked_prefill_reasons.append( "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") + if not getattr(self.model_config.hf_config, "is_causal", True): + disable_chunked_prefill_reasons.append( + "Only models using causal attention supports chunked " + "prefill and prefix caching; disabling both.") elif self.model_config.is_encoder_decoder: self.scheduler_config.max_num_encoder_input_tokens = \ MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) From d6249d069965f88ff0042b638704f4cc66d52de4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:41:39 +0100 Subject: [PATCH 168/584] Fix typing for `safetensors_load_strategy` (#24641) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/load.py | 2 +- vllm/engine/arg_utils.py | 3 +-- vllm/model_executor/model_loader/weight_utils.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config/load.py b/vllm/config/load.py index 68253359fc567..26ffec23ad5c6 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -51,7 +51,7 @@ class LoadConfig: download_dir: Optional[str] = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" - safetensors_load_strategy: Optional[str] = "lazy" + safetensors_load_strategy: str = "lazy" """Specifies the loading strategy for safetensors weights. 
- "lazy" (default): Weights are memory-mapped from the file. This enables on-demand loading and is highly efficient for models on local storage. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d9a29511eb529..be456af4d19d6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -289,8 +289,7 @@ class EngineArgs: trust_remote_code: bool = ModelConfig.trust_remote_code allowed_local_media_path: str = ModelConfig.allowed_local_media_path download_dir: Optional[str] = LoadConfig.download_dir - safetensors_load_strategy: Optional[ - str] = LoadConfig.safetensors_load_strategy + safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: Union[str, LoadFormats] = LoadConfig.load_format config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c6ca9cd48d009..f2c66763d0816 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -519,7 +519,7 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( hf_weights_files: list[str], use_tqdm_on_load: bool, - safetensors_load_strategy: Optional[str] = "lazy", + safetensors_load_strategy: str = "lazy", ) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" loading_desc = "Loading safetensors checkpoint shards" From 5f5271f1ee2ef281ed8ecc66b126aa34964e797d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:01:38 +0100 Subject: [PATCH 169/584] Move `LoRAConfig` from `config/__init__.py` to `config/lora.py` (#24644) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/core/test_scheduler.py | 3 +- tests/lora/test_layers.py | 2 +- tests/lora/test_lora_allowed_token_ids.py | 4 +- tests/lora/test_lora_manager.py | 2 +- tests/lora/test_peft_helper.py | 2 +- tests/lora/test_worker.py | 3 +- vllm/config/__init__.py | 111 +--------------- vllm/config/lora.py | 132 +++++++++++++++++++ vllm/core/scheduler.py | 3 +- vllm/engine/async_llm_engine.py | 5 +- vllm/engine/llm_engine.py | 6 +- vllm/lora/layers/base.py | 2 +- vllm/lora/layers/base_linear.py | 2 +- vllm/lora/layers/column_parallel_linear.py | 2 +- vllm/lora/layers/logits_processor.py | 2 +- vllm/lora/layers/replicated_linear.py | 2 +- vllm/lora/layers/row_parallel_linear.py | 2 +- vllm/lora/layers/vocal_parallel_embedding.py | 2 +- vllm/lora/models.py | 2 +- vllm/lora/peft_helper.py | 2 +- vllm/lora/utils.py | 2 +- vllm/lora/worker_manager.py | 2 +- vllm/model_executor/models/bart.py | 3 +- vllm/transformers_utils/tokenizer_group.py | 3 +- vllm/v1/worker/lora_model_runner_mixin.py | 3 +- 25 files changed, 167 insertions(+), 137 deletions(-) create mode 100644 vllm/config/lora.py diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index e1a840bb15039..86e08328c43b0 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -10,7 +10,8 @@ import pytest # noqa import torch from torch import Use # noqa -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, SchedulerConfig +from vllm.config.lora import LoRAConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest diff --git a/tests/lora/test_layers.py 
b/tests/lora/test_layers.py index b0038a28ed89d..6735b7cd9e436 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -11,7 +11,7 @@ import pytest import torch import torch.nn.functional as F -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py index e77eae70445db..be6409000ae77 100644 --- a/tests/lora/test_lora_allowed_token_ids.py +++ b/tests/lora/test_lora_allowed_token_ids.py @@ -3,8 +3,8 @@ import pytest -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - VllmConfig) +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.config.lora import LoRAConfig from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c9ab32edc7f32..a5802c108c6be 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,7 +8,7 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA) diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index df8696cf58e0f..ffffb5d8eab90 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -7,7 +7,7 @@ import shutil import pytest -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.lora.peft_helper import PEFTHelper ERROR_CASES = [ diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 02bfe0bf914a9..9c47abf8f4dce 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -6,9 +6,10 @@ import random import tempfile from unittest.mock import patch -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) from vllm.config.load import LoadConfig +from vllm.config.lora import LoRAConfig from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 2ead8f5a3741d..8a75b28f38a54 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -36,6 +36,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel, from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig +from vllm.config.lora import LoRAConfig from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -2400,116 +2401,6 @@ class SpeculativeConfig: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" -LoRADType = Literal["auto", "float16", "bfloat16"] - - -@config -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class LoRAConfig: - """Configuration for LoRA.""" - - max_lora_rank: int = 16 - """Max LoRA rank.""" - max_loras: int = 1 - 
"""Max number of LoRAs in a single batch.""" - fully_sharded_loras: bool = False - """By default, only half of the LoRA computation is sharded with tensor - parallelism. Enabling this will use the fully sharded layers. At high - sequence length, max rank or tensor parallel size, this is likely faster. - """ - max_cpu_loras: Optional[int] = None - """Maximum number of LoRAs to store in CPU memory. Must be >= than - `max_loras`.""" - lora_dtype: Union[torch.dtype, LoRADType] = "auto" - """Data type for LoRA. If auto, will default to base model dtype.""" - lora_extra_vocab_size: int = 256 - """(Deprecated) Maximum size of extra vocabulary that can be present in a - LoRA adapter. Will be removed in v0.12.0.""" - lora_vocab_padding_size: ClassVar[int] = current_platform\ - .get_lora_vocab_padding_size() - default_mm_loras: Optional[dict[str, str]] = None - """Dictionary mapping specific modalities to LoRA model paths; this field - is only applicable to multimodal models and should be leveraged when a - model always expects a LoRA to be active when a given modality is present. - Note that currently, if a request provides multiple additional - modalities, each of which have their own LoRA, we do NOT apply - default_mm_loras because we currently only support one lora adapter - per prompt. When run in offline mode, the lora IDs for n modalities - will be automatically assigned to 1-n with the names of the modalities - in alphabetic order.""" - bias_enabled: bool = False - """[DEPRECATED] Enable bias for LoRA adapters. This option will be - removed in v0.12.0.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.max_lora_rank) - factors.append(self.max_loras) - factors.append(self.fully_sharded_loras) - factors.append(self.lora_dtype) - factors.append(self.lora_extra_vocab_size) - factors.append(self.lora_vocab_padding_size) - factors.append(self.bias_enabled) - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - # Deprecation warning for lora_extra_vocab_size - logger.warning( - "`lora_extra_vocab_size` is deprecated and will be removed " - "in v0.12.0. Additional vocabulary support for " - "LoRA adapters is being phased out.") - - # Deprecation warning for enable_lora_bias - if self.bias_enabled: - logger.warning("`enable_lora_bias` is deprecated " - "and will be removed in v0.12.0.") - - # Setting the maximum rank to 512 should be able to satisfy the vast - # majority of applications. 
- possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512) - possible_lora_extra_vocab_size = (256, 512) - if self.max_lora_rank not in possible_max_ranks: - raise ValueError( - f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}.") - if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: - raise ValueError( - f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}.") - if self.max_loras < 1: - raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") - if self.max_cpu_loras is None: - self.max_cpu_loras = self.max_loras - elif self.max_cpu_loras < self.max_loras: - raise ValueError( - f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})") - - def verify_with_cache_config(self, cache_config: CacheConfig): - if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1: - raise ValueError( - "V0 LoRA does not support CPU offload, please use V1.") - - def verify_with_model_config(self, model_config: ModelConfig): - if self.lora_dtype in (None, "auto"): - self.lora_dtype = model_config.dtype - elif isinstance(self.lora_dtype, str): - self.lora_dtype = getattr(torch, self.lora_dtype) - - @config @dataclass class MultiModalConfig: diff --git a/vllm/config/lora.py b/vllm/config/lora.py new file mode 100644 index 0000000000000..3fe28f5dad4fa --- /dev/null +++ b/vllm/config/lora.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union + +import torch +from pydantic import ConfigDict +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.platforms import current_platform + +if TYPE_CHECKING: + from vllm.config import ModelConfig + from vllm.config.cache import CacheConfig +else: + ModelConfig = Any + CacheConfig = Any + +logger = init_logger(__name__) + +LoRADType = Literal["auto", "float16", "bfloat16"] + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class LoRAConfig: + """Configuration for LoRA.""" + + max_lora_rank: int = 16 + """Max LoRA rank.""" + max_loras: int = 1 + """Max number of LoRAs in a single batch.""" + fully_sharded_loras: bool = False + """By default, only half of the LoRA computation is sharded with tensor + parallelism. Enabling this will use the fully sharded layers. At high + sequence length, max rank or tensor parallel size, this is likely faster. + """ + max_cpu_loras: Optional[int] = None + """Maximum number of LoRAs to store in CPU memory. Must be >= than + `max_loras`.""" + lora_dtype: Union[torch.dtype, LoRADType] = "auto" + """Data type for LoRA. If auto, will default to base model dtype.""" + lora_extra_vocab_size: int = 256 + """(Deprecated) Maximum size of extra vocabulary that can be present in a + LoRA adapter. Will be removed in v0.12.0.""" + lora_vocab_padding_size: ClassVar[int] = current_platform\ + .get_lora_vocab_padding_size() + default_mm_loras: Optional[dict[str, str]] = None + """Dictionary mapping specific modalities to LoRA model paths; this field + is only applicable to multimodal models and should be leveraged when a + model always expects a LoRA to be active when a given modality is present. 
+ Note that currently, if a request provides multiple additional + modalities, each of which have their own LoRA, we do NOT apply + default_mm_loras because we currently only support one lora adapter + per prompt. When run in offline mode, the lora IDs for n modalities + will be automatically assigned to 1-n with the names of the modalities + in alphabetic order.""" + bias_enabled: bool = False + """[DEPRECATED] Enable bias for LoRA adapters. This option will be + removed in v0.12.0.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.max_lora_rank) + factors.append(self.max_loras) + factors.append(self.fully_sharded_loras) + factors.append(self.lora_dtype) + factors.append(self.lora_extra_vocab_size) + factors.append(self.lora_vocab_padding_size) + factors.append(self.bias_enabled) + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self): + # Deprecation warning for lora_extra_vocab_size + logger.warning( + "`lora_extra_vocab_size` is deprecated and will be removed " + "in v0.12.0. Additional vocabulary support for " + "LoRA adapters is being phased out.") + + # Deprecation warning for enable_lora_bias + if self.bias_enabled: + logger.warning("`enable_lora_bias` is deprecated " + "and will be removed in v0.12.0.") + + # Setting the maximum rank to 512 should be able to satisfy the vast + # majority of applications. 
+ possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512) + possible_lora_extra_vocab_size = (256, 512) + if self.max_lora_rank not in possible_max_ranks: + raise ValueError( + f"max_lora_rank ({self.max_lora_rank}) must be one of " + f"{possible_max_ranks}.") + if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: + raise ValueError( + f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " + f"must be one of {possible_lora_extra_vocab_size}.") + if self.max_loras < 1: + raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") + if self.max_cpu_loras is None: + self.max_cpu_loras = self.max_loras + elif self.max_cpu_loras < self.max_loras: + raise ValueError( + f"max_cpu_loras ({self.max_cpu_loras}) must be >= " + f"max_loras ({self.max_loras})") + + def verify_with_cache_config(self, cache_config: CacheConfig): + if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1: + raise ValueError( + "V0 LoRA does not support CPU offload, please use V1.") + + def verify_with_model_config(self, model_config: ModelConfig): + if self.lora_dtype in (None, "auto"): + self.lora_dtype = model_config.dtype + elif isinstance(self.lora_dtype, str): + self.lora_dtype = getattr(torch, self.lora_dtype) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index d7864293e9647..92ebad778ea4b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional from typing import Sequence as GenericSequence from typing import Set, Tuple, Union -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, SchedulerConfig +from vllm.config.lora import LoRAConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6010a4647a0af..c53ece18964cb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) +from vllm.config.lora import LoRAConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 47f56e58130fa..3462142d9fb91 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -16,9 +16,9 @@ import torch from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config.lora import LoRAConfig from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 0e759d5d5719b..a80a033e39b40 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -7,7 +7,7 @@ 
import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig if TYPE_CHECKING: from vllm.lora.punica_wrapper import PunicaWrapperBase diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 4e062971d9188..85a1f86ce6bf2 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -6,7 +6,7 @@ from typing import Optional, cast import torch from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.distributed.utils import divide # yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index d2f8e05554c84..658fd23165da0 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index db974147ccca7..a50dcfa748f2f 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py index db922a02d40b0..3356297c1537a 100644 --- a/vllm/lora/layers/replicated_linear.py +++ b/vllm/lora/layers/replicated_linear.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.model_executor.layers.linear import ReplicatedLinear from .base_linear import BaseLinearLayerWithLoRA diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index bf1d9ae374f48..18ef6fd1ddd78 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index 192e154fe56a6..4d6218d970977 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -8,7 +8,7 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.platforms import current_platform diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 3072047a2606c..7712438054914 100644 --- 
a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel, from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, get_adapter, list_adapters, remove_adapter, set_adapter_mapping) -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 8b8e5cb7d5fae..dc7249c386021 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -9,7 +9,7 @@ import os from dataclasses import MISSING, dataclass, field, fields from typing import Literal, Optional, Union -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.tensorizer import TensorizerConfig diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 2b05a2cf4d40c..10ba390bffd9e 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -11,7 +11,7 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, from torch import nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.logger import init_logger # being imported for _all_lora_classes below # yapf conflicts with isort for this block diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 248d2954f1ef4..3a807b1e161d2 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker, list_adapters_worker, set_active_adapters_worker) from vllm.adapter_commons.worker_manager import AbstractWorkerManager -from vllm.config import LoRAConfig +from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 32551d8102f32..fd4d820a01e9d 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -29,7 +29,8 @@ from transformers import BartConfig from transformers.utils import logging from vllm.attention import Attention, AttentionType -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig +from vllm.config.lora import LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py index ae8220f9b9dc5..6b519cccd3cc6 100644 --- a/vllm/transformers_utils/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -5,7 +5,8 @@ from typing import Optional from typing_extensions import assert_never -from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig +from vllm.config import ModelConfig, SchedulerConfig +from vllm.config.lora import LoRAConfig from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, get_lora_tokenizer, diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 
4b5f27d27541b..f2ebd5e10210b 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -11,7 +11,8 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig +from vllm.config import ModelConfig, SchedulerConfig +from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest From 0cd72a7b72ed5021e148eb44d6ca41e0daff132c Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Thu, 11 Sep 2025 19:22:33 +0800 Subject: [PATCH 170/584] [XPU] add missing dependency tblib for XPU CI (#24639) Signed-off-by: Fanli Lin <fanli.lin@intel.com> --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index efcd10acf0b93..8c9b00990e995 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -30,6 +30,7 @@ docker run \ bash -c ' set -e echo $ZE_AFFINITY_MASK + pip install tblib==3.1.0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray From f946197473e20c46d51b013b8edb51a7ff7fd3b0 Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Thu, 11 Sep 2025 19:35:14 +0800 Subject: [PATCH 171/584] [Docs] Fixes a typo in the qwen3next model name. (#24654) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index de28ca37396eb..db3dd2c252c76 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -403,7 +403,7 @@ th { | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3NextForCausalLM` | Qwen3.5MoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. 
| | ✅︎ | ✅︎ | From f510715882304796a96e33028b4f6de1b026c2c7 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Thu, 11 Sep 2025 21:19:44 +0800 Subject: [PATCH 172/584] [build] add torch to tool.uv no-build-isolation-package (#24303) Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../installation/gpu/cuda.inc.md | 8 ++++---- pyproject.toml | 3 +++ use_existing_torch.py | 20 ++----------------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 275232e12e08c..01c5f5fc02f3e 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -165,14 +165,14 @@ There are scenarios where the PyTorch dependency cannot be easily installed with - Building vLLM with PyTorch nightly or a custom PyTorch build. - Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. -To build vLLM using an existing PyTorch installation: +To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation. ```bash +# install PyTorch first, either from PyPI or from source git clone https://github.com/vllm-project/vllm.git cd vllm -python use_existing_torch.py -uv pip install -r requirements/build.txt -uv pip install --no-build-isolation -e . +# pip install -e . does not work directly, only uv can do this +uv pip install -e . ``` ##### Use the local cutlass for compilation diff --git a/pyproject.toml b/pyproject.toml index 7aa0371f38dc5..416423abcad88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -345,3 +345,6 @@ extend-ignore-re = [] windo = "windo" [tool.typos.type.vimscript.extend-words] + +[tool.uv] +no-build-isolation-package = ["torch"] diff --git a/use_existing_torch.py b/use_existing_torch.py index a9f79e16981c4..b5aafdde16c28 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,21 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import glob - -requires_files = glob.glob('requirements/*.txt') -requires_files += ["pyproject.toml"] -for file in requires_files: - print(f">>> cleaning {file}") - with open(file) as f: - lines = f.readlines() - if "torch" in "".join(lines).lower(): - print("removed:") - with open(file, 'w') as f: - for line in lines: - if 'torch' not in line.lower(): - f.write(line) - else: - print(line.strip()) - print(f"<<< done cleaning {file}") - print() +print("vLLM is now using 'uv' to disable build isolation for 'torch'.") +print("Please instead install vLLM with 'uv pip install -e .' 
(must use 'uv')") From d11ec124a0409a78a23539212356510c79c2111e Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Thu, 11 Sep 2025 21:29:43 +0800 Subject: [PATCH 173/584] [Bench] Add qwen-next in benchmark_moe.py (#24661) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- benchmarks/kernels/benchmark_moe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 6259aa0dd6290..94f3f1ae11f27 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -594,7 +594,11 @@ def main(args: argparse.Namespace): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + elif config.architectures[0] in ( + "Qwen2MoeForCausalLM", + "Qwen3MoeForCausalLM", + "Qwen3NextForCausalLM", + ): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size From fd1ce98cdd1f4699c781d2f24b39f7b16a338886 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Thu, 11 Sep 2025 21:37:51 +0800 Subject: [PATCH 174/584] [CI] Split mteb test from Language Models Test (#24634) Signed-off-by: wang.yuqi <noooop@126.com> --- .buildkite/test-pipeline.yaml | 10 ++++++++++ .../pooling/correctness/test_mteb_embed.py | 7 +++---- .../pooling/correctness/test_mteb_score.py | 12 +++--------- tests/models/language/pooling_mteb_test/__init__.py | 0 .../{pooling => pooling_mteb_test}/mteb_utils.py | 0 .../{pooling => pooling_mteb_test}/test_baai.py | 10 ++++++---- .../test_bge_reranker_v2_gemma.py | 6 +++--- .../test_cross_encoder.py | 5 +++-- .../{pooling => pooling_mteb_test}/test_gte.py | 10 ++++++---- .../{pooling => pooling_mteb_test}/test_intfloat.py | 6 ++++-- .../{pooling => pooling_mteb_test}/test_jina.py | 9 +++++---- .../test_mxbai_rerank.py | 2 +- .../{pooling => pooling_mteb_test}/test_nomic.py | 6 ++++-- .../test_qwen3_reranker.py | 2 +- .../test_snowflake_arctic_embed.py | 6 ++++-- .../test_st_projector.py | 5 +++-- 16 files changed, 56 insertions(+), 40 deletions(-) create mode 100644 tests/models/language/pooling_mteb_test/__init__.py rename tests/models/language/{pooling => pooling_mteb_test}/mteb_utils.py (100%) rename tests/models/language/{pooling => pooling_mteb_test}/test_baai.py (93%) rename tests/models/language/{pooling => pooling_mteb_test}/test_bge_reranker_v2_gemma.py (96%) rename tests/models/language/{pooling => pooling_mteb_test}/test_cross_encoder.py (85%) rename tests/models/language/{pooling => pooling_mteb_test}/test_gte.py (94%) rename tests/models/language/{pooling => pooling_mteb_test}/test_intfloat.py (92%) rename tests/models/language/{pooling => pooling_mteb_test}/test_jina.py (92%) rename tests/models/language/{pooling => pooling_mteb_test}/test_mxbai_rerank.py (97%) rename tests/models/language/{pooling => pooling_mteb_test}/test_nomic.py (90%) rename tests/models/language/{pooling => pooling_mteb_test}/test_qwen3_reranker.py (98%) rename tests/models/language/{pooling => pooling_mteb_test}/test_snowflake_arctic_embed.py (94%) rename tests/models/language/{pooling => pooling_mteb_test}/test_st_projector.py (86%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3f6b67e45de17..adbd08b13f75b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -642,6 +642,16 @@ steps: commands: - pytest -v -s 
models/language/pooling -m 'not core_model' +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + - label: Multi-Modal Processor Test # 44min timeout_in_minutes: 60 source_file_dependencies: diff --git a/tests/entrypoints/pooling/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py index 1601c18d9b787..12a4875bdacfd 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_embed.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py @@ -4,10 +4,9 @@ import os import pytest -from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, - MTEB_EMBED_TOL, - OpenAIClientMtebEncoder, - run_mteb_embed_task) +from tests.models.language.pooling_mteb_test.mteb_utils import ( + MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder, + run_mteb_embed_task) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 417f85adc6e06..7c059d16b3863 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -4,15 +4,9 @@ import os import pytest -# yapf conflicts with isort for this block -# yapf: disable -from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS, - MTEB_RERANK_TASKS, - MTEB_RERANK_TOL, - RerankClientMtebEncoder, - ScoreClientMtebEncoder, - run_mteb_rerank) -# yapf: enable +from tests.models.language.pooling_mteb_test.mteb_utils import ( + MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, + RerankClientMtebEncoder, ScoreClientMtebEncoder, run_mteb_rerank) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/models/language/pooling_mteb_test/__init__.py b/tests/models/language/pooling_mteb_test/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py similarity index 100% rename from tests/models/language/pooling/mteb_utils.py rename to tests/models/language/pooling_mteb_test/mteb_utils.py diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py similarity index 93% rename from tests/models/language/pooling/test_baai.py rename to tests/models/language/pooling_mteb_test/test_baai.py index be8cb6fa76994..e131c9b1038de 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling_mteb_test/test_baai.py @@ -2,10 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, LASTPoolingEmbedModelInfo, - RerankModelInfo) -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py 
b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py similarity index 96% rename from tests/models/language/pooling/test_bge_reranker_v2_gemma.py rename to tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index fc888157b402a..1eca2a2c0abd9 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -7,9 +7,9 @@ import pytest import torch from tests.conftest import HfRunner - -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo -from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models +from tests.models.language.pooling_mteb_test.mteb_utils import ( + VllmMtebEncoder, mteb_test_rerank_models) +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo RERANK_MODELS = [ LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py similarity index 85% rename from tests/models/language/pooling/test_cross_encoder.py rename to tests/models/language/pooling_mteb_test/test_cross_encoder.py index b49908c9ce6a6..ad320fae0c85a 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo, - RerankModelInfo) +from tests.models.utils import (CLSPoolingRerankModelInfo, + LASTPoolingRerankModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py similarity index 94% rename from tests/models/language/pooling/test_gte.py rename to tests/models/language/pooling_mteb_test/test_gte.py index 98d215b0ad25e..9ae43fd05bf78 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -3,10 +3,12 @@ import pytest -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, LASTPoolingEmbedModelInfo, - RerankModelInfo) -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, RerankModelInfo) + from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py similarity index 92% rename from tests/models/language/pooling/test_intfloat.py rename to tests/models/language/pooling_mteb_test/test_intfloat.py index bc95475836e87..0d6026898ad4a 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling_mteb_test/test_intfloat.py @@ -2,8 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git 
a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py similarity index 92% rename from tests/models/language/pooling/test_jina.py rename to tests/models/language/pooling_mteb_test/test_jina.py index c4e4835556a54..0a77a78bb31b6 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -4,12 +4,13 @@ from functools import partial import pytest +from tests.models.language.pooling.embed_utils import ( + check_embeddings_close, correctness_test_embed_models, matryoshka_fy) +from tests.models.utils import (CLSPoolingEmbedModelInfo, + CLSPoolingRerankModelInfo, EmbedModelInfo, + RerankModelInfo) from vllm import PoolingParams -from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, - EmbedModelInfo, RerankModelInfo) -from .embed_utils import (check_embeddings_close, - correctness_test_embed_models, matryoshka_fy) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py similarity index 97% rename from tests/models/language/pooling/test_mxbai_rerank.py rename to tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index 1731c6ae6fff7..05ebb4ec4d3f5 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -6,8 +6,8 @@ import pytest import torch from tests.conftest import HfRunner +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models mxbai_rerank_hf_overrides = { diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py similarity index 90% rename from tests/models/language/pooling/test_nomic.py rename to tests/models/language/pooling_mteb_test/test_nomic.py index 52a8ce6e6671f..61512fd0dff18 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling_mteb_test/test_nomic.py @@ -3,8 +3,10 @@ import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py similarity index 98% rename from tests/models/language/pooling/test_qwen3_reranker.py rename to tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index ebdacf9d0c673..65403081dc0f8 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -6,9 +6,9 @@ import pytest import torch from tests.conftest import HfRunner +from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.utils import multi_gpu_test -from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo from .mteb_utils import mteb_test_rerank_models qwen3_reranker_hf_overrides = { diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py similarity index 94% rename from 
tests/models/language/pooling/test_snowflake_arctic_embed.py rename to tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py index 864f3d75ef5aa..91bad2c4e42fc 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py @@ -3,8 +3,10 @@ import pytest -from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo -from .embed_utils import correctness_test_embed_models +from tests.models.language.pooling.embed_utils import ( + correctness_test_embed_models) +from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo + from .mteb_utils import mteb_test_embed_models MODELS = [ diff --git a/tests/models/language/pooling/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py similarity index 86% rename from tests/models/language/pooling/test_st_projector.py rename to tests/models/language/pooling_mteb_test/test_st_projector.py index 9301e705c4335..bd493e7e2ba09 100644 --- a/tests/models/language/pooling/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -2,8 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo) +from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo) + from .mteb_utils import mteb_test_embed_models # ST models with projector (Dense) layers From 94e6b2d55f7fd539f01591c7521ec104b25f4e68 Mon Sep 17 00:00:00 2001 From: Boyuan Feng <boyuan@meta.com> Date: Thu, 11 Sep 2025 06:41:07 -0700 Subject: [PATCH 175/584] Allow users to specify kv cache memory size (#21489) Signed-off-by: Boyuan Feng <boyuan@meta.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/cache.py | 9 ++ vllm/engine/arg_utils.py | 12 ++- vllm/engine/llm_engine.py | 3 +- vllm/entrypoints/llm.py | 10 ++ vllm/utils/__init__.py | 5 +- vllm/v1/utils.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/gpu_worker.py | 84 +++++++++++++++-- vllm/worker/model_runner.py | 6 +- vllm/worker/worker.py | 146 ++++++++++++++++++++++------- 10 files changed, 236 insertions(+), 47 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index bf85aad452d0f..4c4e39c37ee50 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -113,6 +113,15 @@ class CacheConfig: necessary for implementing this optimization in some models (e.g. Gemma3n) """ + kv_cache_memory_bytes: Optional[int] = None + """Size of KV Cache per GPU in bytes. By default, this is set to None + and vllm can automatically infer the kv cache size based on + gpu_memory_utilization. However, users may want to manually specify + the kv cache memory size. kv_cache_memory_bytes allows more fine-grained + control of how much memory gets used when compared with using + gpu_memory_utilization. 
Note that kv_cache_memory_bytes + (when not None) ignores gpu_memory_utilization.""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index be456af4d19d6..ba1543a8d5a30 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -227,8 +227,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: elif contains_type(type_hints, int): kwargs[name]["type"] = int # Special case for large integers - if name in {"max_model_len", "max_num_batched_tokens"}: + human_readable_ints = { + "max_model_len", + "max_num_batched_tokens", + "kv_cache_memory_bytes", + } + if name in human_readable_ints: kwargs[name]["type"] = human_readable_int + kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}" elif contains_type(type_hints, float): kwargs[name]["type"] = float elif (contains_type(type_hints, dict) @@ -335,6 +341,7 @@ class EngineArgs: swap_space: float = CacheConfig.swap_space cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization + kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes max_num_batched_tokens: Optional[ int] = SchedulerConfig.max_num_batched_tokens max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills @@ -734,6 +741,8 @@ class EngineArgs: cache_group.add_argument("--block-size", **cache_kwargs["block_size"]) cache_group.add_argument("--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]) + cache_group.add_argument("--kv-cache-memory-bytes", + **cache_kwargs["kv_cache_memory_bytes"]) cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"]) cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"]) @@ -1174,6 +1183,7 @@ class EngineArgs: cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, + kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, cache_dtype=self.kv_cache_dtype, is_attention_free=model_config.is_attention_free, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3462142d9fb91..c303d093f6324 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -278,7 +278,8 @@ class LLMEngine: self.cache_config.block_size, "gpu_memory_utilization": self.cache_config.gpu_memory_utilization, - + "kv_cache_memory_bytes": + self.cache_config.kv_cache_memory_bytes, # Quantization "quantization": self.model_config.quantization, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e6fd61ae1aad9..a6174161f115a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -110,6 +110,14 @@ class LLM: values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of- memory (OOM) errors. + kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default, + this is set to None and vllm can automatically infer the kv cache + size based on gpu_memory_utilization. However, users may want to + manually specify the kv cache memory size. kv_cache_memory_bytes + allows more fine-grained control of how much memory gets used when + compared with using gpu_memory_utilization. Note that + kv_cache_memory_bytes (when not None) ignores + gpu_memory_utilization. swap_space: The size (GiB) of CPU memory per GPU to use as swap space. 
This can be used for temporarily storing the states of the requests when their `best_of` sampling parameters are larger than 1. If all @@ -184,6 +192,7 @@ class LLM: hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, + kv_cache_memory_bytes: Optional[int] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, logits_processors: Optional[list[Union[str, @@ -251,6 +260,7 @@ class LLM: tokenizer_revision=tokenizer_revision, seed=seed, gpu_memory_utilization=gpu_memory_utilization, + kv_cache_memory_bytes=kv_cache_memory_bytes, swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, enforce_eager=enforce_eager, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index e166e64e11016..ea860ca10b3a8 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2791,7 +2791,10 @@ def memory_profiling( result.torch_peak_increase = diff_profile.torch_peak result.non_torch_increase = diff_from_create.non_torch_memory result.profile_time = diff_profile.timestamp - result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + + non_torch_memory = result.non_torch_increase + peak_activation_memory = result.torch_peak_increase + result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index e0c7d9094aa6d..fd84b4a111f58 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -355,7 +355,8 @@ def report_usage_stats( vllm_config.cache_config.block_size, "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, - + "kv_cache_memory_bytes": + vllm_config.cache_config.kv_cache_memory_bytes, # Quantization "quantization": vllm_config.model_config.quantization, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1b785af96a9ab..ebb18e81c38a8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3041,12 +3041,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.encoder_cache.clear() gc.collect() - def capture_model(self) -> None: + def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: logger.warning( "Skipping CUDA graph capture. To turn on CUDA graph capture, " "ensure `cudagraph_mode` was not manually set to `NONE`") - return + return 0 else: self.initialize_cudagraph_capture() @@ -3117,6 +3117,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # This usually takes 5~20 seconds. logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) + return cuda_graph_size def _capture_cudagraphs(self, compilation_cases: list[int], cudagraph_runtime_mode: CUDAGraphMode, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 726f59603437f..37dd431fd68f8 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -231,18 +231,40 @@ class Worker(WorkerBase): You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameter. 
""" + GiB = lambda b: b / GiB_bytes + if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes: + # still need a profile run which compiles the model for + # max_num_batched_tokens + self.model_runner.profile_run() + + msg = ( + f"Initial free memory {GiB(self.init_snapshot.free_memory)} " + f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for " + "KV Cache as specified by kv_cache_memory_bytes config and " + "skipped memory profiling. This does does not respect the " + "gpu_memory_utilization config. Only use kv_cache_memory_bytes " + "config when you want manual control of KV cache memory " + "size. If OOM'ed, check the difference of initial free " + "memory between the current run and the previous run " + "where kv_cache_memory_bytes is suggested and update it " + "correspondingly.") + logger.info(msg) + return kv_cache_memory_bytes + torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() - GiB = lambda b: b / GiB_bytes # Execute a forward pass with dummy inputs to profile the memory usage # of the model. with memory_profiling( self.init_snapshot, - weights_memory=int( - self.model_runner.model_memory_usage)) as profile_result: + weights_memory=int(self.model_runner.model_memory_usage), + ) as profile_result: self.model_runner.profile_run() + self.non_torch_memory = profile_result.non_torch_increase + self.peak_activation_memory = profile_result.torch_peak_increase + free_gpu_memory = profile_result.after_profile.free_memory # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. @@ -254,7 +276,7 @@ class Worker(WorkerBase): "release GPU memory while vLLM is profiling during initialization. " "To fix this, ensure consistent GPU memory allocation or " "isolate vLLM in its own container.") - available_kv_cache_memory = self.requested_memory \ + self.available_kv_cache_memory_bytes = self.requested_memory \ - profile_result.non_kv_cache_memory unrequested_memory = self.init_snapshot.free_memory \ @@ -274,10 +296,10 @@ class Worker(WorkerBase): ) logger.debug(profile_result) logger.info("Available KV cache memory: %.2f GiB", - GiB(available_kv_cache_memory)) + GiB(self.available_kv_cache_memory_bytes)) gc.collect() - return int(available_kv_cache_memory) + return int(self.available_kv_cache_memory_bytes) def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() @@ -317,8 +339,56 @@ class Worker(WorkerBase): # cuda graph capture. kernel_warmup(self) + cuda_graph_memory_bytes = 0 if not self.model_config.enforce_eager: - self.model_runner.capture_model() + cuda_graph_memory_bytes = self.model_runner.capture_model() + + if (self.cache_config.kv_cache_memory_bytes is None + and hasattr(self, "peak_activation_memory")): + # Suggests optimal kv cache memory size if we rely on + # memory_profiling to guess the kv cache memory size which + # provides peak_activation_memory and a few other memory + # consumption. `memory_profiling` does not consider + # CUDAGraph memory size and may not utilize all gpu memory. + # Users may want fine-grained control to specify kv cache + # memory size. + GiB = lambda b: round(b / GiB_bytes, 2) + + # empirically observed that the memory profiling may + # slightly underestimate the memory consumption. + # So leave a small buffer (=150MiB) to avoid OOM. 
+ redundancy_buffer_memory = 150 * (1 << 20) + non_kv_cache_memory = (self.model_runner.model_memory_usage + + self.peak_activation_memory + + self.non_torch_memory + + cuda_graph_memory_bytes) + kv_cache_memory_bytes_to_gpu_limit = ( + self.init_snapshot.free_memory - non_kv_cache_memory - + redundancy_buffer_memory) + kv_cache_memory_bytes_to_requested_limit = ( + int(self.requested_memory) - non_kv_cache_memory - + redundancy_buffer_memory) + + msg = ( + f"Free memory on device " + f"({GiB(self.init_snapshot.free_memory)}/" + f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. " + f"Desired GPU memory utilization is " + f"({self.cache_config.gpu_memory_utilization}, " + f"{GiB(self.requested_memory)} GiB). " + f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " + f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " + f"for peak activation, {GiB(self.non_torch_memory)} GiB " + f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " + f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " + f"config with `--kv-cache-memory-bytes=" + f"{kv_cache_memory_bytes_to_requested_limit}` to fit into " + f"requested memory, or `--kv-cache-memory-bytes=" + f"{kv_cache_memory_bytes_to_gpu_limit}` to fully " + f"utilize gpu memory. Current kv cache memory in use is " + f"{int(self.available_kv_cache_memory_bytes)} bytes.") + + logger.info(msg) # Warm up sampler and preallocate memory buffer for logits and other # sampling related tensors of max possible shape to avoid memory diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f05401fd01327..88f83c9dd7e6c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1337,8 +1337,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): return self.lora_manager.list_adapters() @torch.inference_mode() - def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: - """Cuda graph capture a model. + def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> int: + """Cuda graph capture a model and return cudagraph memory + consumption in bytes. Note that CUDA graph's performance gain is negligible if number of batched tokens are larger than 200. And since CUDA graph @@ -1505,6 +1506,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): # This usually takes < 10 seconds. logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / GiB_bytes) + return cuda_graph_size def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 142b1afce8c3f..670f256c0bf65 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -229,6 +229,67 @@ class Worker(LocalOrDistributedWorkerBase): self.model_runner.save_tensorized_model( tensorizer_config=tensorizer_config, ) + @torch.inference_mode() + def determine_available_kv_cache_memory(self, + total_gpu_memory: int) -> float: + if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes: + # still need a profile run which compiles the model for + # max_num_batched_tokens + self.model_runner.profile_run() + + GiB = lambda b: b / GiB_bytes + msg = ( + f"Initial free memory " + f"{GiB(self.baseline_snapshot.free_memory):.2f} " + f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for " + "KV Cache as specified by kv_cache_memory_bytes config and " + "skipped memory profiling. This does not respect the " + "gpu_memory_utilization config. 
Only use kv_cache_memory_bytes " + "config when you want manual control of KV cache memory " + "size. If OOM'ed, check the difference of initial free " + "memory between the current run and the previous run " + "where kv_cache_memory_bytes is suggested and update it " + "correspondingly.") + logger.info(msg) + return self.cache_config.kv_cache_memory_bytes + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + with memory_profiling( + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage) as result: + self.model_runner.profile_run() + + self.non_torch_memory = result.non_torch_increase + self.peak_activation_memory = result.torch_peak_increase + + self._assert_memory_footprint_increased_during_profiling() + + self.requested_memory = total_gpu_memory * \ + self.cache_config.gpu_memory_utilization + + self.available_kv_cache_memory = (self.requested_memory - + result.non_kv_cache_memory) + + msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" + "the current vLLM instance can use " + "total_gpu_memory " + f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" + " x gpu_memory_utilization " + f"({self.cache_config.gpu_memory_utilization:.2f})" + f" = {(self.requested_memory / GiB_bytes):.2f}GiB\n" + "model weights take " + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" + " non_torch_memory takes " + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" + " PyTorch activation peak memory takes " + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" + " the rest of the memory reserved for KV Cache is " + f"{(self.available_kv_cache_memory / GiB_bytes):.2f}GiB.") + + logger.info(msg) + return self.available_kv_cache_memory + @torch.inference_mode() def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many @@ -248,20 +309,8 @@ class Worker(LocalOrDistributedWorkerBase): torch.cuda.reset_peak_memory_stats() free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - with memory_profiling( - self.baseline_snapshot, - weights_memory=self.model_runner.model_memory_usage) as result: - self.model_runner.profile_run() - - self._assert_memory_footprint_increased_during_profiling() - - memory_for_current_instance = total_gpu_memory * \ - self.cache_config.gpu_memory_utilization - available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory) + available_kv_cache_memory = self.determine_available_kv_cache_memory( + total_gpu_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. 
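For orientation, a minimal sketch of the block-count arithmetic this KV cache budget feeds into; the `cache_block_size` and swap values below are hypothetical, not taken from this patch:

```python
# Sketch of the split performed once the KV cache budget is known,
# assuming a hypothetical per-block footprint in bytes.
GiB = 1 << 30

def split_kv_budget(available_kv_cache_memory: float,
                    cache_block_size: int,
                    swap_space_bytes: int) -> tuple[int, int]:
    # GPU blocks come out of the profiled (or user-specified) budget;
    # CPU blocks are carved out of the configured swap space.
    num_gpu_blocks = max(int(available_kv_cache_memory // cache_block_size), 0)
    num_cpu_blocks = max(int(swap_space_bytes // cache_block_size), 0)
    return num_gpu_blocks, num_cpu_blocks

# e.g. a 16 GiB KV budget with 2 MiB blocks and 4 GiB of swap
print(split_kv_budget(16 * GiB, 2 << 20, 4 * GiB))  # -> (8192, 2048)
```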
@@ -276,23 +325,6 @@ class Worker(LocalOrDistributedWorkerBase): num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") - - logger.info(msg) # Final cleanup gc.collect() @@ -382,8 +414,58 @@ class Worker(LocalOrDistributedWorkerBase): for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) + + cuda_graph_memory_bytes = 0 if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.gpu_cache) + cuda_graph_memory_bytes = self.model_runner.capture_model( + self.gpu_cache) + + if (self.cache_config.kv_cache_memory_bytes is None + and hasattr(self, "peak_activation_memory")): + # Suggests optimal kv cache memory size if we rely on + # memory_profiling to guess the kv cache memory size which + # provides peak_activation_memory and a few other memory + # consumption. `memory_profiling` does not consider + # CUDAGraph memory size and may not utilize all gpu memory. + # Users may want fine-grained control to specify kv cache + # memory size. + GiB = lambda b: round(b / GiB_bytes, 2) + non_kv_cache_memory = (self.model_runner.model_memory_usage + + self.peak_activation_memory + + self.non_torch_memory + + cuda_graph_memory_bytes) + + # empirically observed that the memory profiling may + # slightly underestimate the memory consumption. + # So leave a small buffer (=150MiB) to avoid OOM. + redundancy_buffer_memory = 150 * (1 << 20) + kv_cache_memory_bytes_to_gpu_limit = ( + self.baseline_snapshot.free_memory - non_kv_cache_memory - + redundancy_buffer_memory) + kv_cache_memory_bytes_to_requested_limit = ( + int(self.requested_memory) - non_kv_cache_memory - + redundancy_buffer_memory) + + msg = ( + f"Free memory on device " + f"({GiB(self.baseline_snapshot.free_memory)}/" + f"{GiB(self.baseline_snapshot.total_memory)} GiB) on startup. " + f"Desired GPU memory utilization is " + f"({self.cache_config.gpu_memory_utilization}, " + f"{GiB(self.requested_memory)} GiB). " + f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " + f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " + f"for peak activation, {GiB(self.non_torch_memory)} GiB " + f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " + f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " + f"config with `--kv-cache-memory-bytes=" + f"{kv_cache_memory_bytes_to_requested_limit}` to fit into " + f"requested memory, or `--kv-cache-memory-bytes=" + f"{kv_cache_memory_bytes_to_gpu_limit}` to fully " + f"utilize gpu memory. Current kv cache memory in use is " + f"{int(self.available_kv_cache_memory)} bytes.") + logger.info(msg) + # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
set_random_seed(self.model_config.seed) From 4f6593b058dc7ba66d887442ba5763c6c1b3886e Mon Sep 17 00:00:00 2001 From: Mengqing Cao <cmq0113@163.com> Date: Thu, 11 Sep 2025 21:47:58 +0800 Subject: [PATCH 176/584] [HybridKVCache][Platform] Add support_hybrid_kv_cache for platform (#24646) Signed-off-by: MengqingCao <cmq0113@163.com> --- vllm/config/__init__.py | 3 +-- vllm/platforms/cpu.py | 4 ++++ vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 4 ++++ 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 8a75b28f38a54..24eaf2e360ab5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3529,8 +3529,7 @@ class VllmConfig: # logger should only print warning message for hybrid models. As we # can't know whether the model is hybrid or not now, so we don't log # warning message here and will log it later. - if not (current_platform.is_cuda() or current_platform.is_rocm() - or current_platform.is_cpu()): + if not current_platform.support_hybrid_kv_cache(): # Hybrid KV cache manager is not supported on non-GPU platforms. self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_transfer_config is not None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 31d626a9e9667..c5b6d91a62b6d 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -347,3 +347,7 @@ class CpuPlatform(Platform): @classmethod def opaque_attention_op(cls) -> bool: return True + + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index dc94cfcc3ce87..52b33849dae6b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -571,6 +571,10 @@ class CudaPlatformBase(Platform): "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.") + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index ab0eaa82ef201..59aa468185698 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -586,6 +586,13 @@ class Platform: """ raise NotImplementedError + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + """ + Returns if the hybrid kv cache is supported by the current platform. + """ + return False + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index fd899dcc9a65c..f4d136c5e0aaa 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -498,3 +498,7 @@ class RocmPlatform(Platform): f"Your {gpu_name} GPU {compute_str}. 
" "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.") + + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True From 817beef7f3860cb52047c59578ea6eee5426ce67 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Thu, 11 Sep 2025 22:26:17 +0800 Subject: [PATCH 177/584] [Bugifx] Fix qwen-next packed_modules_mapping (#24656) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/qwen3_next.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 12db3d9461ef8..55c16c462885d 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1053,7 +1053,7 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, "k_proj", "v_proj", ], - "gate_up_proj": ["up_proj", "down_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], "in_proj": ["in_proj_qkvz", "in_proj_ba"], } From 404c85ca7290d314bbbf4d130bb55becc437c4c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Thu, 11 Sep 2025 16:39:01 +0200 Subject: [PATCH 178/584] [Docs] Add transcription support to model (#24664) Signed-off-by: NickLucche <nlucches@redhat.com> --- docs/.nav.yml | 1 + docs/contributing/model/README.md | 1 + docs/contributing/model/transcription.md | 273 +++++++++++++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 docs/contributing/model/transcription.md diff --git a/docs/.nav.yml b/docs/.nav.yml index 327755582a821..c103ed476d76d 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -44,6 +44,7 @@ nav: - contributing/model/registration.md - contributing/model/tests.md - contributing/model/multimodal.md + - contributing/model/transcription.md - CI: contributing/ci - Design Documents: design - API Reference: diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index 0ca77fa499db7..6c013738ac1ec 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -15,6 +15,7 @@ Read through these pages for a step-by-step guide: - [Registering a Model](registration.md) - [Unit Testing](tests.md) - [Multi-Modal Support](multimodal.md) +- [Speech-to-Text Support](transcription.md) !!! tip If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md new file mode 100644 index 0000000000000..bba85adf55916 --- /dev/null +++ b/docs/contributing/model/transcription.md @@ -0,0 +1,273 @@ +# Speech-to-Text (Transcription/Translation) Support + +This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription]. +Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance. + +## 1. Update the base vLLM model + +It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods. + +- Declare supported languages and capabilities: + + ??? 
code + + ```python + from typing import ClassVar, Mapping, Optional, Literal + import numpy as np + import torch + from torch import nn + + from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.inputs.data import PromptType + from vllm.model_executor.models.interfaces import SupportsTranscription + + class YourASRModel(nn.Module, SupportsTranscription): + # Map of ISO 639-1 language codes to language names + supported_languages: ClassVar[Mapping[str, str]] = { + "en": "English", + "it": "Italian", + # ... add more as needed + } + + # If your model only supports audio-conditioned generation + # (no text-only generation), enable this flag. + supports_transcription_only: ClassVar[bool] = True + ``` + + - The `supported_languages` mapping is validated at init time. + - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). + +- Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. + This is for controlling general behavior of the API when serving your model: + + ??? code + + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_speech_to_text_config( + cls, + model_config: ModelConfig, + task_type: Literal["transcribe", "translate"], + ) -> SpeechToTextConfig: + return SpeechToTextConfig( + sample_rate=16_000, + max_audio_clip_s=30, + # Set to None to disable server-side chunking if your + # model/processor handles it already + min_energy_split_window_size=None, + ) + ``` + + See the “Audio preprocessing and chunking” section for what each field controls. + +- Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns: + + #### A. Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) + + Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: + + ??? code + + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + # Example with a free-form instruction prompt + task_word = "Transcribe" if task_type == "transcribe" else "Translate" + prompt = ( + "<start_of_turn>user\n" + f"{task_word} this audio: <audio_soft_token>" + "<end_of_turn>\n<start_of_turn>model\n" + ) + + return { + "multi_modal_data": {"audio": (audio, stt_config.sample_rate)}, + "prompt": prompt, + } + ``` + + For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md). + + #### B. Encoder–decoder audio-only (e.g., Whisper) + + Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: + + ??? code + + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... 
+ + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + if language is None: + raise ValueError("Language must be specified") + + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (audio, stt_config.sample_rate), + }, + }, + "decoder_prompt": ( + (f"<|prev|>{request_prompt}" if request_prompt else "") + + f"<|startoftranscript|><|{language}|>" + + f"<|{task_type}|><|notimestamps|>" + ), + } + return cast(PromptType, prompt) + ``` + + +- (Optional) Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language] + + If your model requires a language and you want a default, override this method (see Whisper): + + ```python + @classmethod + def validate_language(cls, language: Optional[str]) -> Optional[str]: + if language is None: + logger.warning( + "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") + language = "en" + return super().validate_language(language) + ``` + +- (Optional) Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens] + + Provide a fast duration→token estimate to improve streaming usage statistics: + + ??? code + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_num_audio_tokens( + cls, + audio_duration_s: float, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + ) -> Optional[int]: + # Return None if unknown; otherwise return an estimate. + return int(audio_duration_s * stt_config.sample_rate // 320) # example + ``` + + +## 2. Audio preprocessing and chunking + +The API server takes care of basic audio I/O and optional chunking before building prompts: + +- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`. +- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`. +- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words. + +Relevant server logic: +??? code + ```python + # vllm/entrypoints/openai/speech_to_text.py + async def _preprocess_speech_to_text(...): + language = self.model_cls.validate_language(request.language) + ... + y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate) + duration = librosa.get_duration(y=y, sr=sr) + do_split_audio = (self.asr_config.allow_audio_chunking + and duration > self.asr_config.max_audio_clip_s) + chunks = [y] if not do_split_audio else self._split_audio(y, int(sr)) + prompts = [] + for chunk in chunks: + prompt = self.model_cls.get_generation_prompt( + audio=chunk, + stt_config=self.asr_config, + model_config=self.model_config, + language=language, + task_type=self.task_type, + request_prompt=request.prompt, + to_language=to_language, + ) + prompts.append(prompt) + return prompts, duration + ``` + +## 3. 
Exposing tasks automatically + +- vLLM automatically advertises transcription support if your model implements the interface: + +```python +if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + supported_tasks.append("transcription") +``` + +- When enabled, the server initializes the transcription and translation handlers: + +```python +state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None +state.openai_serving_translation = OpenAIServingTranslation(...) if "transcription" in supported_tasks else None +``` + +No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`. + +## 4. Examples in-tree + +- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py> +- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py> +- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py> + +## 5. Test with the API + +Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI): + +- Transcription (ASR): + + ```bash + curl -s -X POST \ + -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F "file=@/path/to/audio.wav" \ + -F "model=$MODEL_ID" \ + http://localhost:8000/v1/audio/transcriptions + ``` + +- Translation (source → English unless otherwise supported): + + ```bash + curl -s -X POST \ + -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F "file=@/path/to/audio.wav" \ + -F "model=$MODEL_ID" \ + http://localhost:8000/v1/audio/translations + ``` +Or check out more examples in <gh-file:examples/online_serving>. + +!!! note + +- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. +- Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass. +- For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. \ No newline at end of file From 4984a291d55bf6ec6b60c198cd1398315a9a2958 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:05:59 -0400 Subject: [PATCH 179/584] [Doc] Fix Markdown Pre-commit Error (#24670) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- docs/contributing/model/transcription.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index bba85adf55916..cf25ad5bbbce3 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -37,7 +37,7 @@ It is assumed you have already implemented your model in vLLM according to the b - The `supported_languages` mapping is validated at init time. - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). -- Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. +- Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. 
This is for controlling general behavior of the API when serving your model: ??? code @@ -65,7 +65,7 @@ It is assumed you have already implemented your model in vLLM according to the b - Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns: - #### A. Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) +### A. Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: @@ -102,7 +102,7 @@ It is assumed you have already implemented your model in vLLM according to the b For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md). - #### B. Encoder–decoder audio-only (e.g., Whisper) +### B. Encoder–decoder audio-only (e.g., Whisper) Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: @@ -142,7 +142,6 @@ It is assumed you have already implemented your model in vLLM according to the b return cast(PromptType, prompt) ``` - - (Optional) Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language] If your model requires a language and you want a default, override this method (see Whisper): @@ -177,7 +176,6 @@ It is assumed you have already implemented your model in vLLM according to the b return int(audio_duration_s * stt_config.sample_rate // 320) # example ``` - ## 2. Audio preprocessing and chunking The API server takes care of basic audio I/O and optional chunking before building prompts: @@ -264,10 +262,11 @@ Once your model implements `SupportsTranscription`, you can test the endpoints ( -F "model=$MODEL_ID" \ http://localhost:8000/v1/audio/translations ``` + Or check out more examples in <gh-file:examples/online_serving>. !!! note - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass. -- For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. \ No newline at end of file +- For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. From 51d41265ad841a3b6efea665c83cdc5d54eb7c1d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:07:23 +0100 Subject: [PATCH 180/584] [Docs] Fix typos in EP deployment doc (#24669) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/serving/expert_parallel_deployment.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index f8701870864dc..494d2ad021e71 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -158,10 +158,10 @@ vllm serve Qwen/Qwen3-30B-A3B \ ### Memory Footprint Overhead -EPLB uses redundant experts to that need to fit in GPU memory. This means that EPLB may not be a good fit for memory constrained environments or when KV cache space is at a premium. 
+EPLB uses redundant experts that need to fit in GPU memory. This means that EPLB may not be a good fit for memory constrained environments or when KV cache space is at a premium.
 
 This overhead equals `NUM_MOE_LAYERS * BYTES_PER_EXPERT * (NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS`.
 
-For DeepSeekV3, this is approximately `2.4 GB` for one redundant expert per rank.
+For DeepSeekV3, this is approximately `2.4 GB` for one redundant expert per EP rank.
 
 ### Example Command
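As a quick sanity check on that formula: with one redundant expert per EP rank, `NUM_REDUNDANT_EXPERTS == NUM_EP_RANKS`, so the per-rank overhead reduces to `NUM_MOE_LAYERS * BYTES_PER_EXPERT`. A minimal sketch of the arithmetic, where the DeepSeek-V3 layer count and per-expert size are illustrative assumptions, not figures from the patch:

```python
# One redundant expert per EP rank: the division by NUM_EP_RANKS cancels,
# so each rank pays for one extra expert per MoE layer.
num_moe_layers = 58            # assumed MoE layer count for DeepSeek-V3
bytes_per_expert = 44 * 2**20  # assumed ~44 MB per FP8 expert

overhead_bytes = num_moe_layers * bytes_per_expert
print(f"{overhead_bytes / 2**30:.1f} GiB")  # ~2.5 GiB, close to the ~2.4 GB above
```

From bcbe2a4d9e792dea8b76ac1a006aa8596af02c41 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 12 Sep 2025 00:44:34 +0800
Subject: [PATCH 181/584] [VLM] Optimize GLM4.5-V-style video processing to
 only decode necessary frames (#24161)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../multimodal/processing/test_glm4_1v.py    |  47 ++++++++
 tests/multimodal/test_utils.py               |  26 +++++
 vllm/assets/video.py                         |  16 +--
 vllm/model_executor/models/glm4_1v.py        | 100 +++++++++++-------
 vllm/multimodal/video.py                     |  99 +++++++++++++++--
 5 files changed, 233 insertions(+), 55 deletions(-)

diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index a49842e1099c2..dfb8d9b2a038d 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -5,6 +5,7 @@ import pytest
 
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
 
 from ...utils import build_model_context
 
@@ -50,3 +51,49 @@ def test_processor_override(
 
     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
+
+
+@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("fps", [2])
+def test_video_loader_consistency(
+    model_id: str,
+    fps: int,
+):
+    """
+    Ensure dynamic video loader (pre-sampled by loader) and normal video
+    loader (post-sampled by processor) produce the same video processing outputs.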
+ """ + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"video": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + hf_processor_mm_kwargs = {"fps": fps} + + # Build the image str / prompt based on the number of images we pass + prompt = "<|begin_of_video|><|video|><|end_of_video|>" + + video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path + with open(video_path, "rb") as f: + video_bytes = f.read() + + static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) + dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( + video_bytes, requested_fps=fps) + + # pre-sampled loader shouldn't read all frames + assert len(dynamic_video) < len(static_video) + + static_mm_data = {"video": [(static_video, static_metadata)]} + dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]} + + static_outputs = processor.apply(prompt, static_mm_data, + hf_processor_mm_kwargs) + dynamic_outputs = processor.apply(prompt, dynamic_mm_data, + hf_processor_mm_kwargs) + + assert static_outputs["prompt_token_ids"] == dynamic_outputs[ + "prompt_token_ids"] + assert static_outputs["mm_kwargs"].get_data( + ) == dynamic_outputs["mm_kwargs"].get_data() diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 886582a516409..e1e8282dd66d4 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async +@pytest.mark.asyncio +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +@pytest.mark.parametrize("max_duration", [1, 60, 1800]) +@pytest.mark.parametrize("requested_fps", [2, 24]) +async def test_fetch_video_http_with_dynamic_loader( + video_url: str, max_duration: int, requested_fps: int, + monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic") + connector = MediaConnector( + media_io_kwargs={ + "video": { + "max_duration": max_duration, + "requested_fps": requested_fps, + } + }) + + video_sync, metadata_sync = connector.fetch_video(video_url) + video_async, metadata_async = await connector.fetch_video_async( + video_url) + + assert np.array_equal(video_sync, video_async) + assert metadata_sync == metadata_async + assert metadata_sync["video_backend"] == "opencv_dynamic" + + # Used for `test_argsort_mm_positions`. 
class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 8ab0e9760be87..983e9114cccfb 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -110,22 +110,23 @@ class VideoAsset: def filename(self) -> str: return self._NAME_TO_FILE[self.name] + @property + def video_path(self) -> str: + return download_video_asset(self.filename) + @property def pil_images(self) -> list[Image.Image]: - video_path = download_video_asset(self.filename) - ret = video_to_pil_images_list(video_path, self.num_frames) + ret = video_to_pil_images_list(self.video_path, self.num_frames) return ret @property def np_ndarrays(self) -> npt.NDArray: - video_path = download_video_asset(self.filename) - ret = video_to_ndarrays(video_path, self.num_frames) + ret = video_to_ndarrays(self.video_path, self.num_frames) return ret @property def metadata(self) -> dict[str, Any]: - video_path = download_video_asset(self.filename) - ret = video_get_metadata(video_path) + ret = video_get_metadata(self.video_path) return ret def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray: @@ -134,5 +135,4 @@ class VideoAsset: See also: examples/offline_inference/qwen2_5_omni/only_thinker.py """ - video_path = download_video_asset(self.filename) - return librosa.load(video_path, sr=sampling_rate)[0] + return librosa.load(self.video_path, sr=sampling_rate)[0] diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 846b4b800dd59..539381b618000 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1023,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo): selected_timestamps.append(timestamps_list[idx]) return selected_timestamps + def _construct_video_placeholder( + self, + video_array: np.ndarray, + metadata: dict[str, Any], + grid_thw: torch.Tensor, + ) -> str: + hf_processor = self.get_hf_processor() + tokenizer = self.get_tokenizer() + image_processor = hf_processor.image_processor + + hf_config = self.get_hf_config() + boi_token_id = hf_config.image_start_token_id + eoi_token_id = hf_config.image_end_token_id + bov_token_id = hf_config.video_start_token_id + eov_token_id = hf_config.video_end_token_id + merge_length = image_processor.merge_size**2 + + assert isinstance(grid_thw, torch.Tensor) + timestamps = self._get_video_second_idx(metadata, len(video_array)) + frames_idx_token = [ + tokenizer.encode(str(i), add_special_tokens=False) + for i in timestamps + ] + T, H, W = grid_thw + num_tokens_per_frame = int(H * W) // merge_length + placeholder = [] + placeholder.append(bov_token_id) + for frame_idx in frames_idx_token: + placeholder.append(boi_token_id) + placeholder.extend([hf_processor.video_token_id] * + num_tokens_per_frame) + placeholder.append(eoi_token_id) + placeholder.extend(frame_idx) + placeholder.append(eov_token_id) + + return placeholder + class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): @@ -1118,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): for item in mm_data.pop("videos", []): video_array, metadata = item - # FIXME(Isotr0py): Activate the below logic after we can disable - # resampling from video loader backend. 
- # assert metadata["total_num_frames"] == len(video_array), ( - # f"Total frames {metadata['total_num_frames']} does not " - # f"match the length of video array {len(video_array)}.") + if metadata["video_backend"] == "opencv_dynamic": + mm_kwargs["do_sample_frames"] = False - # NOTE: Temporary workaround for resampled videos. - # this can cause a divergence with HF implementation if - # the input video is resampled in advance. - - if metadata["total_num_frames"] != len(video_array): + elif metadata["total_num_frames"] != len(video_array): logger.warning( "Total frames in metadata " "(%s) does not match the length of " @@ -1140,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): len(video_array), ) metadata["total_num_frames"] = len(video_array) - metadata = VideoMetadata(**metadata) video_mm_data = dict() video_mm_data["videos"] = [[video_array]] - video_mm_data["video_metadata"] = [[metadata]] + video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]] video_outputs = super()._call_hf_processor( prompt="<|begin_of_video|><|video|><|end_of_video|>", @@ -1152,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): mm_kwargs=mm_kwargs, tok_kwargs=tok_kwargs, ) - input_ids = video_outputs.pop("input_ids") - input_ids[input_ids == processor.image_token_id] = ( - processor.video_token_id) - video_placeholder = processor.tokenizer.batch_decode( - input_ids)[0] + if "do_sample_frames" in mm_kwargs and not mm_kwargs[ + "do_sample_frames"]: + # Transformers v4.55 has incorrect timestamps issue for + # skip sampling. We construct the placeholder manually to + # get placeholders with correct timestamps. + placeholder = self.info._construct_video_placeholder( + video_array, + metadata, + video_outputs["video_grid_thw"].squeeze(0), + ) + video_placeholder = processor.tokenizer.decode(placeholder) + else: + input_ids = video_outputs.pop("input_ids") + input_ids[input_ids == processor.image_token_id] = ( + processor.video_token_id) + video_placeholder = processor.tokenizer.batch_decode( + input_ids)[0] prompt = prompt.replace( "<|begin_of_video|><|video|><|end_of_video|>", video_placeholder, @@ -1202,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( **hf_processor_mm_kwargs) - tokenizer = self.info.get_tokenizer() - hf_config = self.info.get_hf_config() - - boi_token_id = hf_config.image_start_token_id - eoi_token_id = hf_config.image_end_token_id - - bov_token_id = hf_config.video_start_token_id - eov_token_id = hf_config.video_end_token_id merge_length = image_processor.merge_size**2 @@ -1227,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] - timestamps = self.info._get_video_second_idx(metadata, len(video)) - frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) - for i in timestamps - ] - num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length - placeholder = [] - placeholder.append(bov_token_id) - for frame_idx in frames_idx_token: - placeholder.append(boi_token_id) - placeholder.extend([hf_processor.video_token_id] * - num_tokens_per_frame) - placeholder.append(eoi_token_id) - placeholder.extend(frame_idx) - placeholder.append(eov_token_id) + placeholder = self.info._construct_video_placeholder( + 
video, metadata, grid_thw) return PromptUpdateDetails.select_token_id( placeholder, embed_token_id=hf_processor.video_token_id, diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ef1380bdb614c..df6e19da82ca2 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import base64 +import math from abc import abstractmethod from functools import partial from io import BytesIO from pathlib import Path -from typing import Any +from typing import Any, Union import numpy as np import numpy.typing as npt @@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader): return api_pref @classmethod - def load_bytes(cls, - data: bytes, - num_frames: int = -1, - **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: import cv2 backend = cls().get_cv2_video_api() @@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 + # Use transformers transformers.video_utils.VideoMetadata format + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv" + } + + # resample video to target num_frames full_read = num_frames == -1 or total_frames_num < num_frames if full_read: num_frames = total_frames_num @@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader): assert i == num_frames, (f"Expected reading {num_frames} frames, " f"but only loaded {i} frames from video.") + return frames, metadata + + +@VIDEO_LOADER_REGISTRY.register("opencv_dynamic") +class OpenCVDynamicVideoBackend(OpenCVVideoBackend): + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + requested_fps: int = 2, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), backend, []) + if not cap.isOpened(): + raise ValueError("Could not open video stream") + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames_num / original_fps if original_fps > 0 else 0 + # Use transformers transformers.video_utils.VideoMetadata format metadata = { "total_num_frames": total_frames_num, "fps": original_fps, "duration": duration, - "video_backend": "opencv" + "video_backend": "opencv_dynamic" } + # resample video to target num_frames + max_frame_idx = total_frames_num - 1 + duration = duration or round(max_frame_idx / original_fps) + 1 + + # Refer to: + # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 + frame_indices: Union[range, list[int]] + if duration <= max_duration: + n = int(math.floor(duration * requested_fps)) + frame_indices = sorted({ + min(max_frame_idx, + int(math.ceil(i * original_fps / requested_fps))) + for i in range(n) + }) + else: + num_samples = int(max_duration * requested_fps) + if num_samples >= total_frames_num: + frame_indices = range(total_frames_num) + else: + target_seconds = np.linspace(0, + duration, + num_samples, + endpoint=True) + frame_indices = sorted({ + min(max_frame_idx, int(math.ceil(t * original_fps))) + for t in target_seconds + }) + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + 
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frame_indices), height, width, 3),
+                          dtype=np.uint8)
+
+        i = 0
+        for idx in range(total_frames_num):
+            ok = cap.grab()
+            if not ok:
+                break
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    i += 1
+
+        assert i == len(frame_indices), (
+            f"Expected reading {len(frame_indices)} frames, "
+            f"but only loaded {i} frames from video.")
+
+        return frames, metadata
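Before the next patch, a brief gloss on the sampling above: for clips shorter than `max_duration`, the dynamic loader keeps roughly `requested_fps` frames per second of video. A standalone sketch of that short-clip branch (the helper name is mine, not from the diff):

```python
import math

def pick_frame_indices(total_frames: int, original_fps: float,
                       requested_fps: int) -> list[int]:
    # Mirror of the short-video branch: one index per requested sample,
    # deduplicated because ceil() can map two samples onto the same frame.
    max_frame_idx = total_frames - 1
    duration = total_frames / original_fps
    n = int(math.floor(duration * requested_fps))
    return sorted({
        min(max_frame_idx, int(math.ceil(i * original_fps / requested_fps)))
        for i in range(n)
    })

# A 10 s clip at 30 fps sampled at 2 fps keeps 20 frames: 0, 15, 30, ...
assert pick_frame_indices(300, 30.0, 2)[:3] == [0, 15, 30]
```

From 1fdd5c42d7c59d6c73e969bc2c3820ead0528eaf Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Thu, 11 Sep 2025 18:45:31 +0200
Subject: [PATCH 182/584] [Kernels] Enable Torch Symmetric Memory All-Reduce By
 Default (#24111)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 .../kernels/benchmark_device_communicators.py | 486 ++++++++++++++++++
 csrc/custom_all_reduce.cuh                    |  53 +-
 .../device_communicators/all_reduce_utils.py  |   4 +-
 .../device_communicators/cuda_communicator.py |  13 +-
 .../device_communicators/custom_all_reduce.py |   5 +-
 .../device_communicators/symm_mem.py          |  37 +-
 vllm/envs.py                                  |   4 +-
 7 files changed, 572 insertions(+), 30 deletions(-)
 create mode 100644 benchmarks/kernels/benchmark_device_communicators.py

diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 0000000000000..a61c17edc1e28
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).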
+ +Usage: + torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options] + +Example: + torchrun --nproc_per_node=2 benchmark_device_communicators.py + --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100 +""" + +import json +import os +import time +from contextlib import nullcontext +from typing import Callable, Optional + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +# Default sequence lengths to benchmark +DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] + +# Fixed hidden size and dtype for all benchmarks +HIDDEN_SIZE = 8192 +BENCHMARK_DTYPE = torch.bfloat16 + +# CUDA graph settings +CUDA_GRAPH_CAPTURE_CYCLES = 10 + + +class CommunicatorBenchmark: + """Benchmark class for testing device communicators.""" + + def __init__( + self, + rank: int, + world_size: int, + device: torch.device, + cpu_group: ProcessGroup, + sequence_lengths: list[int], + ): + self.rank = rank + self.world_size = world_size + self.device = device + self.cpu_group = cpu_group + + # Calculate max_size_override based on largest sequence length + max_seq_len = max(sequence_lengths) + max_tensor_elements = max_seq_len * HIDDEN_SIZE + self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1 + + # Initialize communicators + self.custom_allreduce = None + self.pynccl_comm = None + self.symm_mem_comm = None + self.symm_mem_comm_multimem = None + self.symm_mem_comm_two_shot = None + + self._init_communicators() + + def _init_communicators(self): + """Initialize all available communicators.""" + try: + self.custom_allreduce = CustomAllreduce( + group=self.cpu_group, + device=self.device, + max_size=self.max_size_override, + ) + if not self.custom_allreduce.disabled: + logger.info("Rank %s: CustomAllreduce initialized", self.rank) + else: + logger.info("Rank %s: CustomAllreduce disabled", self.rank) + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e + ) + self.custom_allreduce = None + + try: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, device=self.device + ) + if not self.pynccl_comm.disabled: + logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) + else: + logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) + self.pynccl_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e + ) + self.pynccl_comm = None + + # Initialize variants for SymmMemCommunicator + try: + self.symm_mem_comm_multimem = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=True, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_multimem.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank + ) + else: + self.symm_mem_comm_multimem = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s", + self.rank, + e, + ) + self.symm_mem_comm_multimem = None + + try: + self.symm_mem_comm_two_shot = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + 
force_multimem=False, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_two_shot.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank + ) + else: + self.symm_mem_comm_two_shot = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s", + self.rank, + e, + ) + self.symm_mem_comm_two_shot = None + + def benchmark_allreduce( + self, sequence_length: int, num_warmup: int, num_trials: int + ) -> dict[str, float]: + """Benchmark allreduce operations for all available communicators.""" + + results = {} + + # Define communicators with their benchmark functions + communicators = [] + + if self.custom_allreduce is not None: + comm = self.custom_allreduce + # CustomAllreduce one-shot + communicators.append( + ( + "ca_1stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "1stage", # env variable value + ) + ) + # CustomAllreduce two-shot + communicators.append( + ( + "ca_2stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "2stage", # env variable value + ) + ) + + if self.pynccl_comm is not None: + comm = self.pynccl_comm + communicators.append( + ( + "pynccl", + lambda t, c=comm: c.all_reduce(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_multimem is not None: + comm = self.symm_mem_comm_multimem + communicators.append( + ( + "symm_mem_multimem", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_two_shot is not None: + comm = self.symm_mem_comm_two_shot + communicators.append( + ( + "symm_mem_two_shot", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + # Benchmark each communicator + for name, allreduce_fn, should_use_fn, context, env_var in communicators: + # Set environment variable if needed + if env_var is not None: + os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var + else: + # Clear the environment variable to avoid interference + os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) + + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + + return results + + def benchmark_allreduce_single( + self, + sequence_length: int, + allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + should_use_fn: Callable[[torch.Tensor], bool], + context, + num_warmup: int, + num_trials: int, + ) -> Optional[float]: + """Benchmark method with CUDA graph optimization.""" + try: + # Create test tensor (2D: sequence_length x hidden_size) + tensor = torch.randn( + sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device + ) + if not should_use_fn(tensor): + return None + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + graph_input = tensor.clone() + + # Warmup before capture + for _ in range(3): + allreduce_fn(graph_input) + + # Capture the graph using context manager + with context: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): + allreduce_fn(graph_input) + + torch.cuda.synchronize() + for _ in 
range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + torch.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(num_trials): + graph.replay() + torch.cuda.synchronize() + + end_time = time.perf_counter() + + # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES + return ( + (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000 + ) + + except Exception as e: + logger.error("CUDA graph benchmark failed: %s", e) + raise RuntimeError( + f"CUDA graph benchmark failed for communicator: {e}" + ) from e + + +def _calculate_speedup_info(comm_results: dict[str, float]) -> str: + """Calculate speedup information for a single tensor size.""" + if not comm_results: + return "N/A" + + # Find the fastest communicator + fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k]) + fastest_time = comm_results[fastest_comm] + + # Calculate speedup vs PyNccl if available + if "pynccl" in comm_results: + pynccl_time = comm_results["pynccl"] + speedup = pynccl_time / fastest_time + return f"{fastest_comm} ({speedup:.2f}x)" + else: + return f"{fastest_comm} (N/A)" + + +def print_results( + results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int +): + """Print benchmark results in a formatted table.""" + + print(f"\n{'=' * 130}") + print("Device Communicator Benchmark Results") + print( + f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, " + f"Hidden Size: {HIDDEN_SIZE}" + ) + print(f"{'=' * 130}") + + # Get all communicator names + all_comms = set() + for size_results in results.values(): + all_comms.update(size_results.keys()) + + all_comms = sorted(list(all_comms)) + + # Print header + header = f"{'Tensor Shape':<20}{'Tensor Size':<15}" + for comm in all_comms: + header += f"{comm:<20}" + header += f"{'Best (Speedup vs PyNccl)':<30}" + print(header) + print("-" * len(header)) + + # Print results for each sequence length + for seq_len in sequence_lengths: + if seq_len in results: + # Calculate tensor size in elements and bytes + tensor_elements = seq_len * HIDDEN_SIZE + tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize + + # Format tensor size (MB) + tensor_size_mb = tensor_bytes / (1024 * 1024) + tensor_size_str = f"{tensor_size_mb:.2f} MB" + + # Format tensor shape + tensor_shape = f"({seq_len}, {HIDDEN_SIZE})" + + row = f"{tensor_shape:<20}{tensor_size_str:<15}" + for comm in all_comms: + if comm in results[seq_len]: + row += f"{results[seq_len][comm]:<20.3f}" + else: + row += f"{'N/A':<20}" + + # Calculate speedup information + speedup_info = _calculate_speedup_info(results[seq_len]) + row += f"{speedup_info:<30}" + + print(row) + + print(f"{'=' * 130}") + print("All times are in milliseconds (ms) per allreduce operation") + print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)") + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark device communicators") + + parser.add_argument( + "--sequence-lengths", + type=int, + nargs="+", + default=DEFAULT_SEQUENCE_LENGTHS, + help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)", + ) + + parser.add_argument( + "--num-warmup", type=int, default=5, help="Number of warmup iterations" + ) + + parser.add_argument( + "--num-trials", type=int, default=50, help="Number of benchmark trials" + ) + + parser.add_argument("--output-json", type=str, help="Output results to JSON file") + + args = parser.parse_args() + + # Initialize distributed + if not dist.is_initialized(): + dist.init_process_group(backend="gloo") + 
rank = dist.get_rank() + world_size = dist.get_world_size() + + # Set device + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + + # Get CPU process group + cpu_group = dist.new_group(backend="gloo") + + # Disable USE_SYMM_MEM to avoid affecting the max_sizes + # in symm_mem and custom_all_reduce for benchmark + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + # Initialize benchmark + benchmark = CommunicatorBenchmark( + rank, world_size, device, cpu_group, args.sequence_lengths + ) + + # Run benchmarks + all_results = {} + + for seq_len in args.sequence_lengths: + if rank == 0: + logger.info( + "Benchmarking sequence length: %s (tensor shape: %s x %s)", + seq_len, + seq_len, + HIDDEN_SIZE, + ) + + results = benchmark.benchmark_allreduce( + sequence_length=seq_len, + num_warmup=args.num_warmup, + num_trials=args.num_trials, + ) + + all_results[seq_len] = results + + # Synchronize between ranks + dist.barrier() + + # Print results (only rank 0) + if rank == 0: + print_results(all_results, args.sequence_lengths, world_size) + + # Save to JSON if requested + if args.output_json: + # Add speedup information to results + enhanced_results = {} + for seq_len, comm_results in all_results.items(): + enhanced_results[seq_len] = { + "timings": comm_results, + "speedup_info": _calculate_speedup_info(comm_results), + } + + output_data = { + "world_size": world_size, + "dtype": str(BENCHMARK_DTYPE), + "hidden_size": HIDDEN_SIZE, + "sequence_lengths": args.sequence_lengths, + "num_warmup": args.num_warmup, + "num_trials": args.num_trials, + "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES, + "results": enhanced_results, + } + + with open(args.output_json, "w") as f: + json.dump(output_data, f, indent=2) + + logger.info("Results saved to %s", args.output_json) + + # Cleanup + if cpu_group != dist.group.WORLD: + dist.destroy_process_group(cpu_group) + + +if __name__ == "__main__": + main() diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 44709b4597765..58926f6429dd3 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16; #include <map> #include <unordered_map> #include <vector> +#include <cstdlib> +#include <cstring> namespace vllm { #define CUDACHECK(cmd) \ @@ -555,22 +557,47 @@ class CustomAllreduce { size /= d; auto bytes = size * sizeof(typename packed_t<T>::P); int blocks = std::min(block_limit, (size + threads - 1) / threads); + + // Check environment variable once + const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO"); + bool force_1stage = false; + bool force_2stage = false; + if (env_algo != nullptr) { + if (std::strcmp(env_algo, "1stage") == 0 || + std::strcmp(env_algo, "oneshot") == 0) { + force_1stage = true; + } else if (std::strcmp(env_algo, "2stage") == 0 || + std::strcmp(env_algo, "twoshot") == 0) { + force_2stage = true; + } else { + throw std::runtime_error( + "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) + + ". 
Valid values: 1stage, oneshot, 2stage, twoshot"); + } + } + #define KL(ngpus, name) \ name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \ rank_, size); -#define REDUCE_CASE(ngpus) \ - case ngpus: { \ - if (world_size_ == 2) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else if (fully_connected_) { \ - if ((world_size_ <= 4 && bytes < 512 * 1024) || \ - (world_size_ <= 8 && bytes < 256 * 1024)) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else { \ - KL(ngpus, cross_device_reduce_2stage); \ - } \ - } \ - break; \ +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (force_1stage) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (force_2stage) { \ + KL(ngpus, cross_device_reduce_2stage); \ + } else { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (fully_connected_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } \ + } \ + break; \ } switch (world_size_) { diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py index 5c64e7d5c4ba3..805a88854b77c 100644 --- a/vllm/distributed/device_communicators/all_reduce_utils.py +++ b/vllm/distributed/device_communicators/all_reduce_utils.py @@ -36,8 +36,8 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = { "10.0": { 2: 2 * MiB, # 2 MB 4: 2 * MiB, # 2 MB - 6: 2 * MiB, # 2 MB - 8: 2 * MiB, # 2 MB + 6: 1 * MiB, # 1 MB + 8: 1 * MiB, # 1 MB } } diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index eef3f9f75f9f1..78c90b006ffc8 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -57,11 +57,19 @@ class CudaCommunicator(DeviceCommunicatorBase): self.ca_comm: Optional[CustomAllreduce] = None self.qr_comm: Optional[QuickAllReduce] = None self.symm_mem_comm: Optional[SymmMemCommunicator] = None + if envs.VLLM_ALLREDUCE_USE_SYMM_MEM and current_platform.is_cuda(): + self.symm_mem_comm = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + ) + if use_custom_allreduce and self.world_size > 1: # Initialize a custom fast all-reduce implementation. self.ca_comm = CustomAllreduce( group=self.cpu_group, device=self.device, + symm_mem_enabled=(self.symm_mem_comm is not None + and not self.symm_mem_comm.disabled), ) if current_platform.is_rocm(): @@ -72,11 +80,6 @@ class CudaCommunicator(DeviceCommunicatorBase): # currently be an MI300 series. 
self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device) - if envs.VLLM_ALLREDUCE_USE_SYMM_MEM and current_platform.is_cuda(): - self.symm_mem_comm = SymmMemCommunicator( - group=self.cpu_group, - device=self.device, - ) if self.use_all2all: all2all_backend = envs.VLLM_ALL2ALL_BACKEND diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index c8cc35f99785c..3cc4bbb258244 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -54,7 +54,8 @@ class CustomAllreduce: def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device], - max_size=8192 * 1024) -> None: + max_size=8192 * 1024, + symm_mem_enabled=False) -> None: """ Args: group: the process group to work on. If None, it will use the @@ -111,7 +112,7 @@ class CustomAllreduce: self.device = device device_capability = current_platform.get_device_capability( ).as_version_str() - if (current_platform.is_cuda() and envs.VLLM_ALLREDUCE_USE_SYMM_MEM + if (current_platform.is_cuda() and symm_mem_enabled and device_capability in CUSTOM_ALL_REDUCE_MAX_SIZES): max_size = min( CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability][world_size], diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index d907e1b833d04..09012d16978d9 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -27,8 +27,13 @@ class SymmMemCommunicator: "10.0": [6, 8], } - def __init__(self, group: ProcessGroup, device: Union[int, str, - torch.device]): + def __init__( + self, + group: ProcessGroup, + device: Union[int, str, torch.device], + # add options for testing + force_multimem: Optional[bool] = None, + max_size_override: Optional[int] = None): self.disabled = True if not symm_mem_available: @@ -64,8 +69,17 @@ class SymmMemCommunicator: self.world_size, ) return - self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][ - self.world_size] + # Use override max_size if provided, otherwise use default + if max_size_override is not None: + self.max_size = max_size_override + logger.info( + "SymmMemCommunicator: Using override max_size: %s bytes", + self.max_size, + ) + else: + self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[ + self.device_capability][self.world_size] + self.buffer = torch_symm_mem.empty( self.max_size // self.dtype.itemsize, device=self.device, @@ -76,6 +90,7 @@ class SymmMemCommunicator: logger.warning("SymmMemCommunicator: symmetric memory " "multicast operations are not supported.") return + self.force_multimem = force_multimem self.disabled = False def should_use_symm_mem(self, inp: torch.Tensor): @@ -98,8 +113,18 @@ class SymmMemCommunicator: if out is None: out = torch.empty_like(inp) self.buffer[:inp.numel()].copy_(inp.view(-1)) - if self.world_size in self._WORLD_SIZES_MULTIMEM[ - self.device_capability]: + + # Determine which algorithm to use + use_multimem = False + if self.force_multimem is not None: + # Test override: use forced setting + use_multimem = self.force_multimem + else: + # Normal logic: use multimem for supported world sizes + use_multimem = self.world_size in self._WORLD_SIZES_MULTIMEM[ + self.device_capability] + + if use_multimem: torch.ops.symm_mem.multimem_all_reduce_(self.buffer[:inp.numel()], "sum", self.group.group_name) diff --git a/vllm/envs.py b/vllm/envs.py index 184d339089a21..d7956a2adff68 100755 
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -166,7 +166,7 @@ if TYPE_CHECKING:
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
-    VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
+    VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
     VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
     VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
     VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False
@@ -1203,7 +1203,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use pytorch symmetric memory for allreduce
     "VLLM_ALLREDUCE_USE_SYMM_MEM":
-    lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))),
+    lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))),
 
     # Allows vllm to find tuned config under customized folder
     "VLLM_TUNED_CONFIG_FOLDER":
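For readers who skipped the C++ above: the dispatch that `VLLM_CUSTOM_ALLREDUCE_ALGO` now controls boils down to the following decision table. This is a Python paraphrase of the logic in `csrc/custom_all_reduce.cuh`, not code from the patch:

```python
import os
from typing import Optional

def choose_allreduce_kernel(world_size: int, nbytes: int,
                            fully_connected: bool) -> Optional[str]:
    # Forced selection via the environment variable added by this patch.
    forced = os.getenv("VLLM_CUSTOM_ALLREDUCE_ALGO")
    if forced in ("1stage", "oneshot"):
        return "cross_device_reduce_1stage"
    if forced in ("2stage", "twoshot"):
        return "cross_device_reduce_2stage"
    if forced is not None:
        raise ValueError(f"Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: {forced}")
    # Default heuristic, unchanged from the original kernel.
    if world_size == 2:
        return "cross_device_reduce_1stage"
    if fully_connected:
        if ((world_size <= 4 and nbytes < 512 * 1024)
                or (world_size <= 8 and nbytes < 256 * 1024)):
            return "cross_device_reduce_1stage"
        return "cross_device_reduce_2stage"
    return None  # other topologies skip the custom all-reduce entirely
```

From 4aa23892d6dfdc3ec6de7ff74e982b833adf6c52 Mon Sep 17 00:00:00 2001
From: Konrad Zawora <kzawora@habana.ai>
Date: Thu, 11 Sep 2025 19:15:01 +0200
Subject: [PATCH 183/584] [Bugfix] Fix platform-specific routing in CustomOp
 implementations (#24444)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
---
 vllm/model_executor/layers/activation.py      |  5 +++-
 vllm/model_executor/layers/fused_moe/layer.py |  9 +++++++-
 .../rotary_embedding/deepseek_scaling_rope.py | 11 ++++++++-
 .../rotary_embedding/dual_chunk_rope.py       | 11 ++++++++-
 .../rotary_embedding/ernie45_vl_rope.py       | 10 +++++++-
 .../rotary_embedding/llama4_vision_rope.py    |  9 +++++++-
 .../layers/rotary_embedding/mrope.py          | 23 -------------------
 .../layers/vocab_parallel_embedding.py        |  5 +++-
 8 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 319fa938d400e..235df1a77c5ce 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -454,7 +454,7 @@ class XIELU(CustomOp):
         )
         return result.view(original_shape)
 
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, input: torch.Tensor) -> torch.Tensor:
         if self._xielu_cuda_obj is not None and input.is_cuda:
             if not torch._dynamo.is_compiling():
                 return self._xielu_cuda_fn(input)
@@ -464,6 +464,9 @@ class XIELU(CustomOp):
         )
         return self._xielu_python(input)
 
+    def forward_cuda(self, input: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(input)
+
 
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.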
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f80ba3a7aa23b..89676f98cb0ed 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1593,7 +1593,7 @@ class FusedMoE(CustomOp): else: return tensor_model_parallel_all_reduce(final_hidden_states) - def forward( + def forward_native( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -1627,6 +1627,13 @@ class FusedMoE(CustomOp): return (shared_output[..., :og_hidden_states], fused_output[..., :og_hidden_states]) + def forward_cuda( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + return self.forward_native(hidden_states, router_logits) + def forward_impl_chunked( self, full_hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index cd888b733426b..7ac2e4bb6c34f 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -88,7 +88,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): cache = torch.cat((cos, sin), dim=-1) return cache - def forward( + def forward_native( self, positions: torch.Tensor, query: torch.Tensor, @@ -129,3 +129,12 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): query = query_rot key = key_rot return query, key + + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(positions, query, key, offsets) diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py index 3d8da0fa9d8f5..27e41dd0fa97e 100644 --- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -111,7 +111,7 @@ class DualChunkRotaryEmbedding(CustomOp): device=self.device) return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache - def forward( + def forward_native( self, positions: torch.Tensor, query: torch.Tensor, @@ -161,6 +161,15 @@ class DualChunkRotaryEmbedding(CustomOp): dim=-1) return query, key + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.forward_native(positions, query, key, offsets) + def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass): cos, sin = cos_sin.chunk(2, dim=-1) if self.is_neox_style: diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py index 05322e56f2620..4960c20f4060a 100644 --- a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py @@ -12,7 +12,7 @@ from .mrope import MRotaryEmbedding class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): """3D rotary positional embedding. 
3D is t:time h:height w:width""" - def forward( + def forward_native( # type: ignore[override] self, positions: torch.Tensor, query: torch.Tensor, @@ -70,3 +70,11 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): self.is_neox_style) key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key + + def forward_cuda( # type: ignore[override] + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(positions, query, key) \ No newline at end of file diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index 415a85ab698bc..37ead43e22bc4 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -53,7 +53,7 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)) return cache - def forward( + def forward_native( # type: ignore[override] self, query: torch.Tensor, key: Optional[torch.Tensor] = None, @@ -72,3 +72,10 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): query_out = torch.view_as_real(query_ * freqs_ci).flatten(3) key_out = torch.view_as_real(key_ * freqs_ci).flatten(3) return query_out.type_as(query), key_out.type_as(key) + + def forward_cuda( # type: ignore[override] + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(query, key) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 0ab4bc5375daf..0acb5ea742455 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -8,7 +8,6 @@ import numpy as np import torch from transformers import PretrainedConfig -from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from .base import RotaryEmbedding @@ -202,28 +201,6 @@ class MRotaryEmbedding(RotaryEmbedding): if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 - self.use_triton = current_platform.is_cuda_alike() - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """MRope forward. - - Args: - positions: - [num_tokens,] (text only) or - [3, num_tokens] (T/H/W positions with multimodal inputs) - query: [num_tokens, num_heads * head_size] - key: [num_tokens, num_kv_heads * head_size] - """ - if self.use_triton: - return self.forward_cuda(positions, query, key) - else: - return self.forward_native(positions, query, key) - def forward_native( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 15e628177b3f6..c915ebac91c59 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -399,7 +399,7 @@ class VocabParallelEmbedding(CustomOp): param[:loaded_weight.shape[0]].data.copy_(loaded_weight) param[loaded_weight.shape[0]:].data.fill_(0) - def forward(self, input_): + def forward_native(self, input_): if self.tp_size > 1: # Build the mask. 
masked_input, input_mask = get_masked_input_and_mask(
@@ -420,6 +420,9 @@ class VocabParallelEmbedding(CustomOp):
             output = tensor_model_parallel_all_reduce(output_parallel)
         return output
 
+    def forward_cuda(self, input_):
+        return self.forward_native(input_)
+
     def extra_repr(self) -> str:
         s = f"num_embeddings={self.num_embeddings_per_partition}"
         s += f", embedding_dim={self.embedding_dim}"
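The pattern behind all of the renames in this patch: `CustomOp` picks a platform-specific forward at construction time, so an op that defines only `forward()` bypasses that routing. A simplified sketch of the convention the patch restores (the base-class details are assumed, not shown in the diff):

```python
import torch
from torch import nn

class CustomOpSketch(nn.Module):
    # Assumed simplification of vLLM's CustomOp: forward() is routed to a
    # platform-specific implementation chosen once at init time.
    def __init__(self, platform: str = "cuda"):
        super().__init__()
        self._forward_method = getattr(self, f"forward_{platform}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward_method(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # The convention used throughout the patch: CUDA reuses the native
        # path unless a dedicated kernel exists.
        return self.forward_native(x)
```

From c1eda615ba3aee7dd815adee74e058c2cc8c3ebb Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:47:51 +0100
Subject: [PATCH 184/584] Fix model name included in responses (#24663)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/entrypoints/openai/test_chat.py         | 55 +------------------
 tests/entrypoints/openai/test_serving_chat.py | 42 +++++++++++++-
 vllm/entrypoints/openai/serving_chat.py       |  2 +-
 .../openai/serving_classification.py          |  2 +-
 vllm/entrypoints/openai/serving_completion.py |  2 +-
 vllm/entrypoints/openai/serving_embedding.py  |  2 +-
 vllm/entrypoints/openai/serving_engine.py     | 11 ----
 vllm/entrypoints/openai/serving_pooling.py    |  2 +-
 vllm/entrypoints/openai/serving_responses.py  |  2 +-
 vllm/entrypoints/openai/serving_score.py      |  4 +-
 10 files changed, 50 insertions(+), 74 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index c9947c54a9181..4608850c7dae2 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -12,7 +12,7 @@ import pytest_asyncio
 import regex as re
 import requests
 import torch
-from openai import BadRequestError, OpenAI
+from openai import BadRequestError
 
 from ...utils import RemoteOpenAIServer
 
@@ -968,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
             or "less_than_equal" in exc_info.value.message)
 
 
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
-    url = f"http://localhost:{server.port}/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-    }
-    data = {
-        # model_name is avoided here.
-        "messages": [{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }, {
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
-        "max_tokens":
-        5
-    }
-
-    response = requests.post(url, headers=headers, json=data)
-    response_data = response.json()
-    print(response_data)
-    assert response_data.get("model") == MODEL_NAME
-    choice = response_data.get("choices")[0]
-    message = choice.get("message")
-    assert message is not None
-    content = message.get("content")
-    assert content is not None
-    assert len(content) > 0
-
-
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer):
-    openai_api_key = "EMPTY"
-    openai_api_base = f"http://localhost:{server.port}/v1"
-
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    messages = [
-        {
-            "role": "user",
-            "content": "Hello, vLLM!"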
- }, - ] - response = client.chat.completions.create( - model="", # empty string - messages=messages, - ) - assert response.model == MODEL_NAME - - @pytest.mark.asyncio async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 39a18b9daeca1..d219a1f311f15 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -213,8 +213,12 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI, MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" CHAT_TEMPLATE = "Dummy chat template for testing {}" -BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT) +] @dataclass @@ -270,6 +274,42 @@ def test_async_serving_chat_init(): assert serving_completion.chat_template == CHAT_TEMPLATE +@pytest.mark.asyncio +async def test_serving_chat_returns_correct_model_name(): + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=MockModelConfig()) + serving_chat = OpenAIServingChat(mock_engine, + MockModelConfig(), + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + messages = [{"role": "user", "content": "what is 1+1?"}] + + async def return_model_name(*args): + return args[3] + + serving_chat.chat_completion_full_generator = return_model_name + + # Test that full name is returned when short name is requested + req = ChatCompletionRequest(model=MODEL_NAME_SHORT, messages=messages) + assert await serving_chat.create_chat_completion(req) == MODEL_NAME + + # Test that full name is returned when empty string is specified + req = ChatCompletionRequest(model="", messages=messages) + assert await serving_chat.create_chat_completion(req) == MODEL_NAME + + # Test that full name is returned when no model is specified + req = ChatCompletionRequest(messages=messages) + assert await serving_chat.create_chat_completion(req) == MODEL_NAME + + @pytest.mark.asyncio async def test_serving_chat_should_set_correct_max_tokens(): mock_engine = MagicMock(spec=MQLLMEngineClient) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5c7adc53f49b2..579f6f537ee2d 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -186,7 +186,7 @@ class OpenAIServingChat(OpenAIServing): lora_request = self._maybe_get_adapters( request, supports_default_mm_loras=True) - model_name = self._get_model_name(request.model, lora_request) + model_name = self.models.model_name(lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index ef730442691cf..7e88424c169ce 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -146,7 +146,7 @@ class ServingClassification(ClassificationMixin): request: ClassificationRequest, raw_request: Request, ) -> Union[ClassificationResponse, ErrorResponse]: - model_name = 
self._get_model_name(request.model) + model_name = self.models.model_name() request_id = (f"{self.request_id_prefix}-" f"{self._base_request_id(raw_request)}") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 10b1c54fe6a98..c2de449a96994 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -232,7 +232,7 @@ class OpenAIServingCompletion(OpenAIServing): result_generator = merge_async_iterators(*generators) - model_name = self._get_model_name(request.model, lora_request) + model_name = self.models.model_name(lora_request) num_prompts = len(engine_prompts) # Similar to the OpenAI API, when n != best_of, we do not stream the diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 476c21ad6a302..c0d1fe4b6e168 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -599,7 +599,7 @@ class OpenAIServingEmbedding(EmbeddingMixin): See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. """ - model_name = self._get_model_name(request.model) + model_name = self.models.model_name() request_id = ( f"{self.request_id_prefix}-" f"{self._base_request_id(raw_request, request.request_id)}") diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 708b1a5facf49..d391cc50ad232 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -980,17 +980,6 @@ class OpenAIServing: return True return self.models.is_base_model(model_name) - def _get_model_name( - self, - model_name: Optional[str] = None, - lora_request: Optional[LoRARequest] = None, - ) -> str: - if lora_request: - return lora_request.lora_name - if not model_name: - return self.models.base_model_paths[0].name - return model_name - def clamp_prompt_logprobs( prompt_logprobs: Union[PromptLogprobs, diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index d70e1a808aba6..cac1d1ba56839 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -91,7 +91,7 @@ class OpenAIServingPooling(OpenAIServing): if error_check_ret is not None: return error_check_ret - model_name = self._get_model_name(request.model) + model_name = self.models.model_name() request_id = f"pool-{self._base_request_id(raw_request)}" created_time = int(time.time()) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index c5177bdf5375d..401ba6c53331c 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -237,7 +237,7 @@ class OpenAIServingResponses(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - model_name = self._get_model_name(request.model, lora_request) + model_name = self.models.model_name(lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) if self.use_harmony: diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 847c014a11dc3..24767ed66fc6a 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -353,7 +353,7 @@ class ServingScores(OpenAIServing): final_res_batch, request_id, created_time, - self._get_model_name(request.model), + 
self.models.model_name(),
             )
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
@@ -399,7 +399,7 @@ class ServingScores(OpenAIServing):
         return self.request_output_to_rerank_response(
             final_res_batch,
             request_id,
-            self._get_model_name(request.model),
+            self.models.model_name(),
             documents,
             top_n,
         )
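Taken together with the removed `_get_model_name` helper shown earlier in this patch, the new `models.model_name()` call sites enforce one rule: the name reported in a response comes from the server, never from the request body. A sketch of that rule, with a stand-in `LoRARequestStub` (the real types live in vLLM's entrypoints code):

```python
from typing import Optional

class LoRARequestStub:
    def __init__(self, lora_name: str):
        self.lora_name = lora_name

def reported_model_name(base_model_name: str,
                        lora_request: Optional[LoRARequestStub] = None) -> str:
    # A LoRA adapter reports its own name; everything else reports the
    # served base model, regardless of what the client sent ("" included).
    if lora_request is not None:
        return lora_request.lora_name
    return base_model_name

assert reported_model_name("openai-community/gpt2") == "openai-community/gpt2"
assert reported_model_name("openai-community/gpt2",
                           LoRARequestStub("my-adapter")) == "my-adapter"
```

From e26fef83977d0361522dd294c8e9659e57b2eb5f Mon Sep 17 00:00:00 2001
From: co63oc <co63oc@users.noreply.github.com>
Date: Fri, 12 Sep 2025 01:48:46 +0800
Subject: [PATCH 185/584] fix some typos (#24616)

Signed-off-by: co63oc <co63oc@users.noreply.github.com>
---
 tests/compile/test_basic_correctness.py      | 2 +-
 tests/kernels/mamba/test_mamba_ssm_ssd.py    | 2 +-
 vllm/model_executor/layers/fla/ops/cumsum.py | 2 +-
 vllm/v1/attention/backends/mla/common.py     | 2 +-
 vllm/v1/worker/block_table.py                | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index f6783704342f6..fd2b1866e62e1 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -23,7 +23,7 @@ class TestSetting:
     fullgraph: bool
 
 
-# we cannot afford testing the full Catesian product
+# we cannot afford testing the full Cartesian product
 # of all models and all levels
 @pytest.mark.parametrize(
     "test_setting",
diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py
index 1ce7f9d85e876..fc60d5ac82b27 100644
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -345,7 +345,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
     # in the mamba2 ssd kernels, by comparing concatenation (in the sequence
     # dimension) of chunked results with the full sequence result.
     # It is different from test_mamba_chunk_scan_cont_batch by:
-    # 1. Not using the naive torch implementaion (ssd_minimal_discrete) to get
+    # 1. Not using the naive torch implementation (ssd_minimal_discrete) to get
     #    reference outputs. Instead, it compares chunked kernel outputs to full
     #    sequence kernel outputs. This is the most straightforward way to
     #    assert chunked prefill correctness.
diff --git a/vllm/model_executor/layers/fla/ops/cumsum.py b/vllm/model_executor/layers/fla/ops/cumsum.py
index 59152e2c845ad..370a45fe16358 100644
--- a/vllm/model_executor/layers/fla/ops/cumsum.py
+++ b/vllm/model_executor/layers/fla/ops/cumsum.py
@@ -179,7 +179,7 @@ def chunk_local_cumsum_vector(
     def grid(meta):
         return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
 
-    # keep cummulative normalizer in fp32
+    # keep cumulative normalizer in fp32
     # this kernel is equivalent to
     # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
     chunk_local_cumsum_vector_kernel[grid](g_org,
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 5308a1113a1ad..036a281f1d26e 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -1322,7 +1322,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         k_scale: torch.Tensor,
         dcp_world_size: int,
     ):
-        assert k_scale is None, "DCP not support sacled kvcache now."
+        assert k_scale is None, "DCP not support scaled kvcache now."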
assert attn_metadata.prefill is not None prefill_metadata = attn_metadata.prefill assert prefill_metadata.chunked_context is not None diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 1901de6d2e5b3..194984bf50536 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -112,9 +112,9 @@ class BlockTable: # tokens. virtual_block_offsets = positions % virtual_block_size mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank - # Calcuate local block_offsets + # Calculate local block_offsets block_offsets = virtual_block_offsets // self.dcp_world_size - # Calcuate slot_mapping + # Calculate slot_mapping slot_mapping = block_numbers * self.block_size + block_offsets # Write final slots, use -1 for not-local self.slot_mapping_np[:req_indices.shape[0]] = np.where( From 361ae27f8a336b82efb932458e3e79e06027d4ce Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Sep 2025 19:18:06 +0100 Subject: [PATCH 186/584] [Docs] Fix formatting of transcription doc (#24676) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/model/transcription.md | 310 ++++++++++++----------- 1 file changed, 157 insertions(+), 153 deletions(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index cf25ad5bbbce3..62e58e5c6ac58 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -3,149 +3,151 @@ This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription]. Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance. -## 1. Update the base vLLM model +## Update the base vLLM model It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods. -- Declare supported languages and capabilities: +### `supported_languages` and `supports_transcription_only` - ??? code +Declare supported languages and capabilities: - ```python - from typing import ClassVar, Mapping, Optional, Literal - import numpy as np - import torch - from torch import nn +- The `supported_languages` mapping is validated at init time. +- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). + +??? code "supported_languages and supports_transcription_only" + ```python + from typing import ClassVar, Mapping, Optional, Literal + import numpy as np + import torch + from torch import nn + + from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.inputs.data import PromptType + from vllm.model_executor.models.interfaces import SupportsTranscription + + class YourASRModel(nn.Module, SupportsTranscription): + # Map of ISO 639-1 language codes to language names + supported_languages: ClassVar[Mapping[str, str]] = { + "en": "English", + "it": "Italian", + # ... 
add more as needed + } - from vllm.config import ModelConfig, SpeechToTextConfig - from vllm.inputs.data import PromptType - from vllm.model_executor.models.interfaces import SupportsTranscription - - class YourASRModel(nn.Module, SupportsTranscription): - # Map of ISO 639-1 language codes to language names - supported_languages: ClassVar[Mapping[str, str]] = { - "en": "English", - "it": "Italian", - # ... add more as needed + # If your model only supports audio-conditioned generation + # (no text-only generation), enable this flag. + supports_transcription_only: ClassVar[bool] = True + ``` + +Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. + +This is for controlling general behavior of the API when serving your model: + +??? code "get_speech_to_text_config()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_speech_to_text_config( + cls, + model_config: ModelConfig, + task_type: Literal["transcribe", "translate"], + ) -> SpeechToTextConfig: + return SpeechToTextConfig( + sample_rate=16_000, + max_audio_clip_s=30, + # Set to None to disable server-side chunking if your + # model/processor handles it already + min_energy_split_window_size=None, + ) + ``` + +See [Audio preprocessing and chunking](#audio-preprocessing-and-chunking) for what each field controls. + +Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns: + +#### Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) + +Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: + +??? code "get_generation_prompt()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + # Example with a free-form instruction prompt + task_word = "Transcribe" if task_type == "transcribe" else "Translate" + prompt = ( + "<start_of_turn>user\n" + f"{task_word} this audio: <audio_soft_token>" + "<end_of_turn>\n<start_of_turn>model\n" + ) + + return { + "multi_modal_data": {"audio": (audio, stt_config.sample_rate)}, + "prompt": prompt, } - - # If your model only supports audio-conditioned generation - # (no text-only generation), enable this flag. - supports_transcription_only: ClassVar[bool] = True - ``` + ``` - - The `supported_languages` mapping is validated at init time. - - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). + For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md). -- Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. - This is for controlling general behavior of the API when serving your model: +#### Encoder–decoder audio-only (e.g., Whisper) - ??? 
code +Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: - ```python - class YourASRModel(nn.Module, SupportsTranscription): - ... +??? code "get_generation_prompt()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... - @classmethod - def get_speech_to_text_config( - cls, - model_config: ModelConfig, - task_type: Literal["transcribe", "translate"], - ) -> SpeechToTextConfig: - return SpeechToTextConfig( - sample_rate=16_000, - max_audio_clip_s=30, - # Set to None to disable server-side chunking if your - # model/processor handles it already - min_energy_split_window_size=None, - ) - ``` + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + if language is None: + raise ValueError("Language must be specified") - See the “Audio preprocessing and chunking” section for what each field controls. - -- Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns: - -### A. Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) - - Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: - - ??? code - - ```python - class YourASRModel(nn.Module, SupportsTranscription): - ... - - @classmethod - def get_generation_prompt( - cls, - audio: np.ndarray, - stt_config: SpeechToTextConfig, - model_config: ModelConfig, - language: Optional[str], - task_type: Literal["transcribe", "translate"], - request_prompt: str, - to_language: Optional[str], - ) -> PromptType: - # Example with a free-form instruction prompt - task_word = "Transcribe" if task_type == "transcribe" else "Translate" - prompt = ( - "<start_of_turn>user\n" - f"{task_word} this audio: <audio_soft_token>" - "<end_of_turn>\n<start_of_turn>model\n" - ) - - return { - "multi_modal_data": {"audio": (audio, stt_config.sample_rate)}, - "prompt": prompt, - } - ``` - - For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md). - -### B. Encoder–decoder audio-only (e.g., Whisper) - - Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: - - ??? code - - ```python - class YourASRModel(nn.Module, SupportsTranscription): - ... 
- - @classmethod - def get_generation_prompt( - cls, - audio: np.ndarray, - stt_config: SpeechToTextConfig, - model_config: ModelConfig, - language: Optional[str], - task_type: Literal["transcribe", "translate"], - request_prompt: str, - to_language: Optional[str], - ) -> PromptType: - if language is None: - raise ValueError("Language must be specified") - - prompt = { - "encoder_prompt": { - "prompt": "", - "multi_modal_data": { - "audio": (audio, stt_config.sample_rate), - }, + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (audio, stt_config.sample_rate), }, - "decoder_prompt": ( - (f"<|prev|>{request_prompt}" if request_prompt else "") - + f"<|startoftranscript|><|{language}|>" - + f"<|{task_type}|><|notimestamps|>" - ), - } - return cast(PromptType, prompt) - ``` + }, + "decoder_prompt": ( + (f"<|prev|>{request_prompt}" if request_prompt else "") + + f"<|startoftranscript|><|{language}|>" + + f"<|{task_type}|><|notimestamps|>" + ), + } + return cast(PromptType, prompt) + ``` -- (Optional) Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language] +### `validate_language` (optional) - If your model requires a language and you want a default, override this method (see Whisper): +Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language] +If your model requires a language and you want a default, override this method (see Whisper): + +??? code "validate_language()" ```python @classmethod def validate_language(cls, language: Optional[str]) -> Optional[str]: @@ -156,27 +158,29 @@ It is assumed you have already implemented your model in vLLM according to the b return super().validate_language(language) ``` -- (Optional) Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens] +### `get_num_audio_tokens` (optional) - Provide a fast duration→token estimate to improve streaming usage statistics: +Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens] - ??? code - ```python - class YourASRModel(nn.Module, SupportsTranscription): - ... +Provide a fast duration→token estimate to improve streaming usage statistics: - @classmethod - def get_num_audio_tokens( - cls, - audio_duration_s: float, - stt_config: SpeechToTextConfig, - model_config: ModelConfig, - ) -> Optional[int]: - # Return None if unknown; otherwise return an estimate. - return int(audio_duration_s * stt_config.sample_rate // 320) # example - ``` +??? code "get_num_audio_tokens()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... -## 2. Audio preprocessing and chunking + @classmethod + def get_num_audio_tokens( + cls, + audio_duration_s: float, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + ) -> Optional[int]: + # Return None if unknown; otherwise return an estimate. + return int(audio_duration_s * stt_config.sample_rate // 320) # example + ``` + +## Audio preprocessing and chunking The API server takes care of basic audio I/O and optional chunking before building prompts: @@ -185,7 +189,8 @@ The API server takes care of basic audio I/O and optional chunking before buildi - Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words. Relevant server logic: -??? code + +??? 
code "_preprocess_speech_to_text()" ```python # vllm/entrypoints/openai/speech_to_text.py async def _preprocess_speech_to_text(...): @@ -211,9 +216,9 @@ Relevant server logic: return prompts, duration ``` -## 3. Exposing tasks automatically +## Exposing tasks automatically -- vLLM automatically advertises transcription support if your model implements the interface: +vLLM automatically advertises transcription support if your model implements the interface: ```python if supports_transcription(model): @@ -222,7 +227,7 @@ if supports_transcription(model): supported_tasks.append("transcription") ``` -- When enabled, the server initializes the transcription and translation handlers: +When enabled, the server initializes the transcription and translation handlers: ```python state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None @@ -231,13 +236,13 @@ state.openai_serving_translation = OpenAIServingTranslation(...) if "transcripti No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`. -## 4. Examples in-tree +## Examples in-tree - Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py> - Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py> - Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py> -## 5. Test with the API +## Test with the API Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI): @@ -266,7 +271,6 @@ Once your model implements `SupportsTranscription`, you can test the endpoints ( Or check out more examples in <gh-file:examples/online_serving>. !!! note - -- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. -- Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass. -- For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. + - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. + - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass. + - For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. 
From bb2b5126dac8cbad4c27c379144bbaaa572ccd6c Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Fri, 12 Sep 2025 02:30:41 +0800 Subject: [PATCH 187/584] [VLM] Migrate remain DP-supported ViT models to use `disable_tp` (#24363) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- .../models/idefics2_vision_model.py | 85 +++++-------------- vllm/model_executor/models/mllama4.py | 32 +++---- vllm/model_executor/models/qwen2_5_vl.py | 43 +++++----- 3 files changed, 54 insertions(+), 106 deletions(-) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index ea5d6f29f6cfd..76737a4428232 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -31,7 +31,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -139,37 +138,22 @@ class Idefics2VisionAttention(nn.Module): assert self.num_heads % tp_size == 0 self.num_heads_per_partition = self.num_heads // tp_size - if use_data_parallel: - self.q_size = self.num_heads * self.head_dim - self.qkv_proj = ReplicatedLinear( - self.embed_dim, - 3 * self.q_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.out_proj = ReplicatedLinear( - self.embed_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.out_proj", - ) - else: - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.num_heads, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.out_proj = RowParallelLinear( - self.embed_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.out_proj", - ) + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + disable_tp=use_data_parallel, + ) + self.out_proj = RowParallelLinear( + self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + disable_tp=use_data_parallel, + ) # Use unified MultiHeadAttention with Flash Attention support self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -201,23 +185,21 @@ class Idefics2VisionMLP(nn.Module): super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - self.fc1 = cls_fc1( + self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, bias=True, quant_config=quant_config, prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel, ) - cls_fc2 = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.fc2 = cls_fc2( + self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=True, quant_config=quant_config, prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -389,30 +371,6 @@ class Idefics2VisionTransformer(nn.Module): last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def _consolidate_qkv_weights( - self, weights: 
Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[tuple[str, torch.Tensor]]: - qkv_idx_mappings = { - ".self_attn.q_proj": 0, - ".self_attn.k_proj": 1, - ".self_attn.v_proj": 2, - } - qkv_weights = {} - for name, loaded_weight in weights: - for weight_name, idx in qkv_idx_mappings.items(): - if weight_name not in name: - continue - new_name = name.replace(weight_name, ".self_attn.qkv_proj") - if new_name not in qkv_weights: - qkv_weights[new_name] = [None] * 3 - qkv_weights[new_name][idx] = loaded_weight - break - else: - yield name, loaded_weight - for key, weight in qkv_weights.items(): - qkv_weight = torch.cat(weight, dim=0) - yield key, qkv_weight - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ @@ -425,9 +383,6 @@ class Idefics2VisionTransformer(nn.Module): loaded_params: set[str] = set() layer_count = len(self.encoder.layers) - if self.use_data_parallel: - weights = self._consolidate_qkv_weights(weights) - for name, loaded_weight in weights: # skip pooling header if name.startswith("head."): diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index ecbbb5f57bec8..6dc77666e1582 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -106,22 +106,21 @@ class Llama4VisionMLP(nn.Module): use_data_parallel: bool = False, ): super().__init__() - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - self.fc1 = cls_fc1( + self.fc1 = ColumnParallelLinear( input_size=input_size, output_size=intermediate_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel, ) - cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear - self.fc2 = cls_fc2( + self.fc2 = RowParallelLinear( input_size=intermediate_size, output_size=output_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel, ) self.activation_fn = nn.GELU() self.output_activation = output_activation @@ -419,20 +418,15 @@ class Llama4UnfoldConvolution(nn.Module): kernel_size = (kernel_size, kernel_size) self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size) - params = { - "input_size": - config.num_channels * kernel_size[0] * kernel_size[1], - "output_size": config.hidden_size, - "bias": False, - "quant_config": quant_config, - "prefix": f"{prefix}.linear", - } - if use_data_parallel: - cls = ReplicatedLinear - else: - cls = ColumnParallelLinear - params["gather_output"] = True - self.linear = cls(**params) + self.linear = ColumnParallelLinear( + input_size=config.num_channels * kernel_size[0] * kernel_size[1], + output_size=config.hidden_size, + bias=False, + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.linear", + disable_tp=use_data_parallel, + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.unfold(hidden_states) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 020142b2c1c1a..8aa7775570297 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) # yapf: enable from vllm.model_executor.layers.quantization import QuantizationConfig @@ -510,32 +509,32 @@ class 
Qwen2_5_VisionPatchMerger(nn.Module): norm_layer = partial(nn.LayerNorm, eps=1e-6) self.ln_q = norm_layer(context_dim) - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - cls_fc2 = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.mlp = nn.ModuleList([ - cls_fc1(self.hidden_size, - self.hidden_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.mlp.0"), + self.mlp = nn.Sequential( + ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.0", + return_bias=False, + disable_tp=use_data_parallel, + ), nn.GELU(), - cls_fc2(self.hidden_size, - d_model, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.mlp.2"), - ]) + RowParallelLinear( + self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.2", + return_bias=False, + disable_tp=use_data_parallel, + ), + ) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.ln_q(x) x = x.view(-1, self.hidden_size) - - mlp_fc1, mlp_act, mlp_fc2 = self.mlp - x_parallel, _ = mlp_fc1(x) - x_parallel = mlp_act(x_parallel) - out, _ = mlp_fc2(x_parallel) + out = self.mlp(x) return out From 127ded0a9e3dce17fdcff6a105900439c076b9ba Mon Sep 17 00:00:00 2001 From: Peter Salas <peter@fixie.ai> Date: Thu, 11 Sep 2025 11:52:24 -0700 Subject: [PATCH 188/584] [Ultravox] Use wrapped_model_config to instantiate inner model (#24679) Signed-off-by: Peter Salas <peter@fixie.ai> --- vllm/model_executor/models/ultravox.py | 4 ++-- vllm/transformers_utils/configs/ultravox.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9885309035e6c..9e28b0c443df4 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -418,7 +418,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config: UltravoxConfig = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multi_modal_config = multimodal_config @@ -438,7 +438,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, - hf_config=config.text_config, + hf_config=config.wrapped_model_config, prefix=maybe_prefix(prefix, "language_model"), ) if config.text_model_id is not None: diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 71266b9327369..e67479516560f 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -46,7 +46,7 @@ class UltravoxConfig(transformers.PretrainedConfig): projector or at the end. Versions v0.4.1 and below use `False`, but v0.5 and above use `True`. """ - + wrapped_model_config: transformers.PretrainedConfig model_type = "ultravox" audio_token = "<|audio|>" is_composition = False @@ -113,9 +113,8 @@ class UltravoxConfig(transformers.PretrainedConfig): return super().__setattr__(key, value) @property - def text_config(self) -> Optional[transformers.PretrainedConfig]: + def text_config(self) -> transformers.PretrainedConfig: # When Ultravox wraps a multi-modal model (e.g. 
Gemma), we instantiate # the full model, but the text config is the text config of the inner # model. - return (self.wrapped_model_config.get_text_config() - if self.wrapped_model_config else None) + return self.wrapped_model_config.get_text_config() From a892b259b4503526511d9ec17d5f8961e46969a6 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:25:47 -0400 Subject: [PATCH 189/584] [Doc] Remove Useless Comments (#24687) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/model_executor/layers/quantization/fp8.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 65e0b70621532..d2616da84a00a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -757,10 +757,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) - # DeepGemm scales need to be transposed and aligned. We try to do + # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): - # Lazy import to avoid CUDA initialization problems. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous() From c733bd5e873d556a650de0234064db48a23e46b0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Thu, 11 Sep 2025 12:40:15 -0700 Subject: [PATCH 190/584] [Qwen3-Next] Add MoE Config for H200 (#24688) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- .../E=512,N=64,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..d5b6d02123d71 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 
4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} From b971f91504b60f15909e1d6cc35e6468db4666e5 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Thu, 11 Sep 2025 12:44:04 -0700 Subject: [PATCH 191/584] [BugFix] Fix tokenize asyncio task leak (#24677) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/entrypoints/renderer.py | 58 +++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 68d33ada40975..f0798afbcf212 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -168,8 +168,8 @@ class BaseRenderer(ABC): if isinstance(prompt_embeds, list): return [_load_and_validate_embed(embed) for embed in prompt_embeds] - else: - return [_load_and_validate_embed(prompt_embeds)] + + return [_load_and_validate_embed(prompt_embeds)] class CompletionRenderer(BaseRenderer): @@ -182,7 +182,7 @@ class CompletionRenderer(BaseRenderer): AsyncMicrobatchTokenizer]] = None, ): super().__init__(model_config, tokenizer) - self.async_tokenizer_pool = async_tokenizer_pool or {} + self.async_tokenizer_pool = async_tokenizer_pool self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None async def render_prompt( @@ -208,23 +208,21 @@ class CompletionRenderer(BaseRenderer): for prompt_input in batch_inputs: if prompt_input["is_tokens"] is True: # Token input - detokenize_task = asyncio.create_task( - # Note: detokenization is needed when echo is enabled, - # where the input token IDs are decoded back to text. - self._maybe_detokenize(prompt_input["content"], - config.max_length, - truncate_prompt_tokens, - config.cache_salt, - config.needs_detokenization)) - tasks.append(detokenize_task) + # Note: detokenization is needed when echo is enabled, + # where the input token IDs are decoded back to text. 
+ task = self._maybe_detokenize(prompt_input["content"], + config.max_length, + truncate_prompt_tokens, + config.cache_salt, + config.needs_detokenization) else: # Text input - tokenize_task = asyncio.create_task( - self._tokenize(prompt_input["content"], config.max_length, - truncate_prompt_tokens, - config.add_special_tokens, - config.cache_salt)) - tasks.append(tokenize_task) + task = self._tokenize(prompt_input["content"], + config.max_length, + truncate_prompt_tokens, + config.add_special_tokens, + config.cache_salt) + tasks.append(task) # Wait for all text tokenization to finish if tasks: @@ -356,20 +354,24 @@ class CompletionRenderer(BaseRenderer): def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer: """Get or create async tokenizer using shared pool.""" - if self.async_tokenizer is not None: - return self.async_tokenizer + async_tokenizer = self.async_tokenizer + if async_tokenizer is not None: + return async_tokenizer + + tokenizer = self.tokenizer if self.tokenizer is None: raise ValueError( "No tokenizer available for text input processing") - # Check shared pool first - if self.tokenizer in self.async_tokenizer_pool: - return self.async_tokenizer_pool[self.tokenizer] - - # Create new async tokenizer and add to pool - self.async_tokenizer = AsyncMicrobatchTokenizer(self.tokenizer) - self.async_tokenizer_pool[self.tokenizer] = self.async_tokenizer - return self.async_tokenizer + if self.async_tokenizer_pool is None: + async_tokenizer = AsyncMicrobatchTokenizer(tokenizer) + else: + async_tokenizer = self.async_tokenizer_pool.get(tokenizer) + if async_tokenizer is None: + async_tokenizer = AsyncMicrobatchTokenizer(tokenizer) + self.async_tokenizer_pool[tokenizer] = async_tokenizer + self.async_tokenizer = async_tokenizer + return async_tokenizer def _create_tokens_prompt( self, From 79ac59f32e5e04b1da606ce4bdad14cadb0075d6 Mon Sep 17 00:00:00 2001 From: Andrew Xia <mitandrewxia@gmail.com> Date: Thu, 11 Sep 2025 12:58:43 -0700 Subject: [PATCH 192/584] Update Spec Decode metrics to include drafted and accepted token throughput (#24127) Signed-off-by: Andrew Xia <axia@meta.com> --- vllm/v1/spec_decode/metrics.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index b4bc3058c570a..2aa8962f5739c 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time from dataclasses import dataclass, field from typing import Optional @@ -58,6 +59,7 @@ class SpecDecodingLogging: self.num_draft_tokens: list[int] = [] self.num_accepted_tokens: list[int] = [] self.accepted_tokens_per_pos_lists: list[list[int]] = [] + self.last_log_time = time.monotonic() def observe(self, spec_decoding_stats: SpecDecodingStats): self.num_drafts.append(spec_decoding_stats.num_drafts) @@ -73,6 +75,13 @@ class SpecDecodingLogging: num_drafts = np.sum(self.num_drafts) num_draft_tokens = np.sum(self.num_draft_tokens) num_accepted_tokens = np.sum(self.num_accepted_tokens) + draft_throughput = 0 + accepted_throughput = 0 + + elapsed_time = time.monotonic() - self.last_log_time + if elapsed_time > 0: + draft_throughput = num_draft_tokens / elapsed_time + accepted_throughput = num_accepted_tokens / elapsed_time draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens * 100 if num_draft_tokens > 0 else float("nan")) @@ -86,16 +95,20 @@ class 
SpecDecodingLogging: log_fn( "SpecDecoding metrics: " - "Draft acceptance rate: %.1f%%, " "Mean acceptance length: %.2f, " + "Accepted throughput: %.2f tokens/s, " + "Drafted throughput: %.2f tokens/s, " "Accepted: %d tokens, " "Drafted: %d tokens, " - "Per-position acceptance rate: %s", - draft_acceptance_rate, + "Per-position acceptance rate: %s, " + "Avg Draft acceptance rate: %.1f%%", mean_acceptance_length, + accepted_throughput, + draft_throughput, num_accepted_tokens, num_draft_tokens, rates_str, + draft_acceptance_rate, ) self.reset() From 074854b24f6e0b1e237a004283e1f46d98c0d73c Mon Sep 17 00:00:00 2001 From: Duncan Moss <djm.moss@gmail.com> Date: Thu, 11 Sep 2025 14:04:56 -0700 Subject: [PATCH 193/584] [Kernel][B200] `mxfp4` fused cutlass moe (#23696) Signed-off-by: Duncan Moss <djm.moss@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- tests/kernels/moe/test_mxfp4_moe.py | 319 +++++++++++++++++ vllm/envs.py | 13 +- vllm/model_executor/layers/fused_moe/layer.py | 13 +- .../layers/quantization/mxfp4.py | 333 +++++++++++++++--- vllm/model_executor/warmup/kernel_warmup.py | 4 +- 5 files changed, 622 insertions(+), 60 deletions(-) diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index 882b034e2f230..9fd72ee152b55 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -11,6 +11,7 @@ import torch from packaging import version from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( "quark") is not None and version.parse( @@ -19,6 +20,10 @@ QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda( ) and current_platform.is_device_capability(100) +HOPPER_MXFP4_BF16_AVAILABLE = (current_platform.is_cuda() + and current_platform.is_device_capability(90) + and has_flashinfer()) + if TRTLLM_GEN_MXFP4_AVAILABLE: from flashinfer import (fp4_quantize, mxfp8_quantize, next_positive_power_of_2, @@ -542,3 +547,317 @@ def test_trtllm_gen_mxfp4_fused_moe( transpose_optimized=transpose_optimized) # relatively loose check since the mxfp4 quantization is less accurate check_accuracy(ref_result, tg_result, atol=0, rtol=0.3, percent=0.8) + + +def _interleave_scales_lastdim_by4(scales: torch.Tensor) -> torch.Tensor: + """Interleave scales on the last dimension by groups of 4, matching + the transformation in mxfp4.py's BF16 (Hopper) path.""" + s = scales.to(torch.uint8) + s_shape = s.shape + assert s_shape[-1] % 4 == 0 + s = s.reshape(*s_shape[:-1], s_shape[-1] // 4, 4) + # Move the 4-group dimension before the row dimension + permuted = s.permute(0, 2, 1, 3) + # Merge the row dim with the 4-group dim + return permuted.reshape(s_shape[0], s_shape[-1] // 4, s_shape[1] * 4) + + +@pytest.mark.parametrize("topk", [1, 4]) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("num_tokens", [1, 128]) +@pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)]) +@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), + (1.702, 1.0, 7.0)]) +@pytest.mark.skipif( + not HOPPER_MXFP4_BF16_AVAILABLE, + reason="nvidia gpu sm90 and flashinfer are required for this test", +) +def test_flashinfer_cutlass_mxfp4_fused_moe( + topk: int, + num_experts: int, + num_tokens: int, + 
intermediate_size: int, + hidden_size: int, + alpha: float, + beta: float, + limit: Optional[float], +): + torch.manual_seed(42) + device = "cuda:0" + + # Inputs + hidden_states = torch.randn(num_tokens, + hidden_size, + device=device, + dtype=torch.bfloat16) + # Random MXFP4 weights and scales (uint8), contiguous [w1; w3] + w13_q = torch.randint( + 0, + 256, (num_experts, 2 * intermediate_size, hidden_size // 2), + device=device, + dtype=torch.uint8) + w13_scale = torch.randint( + 118, + 123, (num_experts, 2 * intermediate_size, hidden_size // 32), + device=device, + dtype=torch.uint8) + + w2_q = torch.randint(0, + 256, + (num_experts, hidden_size, intermediate_size // 2), + device=device, + dtype=torch.uint8) + w2_scale = torch.randint( + 118, + 123, (num_experts, hidden_size, intermediate_size // 32), + device=device, + dtype=torch.uint8) + # Bias contiguous [b1; b3] + bias13 = (torch.randn(num_experts, + 2 * intermediate_size, + device=device, + dtype=torch.bfloat16) * 10) + bias2 = (torch.randn( + num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10) + router_logits = torch.rand(num_tokens, + num_experts, + dtype=torch.float32, + device=device) + + w13_ref = mxfp4_dequantize(w13_q.clone(), w13_scale.clone()).reshape( + num_experts, 2 * intermediate_size, hidden_size) + w2_ref = mxfp4_dequantize(w2_q.clone(), w2_scale.clone()).reshape( + num_experts, hidden_size, intermediate_size) + ref = reference_moe(router_logits.to(torch.float32), topk, num_experts, + hidden_states.to(torch.float32), w13_ref, + bias13.to(torch.float32), w2_ref, + bias2.to(torch.float32), alpha, beta, limit, 'bf16') + + from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe + + # Swap halves to arrange as [w3; w1] (kernel expectation) + w1_w, w3_w = torch.chunk(w13_q, 2, dim=1) + w13_q_swapped = torch.cat([w3_w, w1_w], dim=1) + + b1, b3 = torch.chunk(bias13.to(torch.float32), 2, dim=-1) + w13_b = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) + + w1_s, w3_s = torch.chunk(w13_scale, 2, dim=1) + w13_s = torch.cat([w3_s, w1_s], dim=1) + w13_s_inter = _interleave_scales_lastdim_by4(w13_s) + w2_s_inter = _interleave_scales_lastdim_by4(w2_scale) + + routing_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float32) + token_final_scales, token_selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + token_final_scales = (token_final_scales / + token_final_scales.sum(dim=-1, keepdim=True)) + token_selected_experts = token_selected_experts.to(torch.int).contiguous() + + out = torch.empty_like(hidden_states, dtype=torch.bfloat16) + if alpha is not None: + alpha = torch.full((num_experts, ), alpha, device=hidden_states.device) + if beta is not None: + beta = torch.full((num_experts, ), beta, device=hidden_states.device) + if limit is not None: + limit = torch.full((num_experts, ), limit, device=hidden_states.device) + + _ = flashinfer_cutlass_fused_moe( + input=hidden_states, + token_selected_experts=token_selected_experts, + token_final_scales=token_final_scales, + fc1_expert_weights=w13_q_swapped, + fc2_expert_weights=w2_q, + output_dtype=torch.bfloat16, + output=out, + quant_scales=[w13_s_inter.to(torch.uint8), + w2_s_inter.to(torch.uint8)], + fc1_expert_biases=w13_b, + fc2_expert_biases=bias2.to(torch.bfloat16), + swiglu_alpha=alpha, + swiglu_beta=beta, + swiglu_limit=limit, + tp_size=1, + tp_rank=0, + ep_size=1, + ep_rank=0, + use_w4_group_scaling=True, + ) + + # Allow some mismatch due to MXFP4 quantization + check_accuracy(ref, out, atol=0, rtol=0.3, 
percent=0.8) + + +@pytest.mark.parametrize("topk", [1, 4]) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("num_tokens", [1, 128]) +@pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)]) +@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None), + (1.702, 1.0, 7.0)]) +@pytest.mark.skipif( + not (current_platform.is_cuda() + and current_platform.is_device_capability(100) and has_flashinfer()), + reason="NVIDIA GPU sm100 and flashinfer are required for this test", +) +def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe( + topk: int, + num_experts: int, + num_tokens: int, + intermediate_size: int, + hidden_size: int, + alpha: Optional[float], + beta: Optional[float], + limit: Optional[float], +): + torch.manual_seed(42) + device = "cuda:0" + + # Inputs + hidden_states = torch.randn(num_tokens, + hidden_size, + device=device, + dtype=torch.bfloat16) + # Float weights in w13 format [w1; w3] + w13 = (torch.randn(num_experts, + 2 * intermediate_size, + hidden_size, + device=device, + dtype=torch.bfloat16) / 10) + w2 = (torch.randn(num_experts, + hidden_size, + intermediate_size, + device=device, + dtype=torch.bfloat16) / 10) + # Bias contiguous [b1; b3] + bias13 = (torch.randn(num_experts, + 2 * intermediate_size, + device=device, + dtype=torch.bfloat16) * 10) + bias2 = (torch.randn( + num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10) + router_logits = torch.rand(num_tokens, + num_experts, + dtype=torch.float32, + device=device) + + # Quantize weights to MXFP4 per expert (SM100 path) + from flashinfer import mxfp4_quantize + + def quant_mxfp4_batches(a: torch.Tensor, e: int): + qs, sfs = [], [] + for i in range(e): + q, sf = mxfp4_quantize(a[i].cuda()) + qs.append(q) + sfs.append(sf) + return torch.stack(qs), torch.stack(sfs) + + def dequant_mxfp4_batches(mat_fp4: torch.Tensor, + scale_tensor: torch.Tensor): + num_batches = mat_fp4.size(0) + scale_tensor = scale_tensor.view(num_batches, -1) + from flashinfer import mxfp4_dequantize + return torch.stack([ + mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :]) + for b in range(num_batches) + ]) + + w13_q, w13_scale = quant_mxfp4_batches(w13, num_experts) + w2_q, w2_scale = quant_mxfp4_batches(w2, num_experts) + + # Reference result using dequantized tensors and reference_moe + w13_ref = dequant_mxfp4_batches( + w13_q.view(torch.uint8), + w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( + num_experts, 2 * intermediate_size, hidden_size) + w2_ref = dequant_mxfp4_batches( + w2_q.view(torch.uint8), + w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( + num_experts, hidden_size, intermediate_size) + + # Quantize activations for SM100 path and dequantize for reference + hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32) + # Reference uses BF16 input but quantizes intermediate activation to MXFP8 + ref = reference_moe(router_logits.to(torch.float32), topk, num_experts, + hidden_states.to(torch.float32), w13_ref, + bias13.to(torch.float32), w2_ref, + bias2.to(torch.float32), alpha, beta, limit, 'mxfp8') + + # Prepare inputs for FlashInfer CUTLASS fused MoE + from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe + + # Swap halves to arrange as [w3; w1] (kernel expectation) + w1_w, w3_w = torch.chunk(w13_q, 2, dim=1) + w13_q_swapped = torch.cat([w3_w, w1_w], dim=1) + + # Swap scales halves to match swapped weights + s1, s3 = torch.chunk(w13_scale, 2, dim=1) + w13_scale_swapped = torch.cat([s3, s1], dim=1) + + b1, b3 = 
torch.chunk(bias13.to(torch.float32), 2, dim=-1) + w13_b = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) + + # Build routing for kernel + routing_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float32) + token_final_scales, token_selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + token_final_scales = (token_final_scales / + token_final_scales.sum(dim=-1, keepdim=True)) + token_selected_experts = token_selected_experts.to(torch.int).contiguous() + + out = torch.empty_like(hidden_states, dtype=torch.bfloat16) + if alpha is not None: + alpha_t = torch.full((num_experts, ), + alpha, + device=hidden_states.device) + else: + alpha_t = None + if beta is not None: + beta_t = torch.full((num_experts, ), beta, device=hidden_states.device) + else: + beta_t = None + if limit is not None: + limit_t = torch.full((num_experts, ), + limit, + device=hidden_states.device) + else: + limit_t = None + + # Quant scales for SM100 MXFP8+MXFP4 path + fake_input_scale = torch.ones(num_experts, device=device) + quant_scales = [ + w13_scale_swapped.view(torch.int32), + fake_input_scale, + w2_scale.view(torch.int32), + fake_input_scale, + ] + + _ = flashinfer_cutlass_fused_moe( + input=hidden_states_q, + token_selected_experts=token_selected_experts, + token_final_scales=token_final_scales, + fc1_expert_weights=w13_q_swapped.contiguous().view(torch.long), + fc2_expert_weights=w2_q.contiguous().view(torch.long), + output_dtype=torch.bfloat16, + output=out, + quant_scales=quant_scales, + fc1_expert_biases=w13_b, + fc2_expert_biases=bias2.to(torch.bfloat16), + swiglu_alpha=alpha_t, + swiglu_beta=beta_t, + swiglu_limit=limit_t, + tp_size=1, + tp_rank=0, + ep_size=1, + ep_rank=0, + use_mxfp8_act_scaling=True, + input_sf=hidden_states_sf, + ) + + # Allow some mismatch due to MXFP4 quantization + check_accuracy(ref, out, atol=0, rtol=0.3, percent=0.8) diff --git a/vllm/envs.py b/vllm/envs.py index d7956a2adff68..165cd32721fe5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -166,7 +166,8 @@ if TYPE_CHECKING: VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False - VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False + VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False @@ -1004,6 +1005,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), + # If set to 1, use the FlashInfer CUTLASS backend for + # MXFP8 (activation) x MXFP4 (weight) MoE. + # This is separate from the TRTLLMGEN path controlled by + # VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8. + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": + lambda: bool(int( + os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0") + )), + # If set to 1, use the FlashInfer # BF16 (activation) x MXFP4 (weight) MoE backend. 
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": @@ -1296,6 +1306,7 @@ def compute_hash() -> str: "VLLM_USE_FLASHINFER_MOE_FP8", "VLLM_USE_FLASHINFER_MOE_FP4", "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "VLLM_USE_CUDNN_PREFILL", "VLLM_USE_TRTLLM_ATTENTION", diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 89676f98cb0ed..a90a71159f721 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -813,9 +813,16 @@ class FusedMoE(CustomOp): # we are padding globally so EP buffer allocation works if quant_config and quant_config.get_name() == "mxfp4": - from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501 - should_use_flashinfer_mxfp4) - if current_platform.is_rocm() or should_use_flashinfer_mxfp4(): + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Backend, get_mxfp4_backend) + current_mxfp4_backend = get_mxfp4_backend() + if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS): + hidden_size = round_up(hidden_size, 128) + elif (current_platform.is_rocm() or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or + current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): hidden_size = round_up(hidden_size, 256) # For smuggling this layer into the fused moe custom op diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 74922756afe56..f935bdd84124a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum from typing import Callable, Optional, Union import torch @@ -33,33 +34,72 @@ from vllm.utils.flashinfer import has_flashinfer logger = init_logger(__name__) -def _should_use_flashinfer_mxfp4_bf16(): - """Determine if FlashInfer MXFP4 BF16 should be used.""" - # If explicitly set, respect the setting - if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"): - return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 +# enum for mxfp4 backend +class Mxfp4Backend(Enum): + NONE = 0 - # Enable by default on SM100 if MXFP8 is not explicitly enabled - if (current_platform.is_device_capability(100) and has_flashinfer() - and not envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")): - logger.info_once( - "Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell. 
" - "For faster performance, consider setting " - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, " - "though this may impact accuracy.") - return True + # FlashInfer Backend + SM100_FI_MXFP4_MXFP8_TRTLLM = 1 + SM100_FI_MXFP4_MXFP8_CUTLASS = 2 + SM100_FI_MXFP4_BF16 = 3 + SM90_FI_MXFP4_BF16 = 4 - return False + # Marlin Backend + MARLIN = 5 + + # Triton Backend + TRITON = 6 -def _should_use_flashinfer_mxfp4_mxfp8(): - """Determine if FlashInfer MXFP4 MXFP8 should be used.""" - return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 +def get_mxfp4_backend(): + # Backend Selection + if current_platform.is_cuda(): + if (current_platform.is_device_capability(90) and has_flashinfer() + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90") + return Mxfp4Backend.SM90_FI_MXFP4_BF16 + elif (current_platform.is_device_capability(100) and has_flashinfer() + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS): + logger.info_once( + "Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100") + return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + elif (current_platform.is_device_capability(100) and has_flashinfer() + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8): + logger.info_once( + "Using FlashInfer MXFP4 MXFP8 TRTLLM backend for SM100, " + "for high concurrency throughput workloads consider setting " + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS=1 for better " + "performance") + return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + elif current_platform.is_device_capability(100) and has_flashinfer(): + logger.info_once( + "Using FlashInfer MXFP4 BF16 backend for SM100, " + "For faster performance on SM100, consider setting " + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact " + "accuracy.") + return Mxfp4Backend.SM100_FI_MXFP4_BF16 + elif ((current_platform.is_device_capability(100) + or current_platform.is_device_capability(90)) + and not has_flashinfer()): + logger.warning_once( + "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer " + "is not available. This may result in degraded performance. " + "Please `pip install vllm[flashinfer]` for best results.") + # If FlashInfer is not available, try either Marlin or Triton + if current_platform.get_device_capability( + )[0] < 9 or not has_triton_kernels() or not is_torch_equal_or_newer( + "2.8.0"): + logger.info_once("Using Marlin backend") + return Mxfp4Backend.MARLIN + else: + logger.info_once("Using Triton backend") + return Mxfp4Backend.TRITON + elif current_platform.is_rocm() and has_triton_kernels(): + logger.info_once("Using Triton backend") + return Mxfp4Backend.TRITON -def should_use_flashinfer_mxfp4(): - return (_should_use_flashinfer_mxfp4_mxfp8() - or _should_use_flashinfer_mxfp4_bf16()) + return Mxfp4Backend.NONE class Mxfp4Config(QuantizationConfig): @@ -113,31 +153,15 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): super().__init__(moe) self.topk_indices_dtype = None self.moe = moe - self.use_marlin = self._should_use_marlin() + self.mxfp4_backend = get_mxfp4_backend() self.max_capture_size = get_current_vllm_config( ).compilation_config.max_capture_size - if current_platform.is_device_capability(100) and not has_flashinfer(): - logger.warning_once( - "MXFP4 MoE is enabled on Blackwell but FlashInfer " - "is not available. This may result in degraded performance. " - "Please `pip install vllm[flashinfer]` for best results.") + assert self.mxfp4_backend != Mxfp4Backend.NONE, ( + "No MXFP4 MoE backend (FlashInfer/Marlin/Triton) available." 
+ "Please check your environment and try again.") self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {} - def _should_use_marlin(self): - if envs.VLLM_MXFP4_USE_MARLIN is not None: - return envs.VLLM_MXFP4_USE_MARLIN - if current_platform.is_cuda() and \ - not current_platform.is_device_capability(100): - if not current_platform.has_device_capability(90): - # marlin kernel has better performance on ampere - return True - if not has_triton_kernels(): - return True - if not is_torch_equal_or_newer("2.8.0"): - return True - return False - def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -157,7 +181,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): intermediate_size_per_partition_after_pad = \ intermediate_size_per_partition - if self.use_marlin: + if self.mxfp4_backend == Mxfp4Backend.MARLIN: # The moe marlin kernel requires that for each linear # n % 256 == 0 and k % 128 == 0. # In gate_up_proj: @@ -175,16 +199,20 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.hidden_size = hidden_size layer.intermediate_size_per_partition = \ intermediate_size_per_partition_after_pad - elif should_use_flashinfer_mxfp4(): + elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): # pad the intermediate size to be a multiple of 2 * mxfp4_block # for to hold non-uniform sharded tensor as well as swizzling # other padding to increase performance intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 256) hidden_size = round_up(hidden_size, 256) - elif current_platform.is_rocm(): + elif current_platform.is_rocm() or ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16): intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 128) + hidden_size = round_up(hidden_size, 128) else: intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 64) @@ -264,9 +292,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): set_weight_attrs(w2_bias, extra_weight_attrs) def process_weights_after_loading(self, layer): - if self.use_marlin: + if self.mxfp4_backend == Mxfp4Backend.MARLIN: prepare_moe_fp4_layer_for_marlin(layer) - elif should_use_flashinfer_mxfp4(): + elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): from flashinfer.fp4_quantization import ( nvfp4_block_scale_interleave) from flashinfer.fused_moe.core import ( @@ -429,7 +458,116 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( self.num_experts, -1), requires_grad=False) - else: + elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16): + layer.gemm1_alpha = Parameter(torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_beta = Parameter(torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_clamp_limit = Parameter(torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + + sf_block_size = 32 # mxfp4 block size + + # Common shape assertions + assert (layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and 
layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2) + assert (layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] + == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] + == self.hidden_size // sf_block_size) + assert (layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size and + layer.w2_weight.shape[2] == self.intermediate_size // 2) + assert (layer.w2_weight_scale.dim() == 3 + and layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size) + assert (layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2) + assert (layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size) + + # De-interleave and swap for w13 weight, bias, and scales + w13_w = layer.w13_weight.data + gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :] + deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1) + w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1) + w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1) + + w13_b = layer.w13_bias.data.to(torch.float32) + gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2] + deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1) + b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1) + w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) + + w13_s = layer.w13_weight_scale.data + gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :] + deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1) + s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1) + w13_scale_swapped = torch.cat([s3, s1], dim=1) + + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: + from flashinfer import block_scale_interleave + + orig_shape = w13_scale_swapped.shape + w13_scale_interleaved = block_scale_interleave( + w13_scale_swapped.view(torch.uint8)).reshape(orig_shape) + + w2_s = layer.w2_weight_scale.data + orig_shape = w2_s.shape + w2_scale_interleaved = block_scale_interleave( + w2_s.view(torch.uint8)).reshape(orig_shape) + + layer.w13_weight = Parameter(w13_weight_swapped, + requires_grad=False) + layer.w13_weight_scale = Parameter(w13_scale_interleaved, + requires_grad=False) + layer.w13_bias = Parameter(w13_bias_swapped, + requires_grad=False) + layer.w2_weight_scale = Parameter(w2_scale_interleaved, + requires_grad=False) + elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: + + def _interleave_mxfp4_cutlass_sm90(w): + w_shape = w.shape + w_interleaved = w.reshape(w_shape[0], w_shape[1], + (w_shape[2] // 4), 4) + w_interleaved = w_interleaved.permute(0, 2, 1, 3) + w_interleaved = w_interleaved.reshape( + w_shape[0], w_shape[2] // 4, w_shape[1] * 4) + return w_interleaved + + w31_scales = w13_scale_swapped.to(torch.uint8).view( + torch.uint8) + w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90( + w31_scales) + + w2_weight_scale = layer.w2_weight_scale.data + w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8) + w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90( + w2_scales) + + layer.w13_weight = torch.nn.Parameter(torch.cat([w3_w, w1_w], + dim=1), + requires_grad=False) + layer.w13_bias = torch.nn.Parameter(w13_bias_swapped, + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + 
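The de-interleave-and-swap above is easiest to see on a toy tensor: the checkpoint stores the gate and up projections row-interleaved inside w13, while this kernel path wants the two blocks contiguous and in [w3, w1] (up-first) order. A runnable miniature of the same transformation:

    import torch

    E, rows, cols = 1, 8, 4                      # 8 rows = 4 gate + 4 up, interleaved
    w13 = torch.arange(E * rows * cols).reshape(E, rows, cols)

    gate, up = w13[:, ::2, :], w13[:, 1::2, :]   # split interleaved rows
    w1, w3 = gate, up                            # w1 = gate proj, w3 = up proj
    swapped = torch.cat([w3, w1], dim=1)         # up block first, then gate block
    assert swapped.shape == (E, rows, cols)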
w31_scales_interleaved, requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales_interleaved, requires_grad=False) + elif self.mxfp4_backend == Mxfp4Backend.TRITON: from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig w13_bias = layer.w13_bias.to(torch.float32) @@ -464,6 +602,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w13_weight = None layer.w2_weight = None torch.cuda.empty_cache() + else: + raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int): # Number of tokens in the input tensor. @@ -500,7 +640,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError( "Mxfp4 does not support batched experts format for EP") else: - if should_use_flashinfer_mxfp4(): + if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): # B200 code-path kwargs = { "gemm1_alpha": layer.gemm1_alpha, @@ -601,7 +742,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") - if self.use_marlin: + if self.mxfp4_backend == Mxfp4Backend.MARLIN: topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -665,16 +806,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): logical_replica_count), ( "MXFP4 are not supported with this configuration.") - if should_use_flashinfer_mxfp4(): - from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe - if _should_use_flashinfer_mxfp4_bf16(): + if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): + from flashinfer import trtllm_fp4_block_scale_moe + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16: assert x.dtype == torch.bfloat16 x_quant = x x_scale = None - else: + elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM: + from flashinfer import mxfp8_quantize x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8 x_scale = x_scale.view(torch.float8_e4m3fn).reshape( *x.shape[:-1], -1) + trtllm_gen_output = trtllm_fp4_block_scale_moe( router_logits.to(torch.bfloat16), None, # routing_bias @@ -706,7 +850,86 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): tune_max_num_tokens=self.max_capture_size, )[0] return trtllm_gen_output - else: + elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16): + from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + # Backend-specific preparation + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: + + from flashinfer import mxfp8_quantize + + x_quant, x_scale = mxfp8_quantize(x, True, 32) + + fake_input_scale = torch.ones(self.num_experts, + device=x.device) + quant_scales = [ + layer.w13_weight_scale.contiguous().view(torch.int32), + fake_input_scale, + layer.w2_weight_scale.contiguous().view(torch.int32), + fake_input_scale, + ] + + fi_input = x_quant + extra_kwargs = dict( + use_mxfp8_act_scaling=True, + input_sf=x_scale, + fc1_expert_weights=layer.w13_weight.contiguous().view( + 
torch.long), + fc2_expert_weights=layer.w2_weight.contiguous().view( + torch.long), + ) + elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: + assert x.dtype == torch.bfloat16 + + quant_scales = [ + layer.w13_weight_scale, + layer.w2_weight_scale, + ] + + fi_input = x + extra_kwargs = dict( + use_w4_group_scaling=True, + fc1_expert_weights=layer.w13_weight, + fc2_expert_weights=layer.w2_weight, + ) + + output = torch.empty_like(x, dtype=torch.bfloat16) + _ = flashinfer_cutlass_fused_moe( + input=fi_input, + token_selected_experts=topk_ids.to(torch.int).contiguous(), + token_final_scales=topk_weights, + output_dtype=torch.bfloat16, + output=output, + quant_scales=quant_scales, + fc1_expert_biases=layer.w13_bias, + fc2_expert_biases=layer.w2_bias, + swiglu_alpha=layer.gemm1_alpha, + swiglu_beta=layer.gemm1_beta, + swiglu_limit=layer.gemm1_clamp_limit, + tp_size=self.moe.tp_size, + tp_rank=self.moe.tp_rank, + ep_size=self.moe.ep_size, + ep_rank=self.moe.ep_rank, + tune_max_num_tokens=self.max_capture_size, + **extra_kwargs, + ) + + return output + elif self.mxfp4_backend == Mxfp4Backend.TRITON: from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 triton_kernel_moe_forward) return triton_kernel_moe_forward( @@ -724,3 +947,5 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): w2_precision=self.w2_precision_config, apply_router_weight_on_input=apply_router_weight_on_input, ) + else: + raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index e42e34ebc77b9..89ce20308f447 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -33,8 +33,8 @@ def kernel_warmup(worker: "Worker"): max_tokens = worker.scheduler_config.max_num_batched_tokens deep_gemm_warmup(model, max_tokens) - # FlashInfer kernel autotune for Blackwell (SM 10.0) GPUs - if has_flashinfer() and current_platform.is_device_capability(100): + # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs + if has_flashinfer() and current_platform.has_device_capability(90): flashinfer_autotune(worker.model_runner) # FlashInfer attention warmup From e42af78b18c539c0e62bf00f5b336c824808c35c Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng <mxz297@gmail.com> Date: Thu, 11 Sep 2025 14:20:09 -0700 Subject: [PATCH 194/584] [flashinfer] [kernel] support for fp8 kv cache for trtllm prefill attention (#24197) Signed-off-by: Xiaozhu <mxz297@gmail.com> --- vllm/envs.py | 6 ++ vllm/utils/flashinfer.py | 11 ++- vllm/v1/attention/backends/flashinfer.py | 118 +++++++++++++++++++++-- 3 files changed, 121 insertions(+), 14 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 165cd32721fe5..c5688a72e11d5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -163,6 +163,7 @@ if TYPE_CHECKING: VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None + VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False @@ -1155,6 +1156,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_TRTLLM_ATTENTION": lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + # If set to 1, when we use fp8 kv, we do not quantize Q to fp8 + "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": + lambda: 
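This flag follows the repository's usual boolean env-var convention: the raw string is parsed with bool(int(...)), so only "0" or "1" are meaningful values. A minimal sketch of the pattern, including the functools.cache wrapper this same commit adds in vllm/utils/flashinfer.py so the environment is consulted once per process:

    import functools
    import os

    @functools.cache
    def disable_q_quantization() -> bool:
        # "0" -> False, "1" -> True; a non-integer value raises ValueError.
        return bool(int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0")))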
bool(int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))), + # If set, it means we pre-downloaded cubin files and flashinfer will # read the cubin files directly. "VLLM_HAS_FLASHINFER_CUBIN": @@ -1310,6 +1315,7 @@ def compute_hash() -> str: "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "VLLM_USE_CUDNN_PREFILL", "VLLM_USE_TRTLLM_ATTENTION", + "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "VLLM_ROCM_USE_AITER", "VLLM_ROCM_USE_AITER_PAGED_ATTN", "VLLM_ROCM_USE_AITER_LINEAR", diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index fab134733d4fd..83ec65c9b4594 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -200,11 +200,6 @@ def use_trtllm_attention( logger.info_once("Using TRTLLM attention (query is quantized).") return True - # TRTLLM prefill attention does not support FP8 kv cache with - # non-quantized query - if is_prefill and kv_cache_dtype.startswith("fp8"): - return False - # If sinks are being used, we must use TRTLLM attention as it's # the only backend that supports them if has_sinks: @@ -353,6 +348,12 @@ def flashinfer_scaled_fp8_mm( return output +@functools.cache +def flashinfer_disable_q_quantization() -> bool: + """Cache result which only depends on the environment""" + return envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 9e05cc8ab2f18..98a4cf38bc195 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -25,7 +25,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, is_pin_memory_available -from vllm.utils.flashinfer import (supports_trtllm_attention, +from vllm.utils.flashinfer import (flashinfer_disable_q_quantization, + supports_trtllm_attention, use_trtllm_attention) from vllm.v1.attention.backends.flash_attn import use_cascade_attention # yapf conflicts with isort for this block @@ -48,8 +49,89 @@ FP4_DTYPE = torch.uint8 logger = init_logger(__name__) -class FlashInferBackend(AttentionBackend): +@triton.jit +def _trtllm_prefill_attn_kvfp8_dequant( + kv_cache_ptr, + block_tables_prefill_ptr, + block_table_stride, + mock_kv_cache_ptr, + k_scale_ptr, + v_scale_ptr, + K_CACHE_STRIDE: tl.constexpr, + KV_CACHE_STRIDE: tl.constexpr, +): + batch_idx = tl.program_id(0).to(tl.int64) + mock_block_table_idx = tl.program_id(1).to(tl.int64) + orig_page_num = tl.load(block_tables_prefill_ptr + + batch_idx * block_table_stride + + mock_block_table_idx).to(tl.int64) + if orig_page_num <= 0: + return + dequant_dtype = mock_kv_cache_ptr.dtype.element_ty + # Dequantize K + k_scale_val = tl.load(k_scale_ptr) + offset = orig_page_num * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE) + fp8_vals = tl.load(kv_cache_ptr + offset) + dequantized_vals = fp8_vals.to(tl.float32) * k_scale_val + mock_cache_offset = (batch_idx * block_table_stride + mock_block_table_idx + + 1) * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE) + dequantized_vals = dequantized_vals.to(dequant_dtype) + tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals) + + # Dequantize V + v_scale_val = tl.load(v_scale_ptr) + offset = (orig_page_num * KV_CACHE_STRIDE + K_CACHE_STRIDE + + tl.arange(0, K_CACHE_STRIDE)) + fp8_vals = tl.load(kv_cache_ptr + offset) + dequantized_vals = fp8_vals.to(tl.float32) * v_scale_val + mock_cache_offset 
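# Layout sketch for the offsets computed in this kernel (comments only):
# each page holds KV_CACHE_STRIDE elements laid out as [K | V], with
# K_CACHE_STRIDE elements per half. Page (b, s) of the mock cache starts at
#     (b * block_table_stride + s + 1) * KV_CACHE_STRIDE
# where the "+ 1" matches the mock block table built below, which numbers
# pages from 1 and apparently leaves page 0 unused so that non-positive
# (invalid) block-table entries never alias real data. The V half, computed
# next for mock_cache_offset, simply starts K_CACHE_STRIDE further into the
# page.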
= ( + (batch_idx * block_table_stride + mock_block_table_idx + 1) * + KV_CACHE_STRIDE + K_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)) + dequantized_vals = dequantized_vals.to(dequant_dtype) + tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals) + + +def trtllm_prefill_attn_kvfp8_dequant( + kv_cache: torch.Tensor, + block_tables_prefill: torch.Tensor, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + dequant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, num_of_page_per_token = block_tables_prefill.shape + s = kv_cache.shape + assert s[1] == 2 + assert dequant_dtype in (torch.bfloat16, torch.float16) + k_cache_stride = s[2] * s[3] * s[4] + kv_cache_stride = k_cache_stride * s[1] + new_s = (batch_size * num_of_page_per_token + 1, s[1], s[2], s[3], s[4]) + # mock kv cache contains just the pages needed by this prefill + mock_kv_cache = torch.empty(new_s, + dtype=dequant_dtype, + device=kv_cache.device) + # we simply sequentially index the pages needed by this prefill + mock_block_table = torch.arange( + start=1, + end=batch_size * num_of_page_per_token + 1, + dtype=torch.int32, + device=block_tables_prefill.device, + ).reshape(batch_size, num_of_page_per_token) + grid = (batch_size, num_of_page_per_token) + _trtllm_prefill_attn_kvfp8_dequant[grid]( + kv_cache, + block_tables_prefill, + num_of_page_per_token, + mock_kv_cache, + k_scale, + v_scale, + k_cache_stride, + kv_cache_stride, + ) + return mock_kv_cache, mock_block_table + + +class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True @classmethod @@ -122,7 +204,6 @@ class FlashInferBackend(AttentionBackend): @dataclass class FlashInferMetadata: - num_actual_tokens: int # Number of tokens excluding padding. # The data type of the query @@ -175,8 +256,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.kv_cache_spec.block_size) max_num_reqs = vllm_config.scheduler_config.max_num_seqs max_num_pages = max_num_reqs * max_num_pages_per_req - self.enable_cuda_graph = self.compilation_config.cudagraph_mode.\ - decode_mode() == CUDAGraphMode.FULL + self.enable_cuda_graph = (self.compilation_config.cudagraph_mode.\ + decode_mode() == CUDAGraphMode.FULL) if self.enable_cuda_graph: # For full cudagraph capture, one `decode_wrapper` for each batch # size is needed for FlashInfer. @@ -201,7 +282,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): assert self.kv_cache_spec.dtype == self.model_config.dtype self.kv_cache_dtype = self.kv_cache_spec.dtype - if supports_trtllm_attention()[0]: + if supports_trtllm_attention()[0] and \ + not flashinfer_disable_q_quantization(): self.q_data_type = self.kv_cache_dtype else: self.q_data_type = self.model_config.dtype @@ -795,11 +877,29 @@ class FlashInferImpl(AttentionImpl): assert self.o_sf_scale is None out = output[num_decode_tokens:] + if attn_metadata.q_data_type != FP8_DTYPE \ + and self.kv_cache_dtype.startswith("fp8"): + # TRTLLM prefill attention does not support BF16 Q + # and fp8 kv cache. 
So to enable prefill attention + # with fp8 kv cache, we can construct a mock block + # and mock kv cache with BF16 KV involved in the prefill + mock_kv_cache, mock_block_table = ( + trtllm_prefill_attn_kvfp8_dequant( + kv_cache_permute, + block_tables_prefill, + layer._k_scale, + layer._v_scale, + attn_metadata.q_data_type, + )) + else: + mock_kv_cache = kv_cache_permute + mock_block_table = block_tables_prefill + trtllm_batch_context_with_kv_cache( query=prefill_query, - kv_cache=kv_cache_permute, + kv_cache=mock_kv_cache, workspace_buffer=workspace_buffer, - block_tables=block_tables_prefill, + block_tables=mock_block_table, seq_lens=seq_lens_prefill, max_q_len=attn_metadata.max_q_len, max_kv_len=attn_metadata.max_seq_len, @@ -837,7 +937,7 @@ class FlashInferImpl(AttentionImpl): decode_query = decode_query.contiguous() workspace_buffer = decode_wrapper._float_workspace_buffer block_tables_decode = attn_metadata.\ - block_table_tensor[:num_decode_tokens] + block_table_tensor[:num_decode_tokens] seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens] # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND From 1ec20355f5eaa52b5965bfa01d2348b5ed818750 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:32:27 -0400 Subject: [PATCH 195/584] [Bugfix] Set `VLLM_ALLREDUCE_USE_SYMM_MEM` default to False (#24696) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index c5688a72e11d5..8ca7e0f19428d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1218,7 +1218,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use pytorch symmetric memory for allreduce "VLLM_ALLREDUCE_USE_SYMM_MEM": - lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))), + lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), # Allows vllm to find tuned config under customized folder "VLLM_TUNED_CONFIG_FOLDER": From 569bf1c9c0063e48844a1a014a7f01bc5d4aba33 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Thu, 11 Sep 2025 14:38:16 -0700 Subject: [PATCH 196/584] [Qwen3-Next] MoE configs for H200 TP=1,2,4 (#24695) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- .../E=512,N=128,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ .../E=512,N=256,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ .../E=512,N=512,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..20146f53a6eba --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..2f0b45014e863 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + 
"48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..e55df46b40269 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} From 7d4651997a57bc3ed5ad49e0e96baf41020979c8 Mon Sep 17 00:00:00 2001 From: Zhewen Li <zhewenli@meta.com> Date: Thu, 11 Sep 2025 15:34:36 -0700 Subject: [PATCH 197/584] [CI/Build] Add bc-linter to vLLM CI (#21234) Signed-off-by: zhewenli <zhewenli@meta.com> --- .github/.bc-linter.yml | 24 ++++++++++++++ .github/workflows/bc-lint.yml | 27 ++++++++++++++++ vllm/__init__.py | 6 ++++ vllm/_bc_linter.py | 59 +++++++++++++++++++++++++++++++++++ vllm/v1/core/sched/output.py | 5 +++ 5 files changed, 121 insertions(+) create mode 100644 .github/.bc-linter.yml create mode 100644 .github/workflows/bc-lint.yml create mode 100644 vllm/_bc_linter.py diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml new file mode 100644 index 0000000000000..443dfa45af22c --- /dev/null +++ b/.github/.bc-linter.yml @@ -0,0 +1,24 @@ +# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md +version: 1 +paths: +# We temporarily disable globally, and will only enable with `annotations.include` +# include: +# - "vllm/v1/attetion/*.py" +# - "vllm/v1/core/*.py" +exclude: + - "**/*.py" + +scan: + functions: true # check free functions and methods + classes: true # check classes/dataclasses + public_only: true # ignore names starting with "_" at any level + +annotations: + include: # decorators that force‑include a symbol + - name: "bc_linter_include" # matched by simple name or dotted suffix + propagate_to_members: false # for classes, include methods/inner classes + exclude: # decorators that force‑exclude a symbol + - name: "bc_linter_skip" # matched by simple name or dotted suffix + propagate_to_members: true # for classes, exclude methods/inner classes + +excluded_violations: [] # e.g. 
["ParameterRenamed", "FieldTypeChanged"] diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml new file mode 100644 index 0000000000000..3795b046d7808 --- /dev/null +++ b/.github/workflows/bc-lint.yml @@ -0,0 +1,27 @@ +name: BC Lint + +on: + pull_request: + types: + - opened + - synchronize + - reopened + +jobs: + bc_lint: + if: github.repository_owner == 'vllm-project' + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@main + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} + suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }} + docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter' + config_dir: .github + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true diff --git a/vllm/__init__.py b/vllm/__init__.py index 7b90fd3a241bd..3a5c1b1ce0daf 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -14,6 +14,8 @@ import typing import vllm.env_override # noqa: F401 MODULE_ATTRS = { + "bc_linter_skip": "._bc_linter:bc_linter_skip", + "bc_linter_include": "._bc_linter:bc_linter_include", "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs", "EngineArgs": ".engine.arg_utils:EngineArgs", "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine", @@ -54,6 +56,8 @@ if typing.TYPE_CHECKING: ScoringRequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams + + from ._bc_linter import bc_linter_include, bc_linter_skip else: def __getattr__(name: str) -> typing.Any: @@ -70,6 +74,8 @@ else: __all__ = [ "__version__", + "bc_linter_skip", + "bc_linter_include", "__version_tuple__", "LLM", "ModelRegistry", diff --git a/vllm/_bc_linter.py b/vllm/_bc_linter.py new file mode 100644 index 0000000000000..52a95dbee1866 --- /dev/null +++ b/vllm/_bc_linter.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# vllm/_bc_linter.py +from __future__ import annotations + +from typing import Any, Callable, TypeVar, overload + +T = TypeVar("T") + + +@overload +def bc_linter_skip(obj: T) -> T: + ... + + +@overload +def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]: + ... + + +def bc_linter_skip(obj: Any = None, *, reason: str | None = None): + """ + No-op decorator to mark symbols/files for BC-linter suppression. + + Usage: + @bc_linter_skip + def legacy_api(...): ... + """ + + def _wrap(x: T) -> T: + return x + + return _wrap if obj is None else obj + + +@overload +def bc_linter_include(obj: T) -> T: + ... + + +@overload +def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]: + ... + + +def bc_linter_include(obj: Any = None, *, reason: str | None = None): + """ + Usage: + @bc_linter_include + def public_api(...): ... 
+ """ + + def _wrap(x: T) -> T: + return x + + return _wrap if obj is None else obj + + +__all__ = ["bc_linter_skip", "bc_linter_include"] diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index b5cd6c5c8af51..9888f25735753 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -6,6 +6,8 @@ from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING, Optional +from vllm import bc_linter_include + if TYPE_CHECKING: import numpy as np import numpy.typing as npt @@ -19,6 +21,7 @@ if TYPE_CHECKING: from vllm.v1.request import Request +@bc_linter_include @dataclass class NewRequestData: @@ -80,6 +83,7 @@ class NewRequestData: ")") +@bc_linter_include @dataclass class CachedRequestData: @@ -109,6 +113,7 @@ class CachedRequestData: ) +@bc_linter_include @dataclass class SchedulerOutput: From 7a70a718927db93ffa399eda5a38ca0541fbb5ed Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Fri, 12 Sep 2025 02:34:58 +0400 Subject: [PATCH 198/584] [Qwen3-Next] Add B200 MoE configs for Qwen3-next (#24698) Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com> --- .../E=512,N=128,device_name=NVIDIA_B200.json | 146 ++++++++++++++++++ .../E=512,N=256,device_name=NVIDIA_B200.json | 146 ++++++++++++++++++ .../E=512,N=512,device_name=NVIDIA_B200.json | 146 ++++++++++++++++++ .../E=512,N=64,device_name=NVIDIA_B200.json | 146 ++++++++++++++++++ 4 files changed, 584 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..d104aa5167b22 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, 
+ "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..d0140252594f5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..5d69efe9ed5f9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..a0855a921f3f6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} From d4fd2768ef1cf3db52956a3aa881ef3a5ff72a51 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni001@gmail.com> Date: Thu, 11 Sep 2025 18:39:42 -0400 Subject: [PATCH 199/584] [Bugfix][Attention] Fix FlashInfer MLA block size logic (#24692) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> --- vllm/platforms/cuda.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 52b33849dae6b..e40b6eb2b5a4d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -146,6 +146,7 @@ class CudaPlatformBase(Platform): # required block_size. 
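The block-size fixups that follow reduce to a small table: FlashMLA needs a block size of 64 (per its supported-kernel check, simplified here), CUTLASS MLA needs 128, and FlashInfer MLA accepts 32 or 64, with anything else forced to 64. A compact sketch of the same normalization:

    def normalize_mla_block_size(backend: str, block_size: int) -> int:
        # Mirrors the forcing logic in CudaPlatformBase below (sketch only).
        if backend == "FLASHMLA" and block_size != 64:
            return 64
        if backend == "CUTLASS_MLA" and block_size != 128:
            return 128
        if backend == "FLASHINFER_MLA" and block_size not in (32, 64):
            return 64
        return block_size

    assert normalize_mla_block_size("FLASHINFER_MLA", 16) == 64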
use_flashmla = False use_cutlass_mla = False + use_flashinfer_mla = False if envs.VLLM_ATTENTION_BACKEND is None: # Default case @@ -164,6 +165,8 @@ class CudaPlatformBase(Platform): use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA") use_cutlass_mla = ( envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA") + use_flashinfer_mla = ( + envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA") from vllm.attention.ops.flashmla import is_flashmla_supported if use_flashmla and is_flashmla_supported()[0] \ @@ -176,6 +179,11 @@ class CudaPlatformBase(Platform): cache_config.block_size = 128 logger.info("Forcing kv cache block size to 128 for " "CUTLASS_MLA backend.") + if use_flashinfer_mla and cache_config.block_size not in [32, 64]: + cache_config.block_size = 64 + logger.info( + "Forcing kv cache block size to 64 for FlashInferMLA " + "backend.") # lazy import to avoid circular import from vllm.config import CUDAGraphMode @@ -228,8 +236,9 @@ class CudaPlatformBase(Platform): use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or ( selected_backend is None and cls.is_device_capability(100) and block_size == 128) - use_flashinfermla = (selected_backend == _Backend.FLASHINFER_MLA - and cls.has_device_capability(100)) + use_flashinfermla = selected_backend == _Backend.FLASHINFER_MLA or ( + selected_backend is None and cls.is_device_capability(100) + and block_size in [32, 64]) use_flashmla = selected_backend in [ _Backend.FLASHMLA, _Backend.FLASHMLA_VLLM_V1 ] or (selected_backend is None and is_flashmla_supported()[0]) From c3aea10dc8b83645523843a27999a95340020ddf Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Thu, 11 Sep 2025 18:43:14 -0400 Subject: [PATCH 200/584] [Perf] Use upstream CUTLASS for SM90 Block FP8 kernel (#23280) Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- benchmarks/kernels/bench_block_fp8_gemm.py | 59 +- .../gemm/collective/collective_builder.hpp | 123 --- .../gemm/collective/fp8_accumulation.hpp | 183 ----- ..._warpspecialized_fp8_blockwise_scaling.hpp | 729 ------------------ .../gemm/dispatch_policy.hpp | 39 - .../vllm_collective_builder.cuh | 2 +- ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 3 - ...scaled_mm_blockwise_sm120_fp8_dispatch.cuh | 3 - .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 198 +++-- .../cutlass_w8a8/c3x/scaled_mm_helper.hpp | 28 +- tests/kernels/quantization/test_block_fp8.py | 52 +- .../model_executor/layers/quantization/fp8.py | 12 +- .../layers/quantization/utils/fp8_utils.py | 50 +- 13 files changed, 221 insertions(+), 1260 deletions(-) delete mode 100644 csrc/cutlass_extensions/gemm/collective/collective_builder.hpp delete mode 100644 csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp delete mode 100644 csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp delete mode 100644 csrc/cutlass_extensions/gemm/dispatch_policy.hpp diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py index 9663503e9baa0..f1e504499eaf6 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -4,7 +4,10 @@ import torch from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul, + apply_w8a8_block_fp8_linear, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + CUTLASS_BLOCK_FP8_SUPPORTED, ) from vllm.platforms import current_platform from vllm.triton_utils import triton 
as vllm_triton @@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [ ] -def build_w8a8_block_fp8_runner(M, N, K, block_size, device): +def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): """Build runner function for w8a8 block fp8 matmul.""" factor_for_scale = 1e-2 @@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device): fp8_max, fp8_min = fp8_info.max, fp8_info.min # Create random FP8 tensors - A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max - A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max - B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max - B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) # Create scales block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n k_tiles = (K + block_k - 1) // block_k - As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale Bs = ( torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) * factor_for_scale ) + # SM90 CUTLASS requires row-major format for scales + if use_cutlass and current_platform.is_device_capability(90): + Bs = Bs.T.contiguous() + def run(): - return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16) + if use_cutlass: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True + ) + else: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False + ) return run +# Determine available providers +available_providers = ["torch-bf16", "w8a8-block-fp8-triton"] +plot_title = "BF16 vs W8A8 Block FP8 GEMMs" + +if CUTLASS_BLOCK_FP8_SUPPORTED: + available_providers.append("w8a8-block-fp8-cutlass") + + @vllm_triton.testing.perf_report( vllm_triton.testing.Benchmark( x_names=["batch_size"], x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], x_log=False, line_arg="provider", - line_vals=["torch-bf16", "w8a8-block-fp8"], - line_names=["torch-bf16", "w8a8-block-fp8"], + line_vals=available_providers, + line_names=available_providers, ylabel="TFLOP/s (larger is better)", plot_name="BF16 vs W8A8 Block FP8 GEMMs", args={}, @@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( lambda: torch.nn.functional.linear(a, b), quantiles=quantiles ) - else: # w8a8-block-fp8 - run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device) - ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( - lambda: run_w8a8(), quantiles=quantiles + elif provider == "w8a8-block-fp8-triton": + run_w8a8_triton = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=False ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_triton(), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-cutlass": + run_w8a8_cutlass = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=True + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_cutlass(), quantiles=quantiles + ) + else: + raise ValueError(f"Unknown provider: {provider}") to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) return to_tflops(ms), 
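The to_tflops conversion above counts one GEMM as 2*M*N*K floating-point operations (one multiply plus one add per inner-product term). As a worked example with an illustrative shape and timing:

    # Worked example of the TFLOP/s conversion (shape and time illustrative).
    M, N, K = 1024, 24576, 7168
    t_ms = 2.0
    flops = 2 * M * N * K                     # ~3.61e11 operations
    tflops = flops * 1e-12 / (t_ms * 1e-3)
    print(f"{tflops:.0f} TFLOP/s")            # ~180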
to_tflops(max_ms), to_tflops(min_ms) diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp deleted file mode 100644 index ec75c29e54f4d..0000000000000 --- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp +++ /dev/null @@ -1,123 +0,0 @@ -// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl -// clang-format off -#pragma once - -#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" - -#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// GMMA_TMA_WS_SS (BlockScaled Builders) -template < - class ElementA, - class GmemLayoutATag, - int AlignmentA, - class ElementB, - class GmemLayoutBTag, - int AlignmentB, - class ElementAccumulator, - class TileShape_MNK, - class ClusterShape_MNK, - class StageCountType, - int ScaleGranularityM -> -struct CollectiveBuilder< - arch::Sm90, - arch::OpClassTensorOp, - ElementA, - GmemLayoutATag, - AlignmentA, - ElementB, - GmemLayoutBTag, - AlignmentB, - ElementAccumulator, - TileShape_MNK, - ClusterShape_MNK, - StageCountType, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>, - cute::enable_if_t< - not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()> -> { - using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>; - - static_assert(is_static<TileShape_MNK>::value); - static_assert(is_static<ClusterShape_MNK>::value); -#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED - static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n"); -#endif - static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(), - "Should meet TMA alignment requirement\n"); - - static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType, - KernelPtrArrayTmaWarpSpecializedCooperative, - KernelPtrArrayTmaWarpSpecializedPingpong>); - static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>(); - static_assert((!IsFP8Input || !IsArrayOfPointersGemm), - "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); - - // For fp32 types, map to tf32 MMA value type - using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>; - using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>; - - static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>(); - static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>(); - - static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType, - KernelTmaWarpSpecializedCooperative, - KernelPtrArrayTmaWarpSpecializedCooperative, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>; - using AtomLayoutMNK = cute::conditional_t<IsCooperative, - Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>; - - using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< - ElementAMma, ElementBMma, 
ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); - - using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); - using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); - - using SmemLayoutAtomA = decltype(detail::ss_smem_selector< - GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - using SmemLayoutAtomB = decltype(detail::ss_smem_selector< - GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - - static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage); - - static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout, - ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{}); - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>; - - using SmemCopyAtomA = void; - using SmemCopyAtomB = void; - - using CollectiveOp = CollectiveMma< - DispatchPolicy, - TileShape_MNK, - ElementA, - TagToStrideA_t<GmemLayoutATag>, - ElementB, - TagToStrideB_t<GmemLayoutBTag>, - TiledMma, - GmemTiledCopyA, - SmemLayoutAtomA, - SmemCopyAtomA, - cute::identity, - GmemTiledCopyB, - SmemLayoutAtomB, - SmemCopyAtomB, - cute::identity - >; -}; - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp deleted file mode 100644 index 13b90e998625e..0000000000000 --- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// clang-format off -// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cute/algorithm/clear.hpp" -#include "cute/tensor.hpp" - -////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////FP8 Accumulation/////////////////////////// -////////////////////////////////////////////////////////////////////////////// -/// This class provides API to promote (add) or scale (multiply_add) the results -/// from the tensor core accumulators to the main accumulators when the number -/// of MMAs reaches the max number of MMA interval specified by user, after that -/// the tensor core accumulators are zeroed. -////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -template < - class EngineAccum, - class LayoutAccum> -struct GmmaFP8AccumulationWithScale { - using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>; - using ElementAccumulator = typename EngineAccum::value_type; - - static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static"); - static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident."); - -private: - TensorAccum& accum_; - TensorAccum accum_temp_; - - uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. - uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop - uint32_t mma_count_; // current executed MMAs - uint32_t reset_accum_flag_; // accum needs to be zeroed or not. - - // promote or `add` the partial accumulators to main accumulator (FADD). - CUTLASS_DEVICE - void promote_core() { - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i); - } - } - - // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
- template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) { - using TensorScale = cute::Tensor<EngineScale, LayoutScale>; - - static_assert(is_static<LayoutScale>::value, "Scale Layout should be static"); - static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident."); - - static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); - - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i) * scale(i); - } - } - -public: - CUTLASS_DEVICE - GmmaFP8AccumulationWithScale( - TensorAccum &accum, - uint32_t accum_promotion_interval, - uint32_t mma_count_per_mainloop_iteration) - : accum_(accum), - accum_promotion_interval_(accum_promotion_interval), - mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), - mma_count_(0), - reset_accum_flag_(0) - { - accum_temp_ = cute::make_fragment_like(accum); - } - - // - // Methods (Common) - // - - CUTLASS_DEVICE - TensorAccum& operator()() { - return accum_temp_; - } - - /// prepare the MMA accumulators when initialization or zeroing is required. - CUTLASS_DEVICE - bool prepare_if_needed() { - return reset_accum_flag_; - } - - // - // Methods (for FADD version) - // - - /// promote (add) the results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_if_needed() { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - promote_core(); - mma_count_ = 0; - } - } - - /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_residue_if_needed() { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - promote_core(); - } - } - - // - // Methods (for FFMA version) - // - - /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - scale_core(scale); - mma_count_ = 0; - } - } - - /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. 
- template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - scale_core(scale); - } - } -}; - -} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp deleted file mode 100644 index ce7f47cf72337..0000000000000 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ /dev/null @@ -1,729 +0,0 @@ -// clang-format off -// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/trace.h" -#include "cutlass/numeric_types.h" - -#include "cute/arch/cluster_sm90.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/arch/copy_sm90.hpp" -#include "cute/algorithm/functional.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/algorithm/gemm.hpp" -#include "cute/numeric/arithmetic_tuple.hpp" - -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// WarpSpecialized Mainloop -template < - int Stages, - class ClusterShape, - class KernelSchedule, - int ScaleGranularityM_, - class TileShape_, - class ElementA_, - class StrideA_, - class ElementB_, - class StrideB_, - class TiledMma_, - class GmemTiledCopyA_, - class SmemLayoutAtomA_, - class SmemCopyAtomA_, - class TransformA_, - class GmemTiledCopyB_, - class SmemLayoutAtomB_, - class SmemCopyAtomB_, - class TransformB_> -struct CollectiveMma< - MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>, - TileShape_, - ElementA_, - StrideA_, - ElementB_, - StrideB_, - TiledMma_, - GmemTiledCopyA_, - SmemLayoutAtomA_, - SmemCopyAtomA_, - TransformA_, - GmemTiledCopyB_, - SmemLayoutAtomB_, - SmemCopyAtomB_, - TransformB_> -{ - // - // Type Aliases - // - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>; - using TileShape = TileShape_; - using ElementA = ElementA_; - using StrideA = StrideA_; - using ElementB = ElementB_; - using StrideB = StrideB_; - using TiledMma = TiledMma_; - using ElementAccumulator = typename TiledMma::ValTypeC; - using ElementBlockScale = ElementAccumulator; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomA = SmemLayoutAtomA_; - using SmemLayoutAtomB = SmemLayoutAtomB_; - using SmemCopyAtomA = SmemCopyAtomA_; - using SmemCopyAtomB = SmemCopyAtomB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); - using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>; - using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>; - using PipelineParams = typename MainloopPipeline::Params; - - // Two threads per CTA are producers (1 for operand tile and 32 for scales) - static constexpr int NumProducerThreadEvents = 33; - - static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? 
size<0>(TileShape{}) : ScaleGranularityM_; - static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; - - static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); - - // Tile along modes in a way that maximizes the TMA box size. - using SmemLayoutA = decltype(tile_to_shape( - SmemLayoutAtomA{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - using SmemLayoutB = decltype(tile_to_shape( - SmemLayoutAtomB{}, - make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - - // Block scaling gmem-to-smem copy atom - using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>; - using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>; - - // Block scaling smem layout - using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>; - using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
- - static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); - static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value && - cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value, - "MMA atom must source both A and B operand from smem_desc for this mainloop."); - static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>, - "ElementAccumulator and ElementBlockScale should be same datatype"); - - struct SharedStorage - { - struct TensorStorage : cute::aligned_struct<128> { - cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A; // mxk - cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B; // nxk - cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k - cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk - } tensors; - - using PipelineStorage = typename MainloopPipeline::SharedStorage; - PipelineStorage pipeline; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - using PipelineStorage = typename SharedStorage::PipelineStorage; - - // Host side kernel arguments - struct Arguments { - ElementA const* ptr_A; - StrideA dA; - ElementB const* ptr_B; - StrideB dB; - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // Device side kernel params - struct Params { - // Assumption: StrideA is congruent with Problem_MK - using TMA_A = decltype(make_tma_copy_A_sm90( - GmemTiledCopyA{}, - make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), - SmemLayoutA{}(_,_,0), - TileShape{}, - ClusterShape{})); - // Assumption: StrideB is congruent with Problem_NK - using TMA_B = decltype(make_tma_copy_B_sm90( - GmemTiledCopyB{}, - make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), - SmemLayoutB{}(_,_,0), - TileShape{}, - ClusterShape{})); - TMA_A tma_load_a; - TMA_B tma_load_b; - uint32_t tma_transaction_bytes = TmaTransactionBytes; - uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; - // Block scaling factors for A and B - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // - // Methods - // - - template <class ProblemShape> - static constexpr Params - to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { - (void) workspace; - - // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A); - auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B); - - Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); - Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); - typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( - 
GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( - GmemTiledCopyB{}, - tensor_b, - SmemLayoutB{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - uint32_t transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t transaction_bytes_nk = TmaTransactionBytesNK; - uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; - - return { - tma_load_a, - tma_load_b, - transaction_bytes, - transaction_bytes_mk, - transaction_bytes_nk, - args.ptr_scale_A, - args.ptr_scale_B - }; - } - - template<class ProblemShape> - static bool - can_implement( - ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { - constexpr int tma_alignment_bits = 128; - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - bool implementable = true; - constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value; - implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{}); - constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value; - implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{}); - - if (!implementable) { - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); - } - return implementable; - } - - static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; - static constexpr int K_PIPE_MMAS = 1; - static constexpr uint32_t TmaTransactionBytesMK = - cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value)); - static constexpr uint32_t TmaTransactionBytesNK = - cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value)); - static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; - - /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE - static void prefetch_tma_descriptors(Params const& mainloop_params) - { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); - } - - /// Set up the data needed by this collective for load and mma. - /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract - /// Returned tuple must contain at least two elements, with the first two elements being: - /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) - /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) - template <class ProblemShape_MNKL> - CUTLASS_DEVICE auto - load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { - using X = Underscore; - // Separate out problem shape for convenience - auto [M,N,K,L] = problem_shape_MNKL; - - // TMA requires special handling of strides to deal with coord codomain mapping - // Represent the full tensors -- get these from TMA - Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) - Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) - - // Make tiled views, defer the slice - Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) - Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) - - constexpr auto scales_m = Int<ScaleMsPerTile>{}; - auto tM = get<2>(gA_mkl.shape()); - auto tN = get<2>(gB_nkl.shape()); - auto tK = get<3>(gA_mkl.shape()); - - // Make the tiled views of scale tensors - auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) - auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); - auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) - auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); - - // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and - // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
- Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) - Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) - - return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Producer Perspective - template < - class TensorA, class TensorB, - class TensorScaleA, class TensorScaleB, - class KTileIterator, class BlockCoord - > - CUTLASS_DEVICE void - load( - Params const& mainloop_params, - MainloopPipeline pipeline, - PipelineState smem_pipe_write, - cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs, - BlockCoord const& blk_coord, - KTileIterator k_tile_iter, int k_tile_count, - int thread_idx, - uint32_t block_rank_in_cluster, - TensorStorage& shared_tensors) { - int lane_predicate = cute::elect_one_sync(); - - // Blockscaling: Tma loads for load_input and CpAsync for load_scale - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Prepare the TMA loads for A and B - // - - constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); - uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; - - Tensor gA_mkl = get<0>(load_inputs); - Tensor gB_nkl = get<1>(load_inputs); - - auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); - auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); - - // Partition the inputs based on the current block coordinates. 
- auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; - Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) - Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) - - - // Block scaling: load_scale has scaling tensors in global memory which are not tiled - Tensor mScaleA_mkl = get<2>(load_inputs); - Tensor mScaleB_nkl = get<3>(load_inputs); - auto scales_m = get<0>(mScaleA_mkl.shape()); - - Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); - - Tensor gScaleA = local_tile( - mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), - make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) - Tensor cScaleA = local_tile( - cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), - make_coord(m_coord,_,l_coord)); - Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) - - // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 - TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, - Layout<Shape<_32>>{}, Layout<Shape<_1>>{}); // (1,1,1) - TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, - Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1) - ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); - ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); - - Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); - Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); - Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); - - Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); - Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); - - // Applies the mapping from block_tma_a - Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) - Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) - - Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) - Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) - - uint16_t mcast_mask_a = 0; - uint16_t mcast_mask_b = 0; - - // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors - // Maps the tile -> block, value - if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) { - auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id - for (int n = 0; n < size<1>(block_layout); ++n) { - mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); - } - } - - if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) { - auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id - for (int m = 0; m < size<0>(block_layout); ++m) { - mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); - } - } - - // Allocate predicate tensors for a_scales (since we can't guarantee that - // all scales are valid, since we could have a partial tiles along M) - Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0))); - #pragma unroll - for (int i = 0; i < size(tApA_ScaleA); ++i) { - tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; - } - - // Mainloop - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - // LOCK smem_pipe_write for _writing_ - pipeline.producer_acquire(smem_pipe_write); - - // - // Copy gmem to smem for *k_tile_iter - // - int write_stage = smem_pipe_write.index(); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - // Copy operands A and B from global memory to shared memory - if (lane_predicate) 
copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - - // Copy scale tensors from global memory to shared memory - copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); - copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); - - ++k_tile_iter; - - // Advance smem_pipe_write - ++smem_pipe_write; - } - } - - /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster - CUTLASS_DEVICE void - load_tail( - MainloopPipeline pipeline, - PipelineState smem_pipe_write) { - int lane_predicate = cute::elect_one_sync(); - - // Issue the epilogue waits - if (lane_predicate) { - /* This helps avoid early exit of blocks in Cluster - * Waits for all stages to either be released (all - * Consumer UNLOCKs), or if the stage was never used - * then would just be acquired since the phase was - * still inverted from make_producer_start_state - */ - pipeline.producer_tail(smem_pipe_write); - } - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Consumer Perspective - template < - class FrgTensorC - > - CUTLASS_DEVICE void - mma(MainloopPipeline pipeline, - PipelineState smem_pipe_read, - FrgTensorC& accum, - int k_tile_count, - int thread_idx, - TensorStorage& shared_tensors, - Params const& mainloop_params) { - - - static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident."); - static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::is_void_v<SmemCopyAtomA>, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - static_assert(cute::is_void_v<SmemCopyAtomB>, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - - // Block scaling - Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), - Layout< - Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>, - Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>> - >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Define C accumulators and A/B partitioning - // - - // Layout of warp group to thread mapping - - static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and - stride<0>(typename TiledMma::BLayout{}) == 0 and - size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and - size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, - "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); - - constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; - Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, - Int<NumThreadsPerWarpGroup>{}); - - int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); - - 
TiledMma tiled_mma; - auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); - - Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. - - Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) - - // Allocate "fragments/descriptors" - Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) - - CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M - CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N - CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K - CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE - CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA)); // PIPE - CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB)); // PIPE - - // - // PIPELINED MAIN LOOP - // - static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), - "ERROR : Incorrect number of MMAs in flight"); - - // We release buffers to producer warps(dma load) with some mmas in flight - PipelineState smem_pipe_release = smem_pipe_read; - - // Per block scale values for operand A and B - - using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. - using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above - - Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) - ElementBlockScale scale_b; - - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - - GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); - warpgroup_fence_operand(accumulation()); - CUTLASS_PRAGMA_UNROLL - for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers. - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - ++smem_pipe_read; - } - - warpgroup_fence_operand(accumulation()); - // Mainloop GMMAs - k_tile_count -= prologue_mma_count; - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - // - // Compute on k_tile - // - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - warpgroup_fence_operand(accumulation()); - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed - warpgroup_wait<K_PIPE_MMAS>(); - warpgroup_fence_operand(accumulation()); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - - // Advance smem_pipe_read and smem_pipe_release - ++smem_pipe_read; - ++smem_pipe_release; - } - - accumulation.scale_residue_if_needed(tCrScaleAViewAsC); - - warpgroup_fence_operand(accumulation()); - } - - /// Perform a Consumer Epilogue to release all buffers - CUTLASS_DEVICE void - mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - k_tile_count -= prologue_mma_count; - - smem_pipe_release.advance(k_tile_count); - - // Wait on all GMMAs to complete - warpgroup_wait<0>(); - - for (int count = 0; count < prologue_mma_count; ++count) { - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - ++smem_pipe_release; - } - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp deleted file mode 100644 index df809e27a3efe..0000000000000 --- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "cutlass/gemm/dispatch_policy.hpp" - -namespace cutlass::gemm { - -////////////////////////////////////////////////////////////////////////////// - -// FP8 related policies (including Blocked Scaled Accumulation) -// `ScaleGranularityM` specifies scaling granularity along M, while zero-value -// `ScaleGranularityM` indicates that scaling granularity is -// `size<0>(TileShape_MNK{})` along M. -template <int ScaleGranularityM = 0> -struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum - : KernelTmaWarpSpecializedCooperative {}; - -// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp -// specialized dynamic schedule For FP8 kernels with Block Scaling -template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>, - class KernelSchedule = KernelTmaWarpSpecialized, - int ScaleGranularityM = - 0 // `ScaleGranularityM` specifies scaling granularity along M, - // while zero-value `ScaleGranularityM` indicates that scaling - // granularity is `size<0>(TileShape_MNK{})` along M. 
- > -struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 - : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, - KernelSchedule> { - static_assert( - cute::is_same_v< - KernelSchedule, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - ScaleGranularityM>>, - "KernelSchedule must be one of the warp specialized policies"); -}; - -////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index e7fbba4cd4b0d..085ee1290031f 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index c841125dbb734..939879b2c59fa 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -14,9 +14,6 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh index d50a83ae1cd48..78d5cf37fa6d0 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -14,9 +14,6 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh index e089c3d4be2cc..86220264151e7 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -13,27 +13,18 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { using namespace cute; -template <typename SchedulerType, typename OutType, int GroupSizeM_, - int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128, - class ClusterShape = Shape<_1, _2, _1>> +// clang-format off +template <class OutType, int ScaleGranularityM, + int ScaleGranularityN, int ScaleGranularityK, + class MmaTileShape, class ClusterShape, + class EpilogueScheduler, class MainloopScheduler> struct cutlass_3x_gemm_fp8_blockwise { - using GroupSizeM = Int<GroupSizeM_>; - 
using GroupSizeN = Int<GroupSizeN_>; - using GroupSizeK = Int<GroupSizeK_>; - using TileSizeM = Int<TileSizeM_>; - - static_assert(TileSizeM_ % GroupSizeM_ == 0, - "TileSizeM must be a multiple of GroupSizeM"); - using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; @@ -45,52 +36,67 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; using ElementD = OutType; - using StrideD = Stride<int64_t, Int<1>, Int<0>>; + using LayoutD = cutlass::layout::RowMajor; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value; - using ElementC = void; - using StrideC = StrideD; + using ElementC = void; // TODO: support bias + using LayoutC = LayoutD; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; - using ElementBlockScale = float; using ElementCompute = float; + using ElementBlockScale = float; + + using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>; + + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + using ArchTag = cutlass::arch::Sm90; using OperatorClass = cutlass::arch::OpClassTensorOp; - using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>; - using KernelSchedule = cutlass::gemm:: - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - GroupSizeM_>; - using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; - using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementScalar = float; + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + MmaTileShape, + ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, + ElementCompute, + ElementC, + LayoutC, + AlignmentC, + ElementD, + LayoutD, + AlignmentD, + EpilogueScheduler, + DefaultOperation + >::CollectiveOp; - using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< - cutlass::epilogue::fusion::Sm90AccFetch>; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, - ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, - ElementD, StrideD, AlignmentD, EpilogueSchedule, - StoreEpilogueCompute>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, - LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule>::CollectiveOp; + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple<LayoutA, LayoutSFA>, + AlignmentA, + ElementB, + cute::tuple<LayoutB, LayoutSFB>, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp; using KernelType = 
enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal< - Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, - SchedulerType>>; + Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>; struct GemmKernel : public KernelType {}; - - using StrideA = typename GemmKernel::StrideA; - using StrideB = typename GemmKernel::StrideB; }; template <typename Gemm> @@ -99,76 +105,54 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { using GemmKernel = typename Gemm::GemmKernel; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StrideC = typename Gemm::GemmKernel::StrideC; + using LayoutSFA = typename Gemm::LayoutSFA; + using LayoutSFB = typename Gemm::LayoutSFB; + using ScaleConfig = typename Gemm::ScaleConfig; using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; - auto prob_shape = c3x::get_problem_shape(a, b); - int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), - k = get<2>(prob_shape); + int32_t m = a.size(0), n = b.size(1), k = a.size(1); - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); + TORCH_CHECK(m % 4 == 0, "m must be divisible by 4"); - using StrideA = Stride<int64_t, Int<1>, int64_t>; - using StrideB = Stride<int64_t, Int<1>, int64_t>; - using StrideC = typename Gemm::StrideC; + StrideA a_stride; + StrideB b_stride; + StrideC c_stride; + a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + LayoutSFA layout_SFA = + ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); + LayoutSFB layout_SFB = + ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast<ElementAB*>(a.data_ptr()); auto b_ptr = static_cast<ElementAB*>(b.data_ptr()); auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr()); auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr()); - // Check is the t is contiguous and is 1D or 2D with one of the dimensions - // being 1 (i.e. 
a row or column vector) - auto is_contiguous_vector = [](const torch::Tensor& t) { - auto t_sizes = t.sizes(); - return t.is_contiguous() && - (t.dim() == 1 || - (t.dim() == 2 && - *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); - }; - - // TODO(lucas): lets clean-up the kernel so that we pass in Strides so - // we don't have to deal with enforcing implicit layouts - TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); - TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), - "a_scales must be M major"); - TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); - TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), - "b_scales must be K major"); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + auto mainloop_args = [&](){ + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + }(); + auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast<ElementD*>(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ {}, c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::TileSchedulerArguments scheduler; - - static constexpr bool UsesStreamKScheduler = - cute::is_same_v<typename GemmKernel::TileSchedulerTag, - cutlass::gemm::StreamKScheduler>; - - if constexpr (UsesStreamKScheduler) { - using DecompositionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::DecompositionMode; - using ReductionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::ReductionMode; - - scheduler.decomposition_mode = DecompositionMode::StreamK; - scheduler.reduction_mode = ReductionMode::Nondeterministic; - } - c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args, - epilogue_args, scheduler); + epilogue_args); } template <typename OutType> @@ -177,18 +161,12 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto k = a.size(1); - auto n = b.size(1); - - if (k > 3 * n) { - cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< - cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>( - out, a, b, a_scales, b_scales); - } else { - cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< - cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>( - out, a, b, a_scales, b_scales); - } + // TODO: better heuristics + cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise< + OutType, 1, 128, 128, Shape<_128, _128, _128>, + Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative, + cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>( + out, a, b, a_scales, b_scales); } } // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp index 2ee6a19407f92..3af59267bd60c 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp @@ -32,7 +32,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor."); TORCH_CHECK(b_scales.dim() == 2, "b scale must be 
2d tensor."); int32_t version_num = get_sm_version_num(); - if (version_num >= 100) { + if (version_num >= 90) { TORCH_CHECK( a.size(0) == a_scales.size(0) && cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1), @@ -41,32 +41,6 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) && cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1), "b_scale_group_shape must be [128, 128]."); - } else { - // TODO: Remove this after using cutlass sm90 blockwise scaling gemm - // kernel, or introducing ceil_div to the load_init() of mainloop. - using GroupShape = std::array<int64_t, 2>; - auto make_group_shape = [](torch::Tensor const& x, - torch::Tensor const& s) -> GroupShape { - TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); - return {cuda_utils::ceil_div(x.size(0), s.size(0)), - cuda_utils::ceil_div(x.size(1), s.size(1))}; - }; - - GroupShape a_scale_group_shape = make_group_shape(a, a_scales); - GroupShape b_scale_group_shape = make_group_shape(b, b_scales); - - // 1x128 per-token group scales for activations - // 128x128 blockwise scales for weights - TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && - b_scale_group_shape == GroupShape{128, 128} && - a.dtype() == torch::kFloat8_e4m3fn && - b.dtype() == torch::kFloat8_e4m3fn), - "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" - "a_scale_group_shape must be [1, 128]. Got: [", - a_scale_group_shape[0], ", ", a_scale_group_shape[1], - "]\n" - "b_scale_group_shape must be [128, 128]. Got: [", - b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); } TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index d9154d3fd7f33..c440747316b80 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -11,8 +11,8 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, native_w8a8_block_matmul) from vllm.config import VllmConfig from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, - w8a8_block_fp8_matmul) + cutlass_scaled_mm, get_col_major_tma_aligned_tensor, + per_token_group_quant_fp8, w8a8_block_fp8_matmul) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8 @@ -98,6 +98,54 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 +@torch.inference_mode() +def test_w8a8_block_fp8_cutlass_matmul(): + # Test simple case where weight.shape % 128 != 0, + # like in DSV3 kv_a_proj_with_mqa + M = 32 + N = 576 + K = 7168 + block_size = [128, 128] + out_dtype = torch.bfloat16 + seed = 0 + + torch.manual_seed(seed) + factor_for_scale = 1e-2 + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale + # Hopper requires row-major format for scales + 
Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability( + 90) else Bs + + A_fp8, As = per_token_group_quant_fp8(A_fp32, + block_size[1], + column_major_scales=False) + # CUTLASS uses column-major format for scales + A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8( + A_fp32, block_size[1], column_major_scales=True) + + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass, + block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d2616da84a00a..2818674775ccc 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -30,7 +30,8 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace) + get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace, + should_use_deepgemm_for_fp8_linear) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin, prepare_moe_fp8_layer_for_marlin) @@ -462,6 +463,15 @@ class Fp8LinearMethod(LinearMethodBase): block_sz, ) + # SM90 Block FP8 CUTLASS requires row-major weight scales + if (self.block_quant and current_platform.is_device_capability(90) + and self.cutlass_block_fp8_supported + and not should_use_deepgemm_for_fp8_linear( + torch.bfloat16, layer.weight)): + layer.weight_scale_inv = Parameter( + layer.weight_scale_inv.data.T.contiguous(), + requires_grad=False) + def apply(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7b324dce3c367..e3e9635132d68 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -40,11 +40,14 @@ def cutlass_scaled_mm( block_size: list[int], output_dtype: torch.dtype = torch.float16, ) -> torch.Tensor: - return ops.cutlass_scaled_mm(A, - B.T, - out_dtype=output_dtype, - scale_a=As, - scale_b=Bs.T) + return ops.cutlass_scaled_mm( + A, + B.T, + out_dtype=output_dtype, + scale_a=As, + # SM90 block FP8 requires row-major scale_b, which we do ahead of time + scale_b=Bs if block_size is not None + and current_platform.is_device_capability(90) else Bs.T) def rocm_aiter_gemm_w8a8_blockscale_impl( @@ -152,35 +155,32 @@ def apply_w8a8_block_fp8_linear( output += bias return output.to(dtype=output_dtype).view(*output_shape) - if current_platform.is_cuda(): - if current_platform.has_device_capability(100): - - use_cutlass = cutlass_block_fp8_supported and ( - cdiv(weight.shape[0], 128) == weight_scale.shape[0] - and cdiv(weight.shape[1], 128) == weight_scale.shape[1]) - else: - # TODO: update this after switching to public sm90 block scale gemm - # as it also supports weight.shape % 128 != 0 - use_cutlass = cutlass_block_fp8_supported and ( 
- weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0) - else: - use_cutlass = False - w8a8_blockscale_func = dispatch_w8a8_blockscale_func( - use_cutlass, use_aiter_and_is_supported) - if use_cutlass: - q_input, x_scale = per_token_group_quant_fp8( - input_2d, block_size[1], column_major_scales=use_cutlass) + cutlass_block_fp8_supported, use_aiter_and_is_supported) + if cutlass_block_fp8_supported: + num_pad = 0 + if current_platform.is_device_capability(90): + # pad first dimension to be divisible by 4 due to + # cutlass blockwise gemm limitation for hopper + num_pad = 4 - (input_2d.shape[0] % 4) + if num_pad > 0: + input_2d = torch.nn.functional.pad(input_2d, + (0, 0, 0, num_pad), + "constant", 0) + q_input, x_scale = per_token_group_quant_fp8(input_2d, + block_size[1], + column_major_scales=True) output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, block_size, input.dtype) - + if num_pad > 0: + output = output[:-num_pad] else: if use_aiter_and_is_supported: q_input, x_scale = aiter_per1x128_quant( input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8) else: q_input, x_scale = per_token_group_quant_fp8( - input_2d, block_size[1], column_major_scales=use_cutlass) + input_2d, block_size[1], column_major_scales=False) output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, block_size, input.dtype) From f82f7a89906ce03a5fcc1371d6e6bab30505e569 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Thu, 11 Sep 2025 15:45:52 -0700 Subject: [PATCH 201/584] [Qwen3-Next] MOE configs for H100 TP4 (#24699) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- ...128,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..94408e279b656 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} From 7a30fa8708ce7592a3a1adeab61425a28a3bbff3 Mon Sep 17 00:00:00 2001 From: Zazzle516 <77725591+Zazzle516@users.noreply.github.com> Date: Fri, 12 Sep 2025 07:18:09 +0800 Subject: [PATCH 202/584] [Doc] Clarify cudagraph capture size logic and default behavior in scheduler (#18698) Signed-off-by: Zazzle516 <2405677060@qq.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 48 +++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 24eaf2e360ab5..8026d4c9e202c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3579,30 +3579,40 @@ class VllmConfig: def _set_cudagraph_sizes(self): """ - cudagraph batchsize padding logic: + vLLM defines the default candidate list of batch sizes for CUDA graph + capture as: - `[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible - batch sizes that cudagraph will capture. - - Depending on the engine's configuration of `max_num_seqs`, the - candidate batch sizes to capture cudagraph will shrink to the subset - which just cover the range of `[1, max_num_seqs]`. In the common case, - `max_num_seqs` is 256, and the cudagraph batch sizes will be - `[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`. - - However, if users specify the cudagraph capture sizes through - compilation config, we will use the specified sizes instead. + ```python + max_graph_size = min(max_num_seqs * 2, 512) + # 1, 2, 4, then multiples of 8 up to max_graph_size + cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size] In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` will be the final sizes to capture cudagraph (in descending order). - During runtime, if batchsize is larger than - `vllm_config.compilation_config.cudagraph_capture_sizes`, - no cudagraph will be used. - If the batch size is no larger than - `vllm_config.compilation_config.cudagraph_capture_sizes`, - we can quickly find the padded graph size for a given batch size by - looking up `vllm_config.compilation_config.bs_to_padded_graph_size`. + These sizes are used to capture and reuse CUDA graphs for + performance-critical paths (e.g., decoding). 
Capturing enables + significantly faster kernel dispatch by avoiding Python overhead. The + list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on + most GPUs), which controls the total allowed number of tokens in a + batch. Since each sequence may have a variable number of tokens, the + maximum usable batch size will depend on actual sequence lengths. + + Example: + With `max_num_batched_tokens = 8192`, and typical sequences + averaging ~32 tokens, most practical batch sizes fall below 256. + However, the system will still allow capture sizes up to 512 if + shape and memory permit. + + Note: + If users explicitly specify cudagraph capture sizes in the + compilation config, those will override this default logic. + At runtime: + + - If batch size <= one of the `cudagraph_capture_sizes`, the closest + padded CUDA graph will be used. + - If batch size > largest `cudagraph_capture_sizes`, cudagraph will + not be used. """ # calculate the default `batch_size_capture_list` From fcba05c4353c65d93b5299b50aa6b475e467c01a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Sep 2025 19:47:59 -0400 Subject: [PATCH 203/584] [Bug] Fix Layer `weight_block_size` Assertion Issue (#24674) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/model_executor/layers/quantization/fp8.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 2818674775ccc..3d94626e5d8c6 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -450,10 +450,10 @@ class Fp8LinearMethod(LinearMethodBase): # Activations not quantized for marlin. del layer.input_scale - # On B200, if E8M0 for DeepGemm is used, we need to + # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. - if is_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used() and self.block_quant: assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -905,7 +905,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - if is_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used() and self.block_quant: assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. 
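The `and self.block_quant` guard added in this hunk can be illustrated with a minimal sketch; the failure mode is inferred from the fix rather than taken from the repo:

```python
class Layer:
    weight_block_size = None              # per-tensor FP8 layer: no blocks

def maybe_requant_ue8m0(layer, e8m0_used: bool, block_quant: bool) -> None:
    if e8m0_used and block_quant:         # the fix adds the second condition
        assert layer.weight_block_size is not None
        # ... requantize weights/scales with tuple(layer.weight_block_size)

# Previously this path would trip the assert; with the guard it is a no-op.
maybe_requant_ue8m0(Layer(), e8m0_used=True, block_quant=False)
```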
block_sz = tuple(layer.weight_block_size) From 2e6bc4682181b2124beb71fc532cc12187cbceea Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Date: Thu, 11 Sep 2025 20:10:19 -0400 Subject: [PATCH 204/584] [Startup] Make DeepGEMM warmup scale with max-num-batched-tokens (#24693) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> --- .../model_executor/warmup/deep_gemm_warmup.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 74599fa44c88c..a25ef86a989db 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -10,6 +10,7 @@ import torch from tqdm import tqdm import vllm.envs as envs +from vllm.distributed.parallel_state import get_dp_group from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( compute_aligned_M, deep_gemm_block_shape) @@ -131,11 +132,9 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set() -def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - num_topk: int): +def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( + w1: torch.Tensor, w2: torch.Tensor, w1_scale: torch.Tensor, + w2_scale: torch.Tensor, num_topk: int, max_tokens: int): if (w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE): return @@ -147,9 +146,13 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor, num_experts = w1.size(0) device = w1.device + # Assumes all ranks have the same max_num_batched_tokens + max_tokens_across_dp = get_dp_group().world_size * max_tokens + max_tokens = min(max_tokens_across_dp, envs.VLLM_FUSED_MOE_CHUNK_SIZE) + # This is the maximum GroupedGemm M size that we expect to run # the grouped_gemm with. 
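For reference, a sketch of the sizing this patch moves to: warmup M now tracks max-num-batched-tokens summed across DP ranks, clamped by the fused-MoE chunk size. `compute_aligned_M` is assumed here to round the routed-token count up to a multiple of `block_m`, which the hunk does not show:

```python
def warmup_max_m(max_tokens_per_rank: int, dp_world_size: int,
                 chunk_size: int, num_topk: int, block_m: int) -> int:
    # All DP ranks may contribute up to max_tokens_per_rank tokens each.
    max_tokens = min(dp_world_size * max_tokens_per_rank, chunk_size)
    routed = max_tokens * num_topk        # each token visits num_topk experts
    return -(-routed // block_m) * block_m  # ceil to a block_m multiple

assert warmup_max_m(8192, 2, 32768, 8, 128) == 131072
```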
- MAX_M = compute_aligned_M(envs.VLLM_FUSED_MOE_CHUNK_SIZE, + MAX_M = compute_aligned_M(max_tokens, num_topk, num_experts, block_m, @@ -201,7 +204,8 @@ def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int): _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens) -def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module): +def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module, + max_tokens: int): dg_modules = [ m for m in model.modules() if _fused_moe_grouped_gemm_may_use_deep_gemm(m) @@ -211,9 +215,9 @@ def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module): w13, w13_scale, w2, w2_scale, num_topk = ( _extract_data_from_fused_moe_module(dgm)) _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( - w13, w2, w13_scale, w2_scale, num_topk) + w13, w2, w13_scale, w2_scale, num_topk, max_tokens) def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int): deepgemm_fp8_gemm_nt_warmup(model, max_tokens) - deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model) + deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens) From 40b6c9122b5997b88f5463da15ec465707d45626 Mon Sep 17 00:00:00 2001 From: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com> Date: Fri, 12 Sep 2025 08:10:39 +0800 Subject: [PATCH 205/584] [V1] feat:add engine v1 tracing (#20372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com> Signed-off-by: Ye Zhang <zhysishu@gmail.com> Signed-off-by: RichardoMu <44485717+RichardoMrMu@users.noreply.github.com> Signed-off-by: simon-mo <simon.mo@hey.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Mu Huai <tianbowen.tbw@antgroup.com> Co-authored-by: Ye Zhang <zhysishu@gmail.com> Co-authored-by: Benjamin Bartels <benjamin@bartels.dev> Co-authored-by: simon-mo <simon.mo@hey.com> Co-authored-by: 瑜琮 <ly186375@antfin.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/tracing/test_tracing.py | 137 +++++++++++++++++++++++++++++ vllm/engine/arg_utils.py | 6 -- vllm/tracing.py | 5 ++ vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/__init__.py | 6 +- vllm/v1/engine/async_llm.py | 9 +- vllm/v1/engine/llm_engine.py | 7 ++ vllm/v1/engine/output_processor.py | 86 ++++++++++++++++-- vllm/v1/engine/processor.py | 3 +- vllm/v1/metrics/stats.py | 4 + vllm/v1/request.py | 6 +- 12 files changed, 253 insertions(+), 20 deletions(-) create mode 100644 tests/v1/tracing/test_tracing.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index adbd08b13f75b..b0f5fe418dcf3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -231,7 +231,7 @@ steps: source_file_dependencies: - vllm/ - tests/metrics - - tests/tracing + - tests/v1/tracing commands: - pytest -v -s metrics - "pip install \ diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py new file mode 100644 index 0000000000000..da8655f95e195 --- /dev/null +++ b/tests/v1/tracing/test_tracing.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa +# type: ignore +from __future__ import annotations + +import threading +from collections.abc import Iterable +from concurrent import futures +from typing import Callable, 
Generator, Literal + +import grpc +import pytest +from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( + ExportTraceServiceResponse) +from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( + TraceServiceServicer, add_TraceServiceServicer_to_server) +from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue +from opentelemetry.sdk.environment_variables import ( + OTEL_EXPORTER_OTLP_TRACES_INSECURE) + +from vllm import LLM, SamplingParams +from vllm.tracing import SpanAttributes + +FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" + +FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', + 'array_value'] + + +def decode_value(value: AnyValue): + field_decoders: dict[FieldName, Callable] = { + "bool_value": (lambda v: v.bool_value), + "string_value": (lambda v: v.string_value), + "int_value": (lambda v: v.int_value), + "double_value": (lambda v: v.double_value), + "array_value": + (lambda v: [decode_value(item) for item in v.array_value.values]), + } + for field, decoder in field_decoders.items(): + if value.HasField(field): + return decoder(value) + raise ValueError(f"Couldn't decode value: {value}") + + +def decode_attributes(attributes: Iterable[KeyValue]): + return {kv.key: decode_value(kv.value) for kv in attributes} + + +class FakeTraceService(TraceServiceServicer): + + def __init__(self): + self.request = None + self.evt = threading.Event() + + def Export(self, request, context): + self.request = request + self.evt.set() + return ExportTraceServiceResponse() + + +@pytest.fixture +def trace_service() -> Generator[FakeTraceService, None, None]: + """Fixture to set up a fake gRPC trace service""" + server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + service = FakeTraceService() + add_TraceServiceServicer_to_server(service, server) + server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) + server.start() + + yield service + + server.stop(None) + + +def test_traces( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") + m.setenv("VLLM_USE_V1", "1") + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM(model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + gpu_memory_utilization=0.3, + disable_log_stats=False) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) + print(f"test_traces outputs is : {outputs}") + + timeout = 10 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") + + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == 
sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba1543a8d5a30..9841876bc8e8c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1491,12 +1491,6 @@ class EngineArgs: recommend_to_remove=False) return False - # No OTLP observability so far. - if (self.otlp_traces_endpoint or self.collect_detailed_traces): - _raise_or_fallback(feature_name="--otlp-traces-endpoint", - recommend_to_remove=False) - return False - # V1 supports N-gram, Medusa, and Eagle speculative decoding. if (self.speculative_config is not None and self.speculative_config.get("method") == "draft_model"): diff --git a/vllm/tracing.py b/vllm/tracing.py index 6a287d82be5ff..7537e9901a044 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -119,6 +119,11 @@ class SpanAttributes: # forward, block/sync across workers, cpu-gpu sync time and sampling time. GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = ( "gen_ai.latency.time_in_model_execute") + GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \ + "gen_ai.latency.time_in_model_prefill" + GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode" + GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \ + "gen_ai.latency.time_in_model_inference" def contains_trace_headers(headers: Mapping[str, str]) -> bool: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d1a6dd73e85cd..aa45f6669207d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -969,9 +969,9 @@ class Scheduler(SchedulerInterface): stop_reason=request.stop_reason, events=request.take_events(), kv_transfer_params=kv_transfer_params, + trace_headers=request.trace_headers, num_cached_tokens=request.num_cached_tokens, )) - else: # Invariant: EngineCore returns no partial prefill outputs. assert not prompt_logprobs_tensors diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 5d8959a3cd3fe..dec4abec519bd 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Mapping from typing import Any, Optional, Union import msgspec @@ -66,6 +67,8 @@ class EngineCoreRequest( current_wave: int = 0 priority: int = 0 + trace_headers: Optional[Mapping[str, str]] = None + class EngineCoreEventType(enum.IntEnum): """The type of engine core request event.""" @@ -111,6 +114,7 @@ class EngineCoreOutput( events: Optional[list[EngineCoreEvent]] = None kv_transfer_params: Optional[dict[str, Any]] = None + trace_headers: Optional[Mapping[str, str]] = None # The number of tokens with prefix cache hits. 
num_cached_tokens: int = 0 @@ -144,7 +148,7 @@ class EngineCoreOutputs( omit_defaults=True, # type: ignore[call-arg] gc=False): # type: ignore[call-arg] - #NOTE(Nick): We could consider ways to make this more compact, + # NOTE(Nick): We could consider ways to make this more compact, # e.g. columnwise layout engine_index: int = 0 diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f57075c6fa82d..e467f40f6eab1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,6 +26,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask +from vllm.tracing import init_tracer from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -97,6 +98,7 @@ class AsyncLLM(EngineClient): self.model_config = vllm_config.model_config self.vllm_config = vllm_config + self.observability_config = vllm_config.observability_config self.log_requests = log_requests self.log_stats = log_stats or (stat_loggers is not None) @@ -124,6 +126,11 @@ class AsyncLLM(EngineClient): # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor(self.tokenizer, log_stats=self.log_stats) + if self.observability_config.otlp_traces_endpoint is not None: + tracer = init_tracer( + "vllm.llm_engine", + self.observability_config.otlp_traces_endpoint) + self.output_processor.tracer = tracer # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_async_mp_client( @@ -603,7 +610,7 @@ class AsyncLLM(EngineClient): return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: - return False + return self.observability_config.otlp_traces_endpoint is not None async def do_log_stats( self, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7130f666ef19f..fca5a783bc3bf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -19,6 +19,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask +from vllm.tracing import init_tracer from vllm.transformers_utils.tokenizer_group import ( TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext @@ -65,6 +66,7 @@ class LLMEngine: "Set VLLM_USE_V1=0 and file and issue on Github.") self.vllm_config = vllm_config + self.observability_config = vllm_config.observability_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -99,6 +101,11 @@ class LLMEngine: # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
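Taken together with the test above, the user-visible effect is that V1 engines now accept an OTLP endpoint directly; a minimal usage sketch (the collector address is illustrative, mirroring the fake gRPC collector in the test):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m",
          otlp_traces_endpoint="localhost:4317")  # gRPC OTLP collector
outputs = llm.generate(["This is a short prompt"],
                       SamplingParams(temperature=0.01, max_tokens=16))
# One "llm_request" span per request is exported, carrying the latency and
# sampling attributes exercised in tests/v1/tracing/test_tracing.py.
```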
self.output_processor = OutputProcessor(self.tokenizer, log_stats=self.log_stats) + if self.observability_config.otlp_traces_endpoint is not None: + tracer = init_tracer( + "vllm.llm_engine", + self.observability_config.otlp_traces_endpoint) + self.output_processor.tracer = tracer # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 2ee55b585da6c..02c8c61cb9093 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -11,6 +11,8 @@ import torch from vllm.outputs import (CompletionOutput, PoolingOutput, PoolingRequestOutput, RequestOutput) from vllm.sampling_params import RequestOutputKind +from vllm.tracing import (SpanAttributes, SpanKind, Tracer, + extract_trace_context) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason @@ -71,7 +73,6 @@ class RequestOutputCollector: @dataclass class OutputProcessorOutput: - request_outputs: list[Union[RequestOutput, PoolingRequestOutput]] reqs_to_abort: list[str] @@ -93,6 +94,9 @@ class RequestState: arrival_time: float, queue: Optional[RequestOutputCollector], log_stats: bool, + top_p: Optional[float] = None, + n: Optional[int] = None, + temperature: Optional[float] = None, ): self.request_id = request_id self.parent_req = parent_req @@ -105,6 +109,9 @@ class RequestState: self.logprobs_processor = logprobs_processor self.detokenizer = detokenizer self.max_tokens_param = max_tokens_param + self.top_p = top_p + self.n = n + self.temperature = temperature self.is_prefilling = True self.queue = queue self.num_cached_tokens = 0 @@ -137,10 +144,16 @@ class RequestState: request=request, ) max_tokens_param = sampling_params.max_tokens + top_p = sampling_params.top_p + n = sampling_params.n + temperature = sampling_params.temperature else: logprobs_processor = None detokenizer = None max_tokens_param = None + top_p = None + n = None + temperature = None assert request.pooling_params is not None output_kind = request.pooling_params.output_kind @@ -156,6 +169,9 @@ class RequestState: logprobs_processor=logprobs_processor, detokenizer=detokenizer, max_tokens_param=max_tokens_param, + top_p=top_p, + n=n, + temperature=temperature, arrival_time=request.arrival_time, queue=queue, log_stats=log_stats, @@ -274,16 +290,13 @@ class RequestState: class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" - def __init__( - self, - tokenizer: TokenizerGroup, - log_stats: bool, - ): + def __init__(self, tokenizer: TokenizerGroup, log_stats: bool): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: dict[str, RequestState] = {} self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates() + self.tracer: Optional[Tracer] = None def get_num_unfinished_requests(self): return len(self.request_states) @@ -441,7 +454,9 @@ class OutputProcessor: # Track per-request stats self._update_stats_from_finished(req_state, finish_reason, iteration_stats) - + if self.tracer: + self.do_tracing(engine_core_output, req_state, + iteration_stats) self.lora_states.update_iteration_stats(iteration_stats) return OutputProcessorOutput( @@ -449,6 +464,63 @@ class OutputProcessor: reqs_to_abort=reqs_to_abort, ) + def do_tracing(self, engine_core_output: EngineCoreOutput, + req_state: RequestState, 
+ iteration_stats: Optional[IterationStats]) -> None: + assert req_state.stats is not None + assert iteration_stats is not None + assert self.tracer is not None + + arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9) + trace_context = extract_trace_context(engine_core_output.trace_headers) + with (self.tracer.start_as_current_span( + "llm_request", + kind=SpanKind.SERVER, + context=trace_context, + start_time=arrival_time_nano_seconds) as span): + metrics = req_state.stats + e2e_time = iteration_stats.iteration_timestamp - \ + metrics.arrival_time + queued_time = metrics.scheduled_ts - metrics.queued_ts + prefill_time = metrics.first_token_ts - metrics.scheduled_ts + decode_time = metrics.last_token_ts - metrics.first_token_ts + inference_time = metrics.last_token_ts - metrics.scheduled_ts + span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, + metrics.first_token_latency) + span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time) + span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, + queued_time) + span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, + len(req_state.prompt_token_ids)) + span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, + metrics.num_generation_tokens) + span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, + prefill_time) + span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, + decode_time) + span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, + inference_time) + + # meta + span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, + req_state.request_id) + if req_state.top_p: + span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, + req_state.top_p) + if req_state.max_tokens_param: + span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, + req_state.max_tokens_param) + if req_state.temperature: + span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, + req_state.temperature) + if req_state.n: + span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, + req_state.n) + def _update_stats_from_output(self, req_state: RequestState, engine_core_output: EngineCoreOutput, engine_core_timestamp: Optional[float], diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 96e89eeac556a..a54a98adf56b5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -327,8 +327,6 @@ class Processor: # TODO(woosuk): Support pooling models. 
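The latency attributes `do_tracing()` sets above decompose cleanly; a worked example with illustrative timestamps (seconds) shows prefill and decode summing to inference time:

```python
arrival, queued, scheduled = 0.00, 0.00, 0.02
first_token, last_token, iteration_ts = 0.15, 0.90, 0.91

e2e_time = iteration_ts - arrival              # 0.91
queued_time = scheduled - queued               # 0.02
prefill_time = first_token - scheduled         # 0.13
decode_time = last_token - first_token         # 0.75
inference_time = last_token - scheduled        # 0.88
assert abs(inference_time - (prefill_time + decode_time)) < 1e-9
```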
self._validate_lora(lora_request) self._validate_params(params, lora_request) - if trace_headers is not None: - raise ValueError("V1 does not support tracing yet.") data_parallel_size = self.vllm_config.parallel_config.data_parallel_size if data_parallel_rank is not None and not (0 <= data_parallel_rank < @@ -435,6 +433,7 @@ class Processor: cache_salt=decoder_inputs.get("cache_salt"), priority=priority, data_parallel_rank=data_parallel_rank, + trace_headers=trace_headers, ) def _validate_model_inputs(self, diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 45c32aaaaf6c4..e6c344d193df2 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -68,6 +68,9 @@ class RequestStateStats: first_token_ts: float = 0.0 last_token_ts: float = 0.0 + # first token latency + first_token_latency: float = 0.0 + @dataclass class FinishedRequestStats: @@ -116,6 +119,7 @@ class IterationStats: first_token_latency = self._time_since(req_stats.arrival_time) self.time_to_first_tokens_iter.append(first_token_latency) + req_stats.first_token_latency = first_token_latency req_stats.num_generation_tokens += num_new_generation_tokens diff --git a/vllm/v1/request.py b/vllm/v1/request.py index ad7477241ebbd..64cce3e9efc51 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Mapping from functools import partial from typing import TYPE_CHECKING, Any, Callable, Optional, Union @@ -35,6 +36,7 @@ class Request: structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, priority: int = 0, + trace_headers: Optional[Mapping[str, str]] = None, block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] = None, ) -> None: @@ -100,7 +102,8 @@ class Request: # they should also be updated simultaneously. self.output_token_ids = ConstantList(self._output_token_ids) self.all_token_ids = ConstantList(self._all_token_ids) - + # trace_headers + self.trace_headers = trace_headers # State # The number of tokens with prefix cache hits. self.num_cached_tokens = -1 @@ -136,6 +139,7 @@ class Request: if request.sampling_params else None, cache_salt=request.cache_salt, priority=request.priority, + trace_headers=request.trace_headers, block_hasher=block_hasher, ) From 880c741bb6d755c1da25d51e10985b4fa06671bd Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Fri, 12 Sep 2025 09:16:43 +0800 Subject: [PATCH 206/584] [Bugfix] fixes the causal_conv1d_update kernel update non-speculative decoding cases (#24680) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- .../layers/mamba/ops/causal_conv1d.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 709794429c7b6..a0478a359f91b 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -720,15 +720,15 @@ def _causal_conv1d_update_kernel( # STEP 2: assume state_len > seqlen idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] - # The conv_state updates works in a sliding window manner, - # at each forward pass, the tokens are shift by 1, so we + # With speculative decoding, the conv_state updates works in a sliding + # window manner, at each forward pass, the tokens are shift by 1, so we # load since idx_tokens + 1. 
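The kernel-level change is easier to see against a plain-Python model of the state update (a sketch of the semantics, not the Triton code): the window advances by `seqlen` in regular decode and by 1 per step only under speculative decoding, matching the `(1 if IS_SPEC_DECODING else seqlen)` offset in the hunk below.

```python
import torch

def update_state(conv_state: torch.Tensor, new_tokens: torch.Tensor,
                 width: int) -> torch.Tensor:
    # Keep the trailing (width - 1) samples of [old_state, new_tokens].
    return torch.cat([conv_state, new_tokens])[-(width - 1):]

state = torch.zeros(3)                    # width = 4 -> state_len = 3
state = update_state(state, torch.tensor([1.0, 2.0]), width=4)
assert state.tolist() == [0.0, 1.0, 2.0]
```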
conv_state_ptrs_source = ( conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) + conv_state_token_offset * stride_conv_state_tok + (idx_feats * stride_conv_state_dim)[None, :] + - ((idx_tokens + 1) * stride_conv_state_tok)[:, None] - ) # [BLOCK_M, BLOCK_N] + ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) * + stride_conv_state_tok)[:, None]) # [BLOCK_M, BLOCK_N] mask = ((conv_state_batch_coord < num_cache_lines) & ((idx_tokens + seqlen) < state_len)[:, None] & (idx_feats < dim)[None, :]) @@ -924,7 +924,10 @@ def causal_conv1d_update( ) stride_state_indices = conv_state_indices.stride( 0) if conv_state_indices is not None else 0 - state_len = width - 1 + (seqlen - 1) # effective state_len needed + if num_accepted_tokens is not None: + state_len = width - 1 + (seqlen - 1) # effective state_len needed + else: + state_len = width - 1 np2_statelen = triton.next_power_of_2(state_len) def grid(META): From 12a8414d81e186c2db397d7f1ecb5c17e614c6bd Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Fri, 12 Sep 2025 10:06:26 +0800 Subject: [PATCH 207/584] [Qwen3-Next] MoE configs for H20 TP=1,2,4,8 (#24707) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- ...E=512,N=128,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ ...E=512,N=256,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ ...E=512,N=512,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ .../E=512,N=64,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ 4 files changed, 584 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..9f4c3cbc9b8a9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..68649395a23ed --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..a68c83147eeb3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..5dd1a8e19c2ce --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} From 6a50eaa0d3858dd19a142746fce7181813c5d632 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 11 Sep 2025 23:02:53 -0400 Subject: [PATCH 208/584] [DOCs] Update ROCm installation docs section (#24691) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- .../installation/gpu/rocm.inc.md | 40 +++++++------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 4c70128d0b49a..37c6647929b51 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ 
b/docs/getting_started/installation/gpu/rocm.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM supports AMD GPUs with ROCm 6.3. +vLLM supports AMD GPUs with ROCm 6.3 or above. !!! tip [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm. @@ -11,8 +11,9 @@ vLLM supports AMD GPUs with ROCm 6.3. # --8<-- [end:installation] # --8<-- [start:requirements] -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) -- ROCm 6.3 +- GPU: MI200s (gfx90a), MI300 (gfx942), MI350 (gfx950), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) +- ROCm 6.3 or above + - MI350 requires ROCm 7.0 or above # --8<-- [end:requirements] # --8<-- [start:set-up-using-python] @@ -32,35 +33,35 @@ Currently, there are no pre-built ROCm wheels. - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) - [PyTorch](https://pytorch.org/) - For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3. + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.4.3_ubuntu24.04_py3.12_pytorch_release_2.6.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3. Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example: ```bash # Install PyTorch pip uninstall torch -y - pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 + pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/rocm6.4 ``` -1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) +1. Install [Triton for ROCm](https://github.com/triton-lang/triton) - Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + Install ROCm's Triton (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) ```bash python3 -m pip install ninja cmake wheel pybind11 pip uninstall -y triton - git clone https://github.com/OpenAI/triton.git + git clone https://github.com/triton-lang/triton.git cd triton git checkout e5be006 - cd python - pip3 install . + if [ ! -f setup.py ]; then cd python; fi + python3 setup.py install cd ../.. ``` !!! note If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention) +2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/Dao-AILab/flash-attention) Install ROCm's flash attention (v2.7.2) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention#amd-rocm-support) Alternatively, wheels intended for vLLM use can be accessed under the releases. @@ -68,9 +69,9 @@ Currently, there are no pre-built ROCm wheels. For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. 
```bash - git clone https://github.com/ROCm/flash-attention.git + git clone https://github.com/Dao-AILab/flash-attention.git cd flash-attention - git checkout b7d29fb + git checkout 1a7f4dfa git submodule update --init GPU_ARCHS="gfx90a" python3 setup.py install cd .. @@ -194,16 +195,6 @@ To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . ``` -To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: - -```bash -DOCKER_BUILDKIT=1 docker build \ - --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ - -f docker/Dockerfile.rocm \ - -t vllm-rocm \ - . -``` - To run the above docker image `vllm-rocm`, use the below command: ??? console "Command" @@ -218,8 +209,7 @@ To run the above docker image `vllm-rocm`, use the below command: --device /dev/kfd \ --device /dev/dri \ -v <path/to/model>:/app/model \ - vllm-rocm \ - bash + vllm-rocm ``` Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models. From e090b7b45b015e9a4158c6676da802e04f1d88f1 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser <mbayser@br.ibm.com> Date: Fri, 12 Sep 2025 00:30:41 -0300 Subject: [PATCH 209/584] Enable conversion of multimodal models to pooling tasks (#24451) Signed-off-by: Max de Bayser <mbayser@br.ibm.com> --- .../pooling/test_mm_classifier_conversion.py | 114 ++++++++++++ vllm/entrypoints/llm.py | 174 +++++++++++------- vllm/model_executor/model_loader/utils.py | 18 +- vllm/model_executor/models/adapters.py | 45 ++++- vllm/model_executor/models/gemma3_mm.py | 6 +- 5 files changed, 282 insertions(+), 75 deletions(-) create mode 100644 tests/models/language/pooling/test_mm_classifier_conversion.py diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py new file mode 100644 index 0000000000000..166b953de43e7 --- /dev/null +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.platforms import current_platform + + +def test_idefics_multimodal( + vllm_runner, + monkeypatch, +) -> None: + if current_platform.is_rocm(): + # ROCm Triton FA does not currently support sliding window attention + # switch to use ROCm CK FA backend + monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3", + runner="pooling", + task="classify", + convert="classify", + load_format="dummy", + max_model_len=512, + enforce_eager=True, + tensor_parallel_size=1, + disable_log_stats=True, + dtype="bfloat16") as vllm_model: + llm = vllm_model.get_llm() + outputs = llm.classify(prompts) + for output in outputs: + assert len(output.outputs.probs) == 2 + + +def update_config(config): + config.text_config.update({ + "architectures": ["Gemma3ForSequenceClassification"], + "classifier_from_token": ["A", "B", "C", "D", "E"], + "method": + "no_post_processing", + "id2label": { + "A": "Chair", + "B": "Couch", + "C": "Table", + "D": "Bed", + "E": "Cupboard" + }, + }) + return config + + +def test_gemma_multimodal( + vllm_runner, + monkeypatch, +) -> None: + if current_platform.is_rocm(): + # ROCm Triton FA does 
not currently support sliding window attention + # switch to use ROCm CK FA backend + monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") + + messages = [{ + "role": + "system", + "content": + """ + You are a helpful assistant. You will be given a product description + which may also include an image. Classify the following product into + one of the categories: + + A = chair + B = couch + C = table + D = bed + E = cupboard + + You'll answer with exactly one letter (A, B, C, D, or E).""" + }, { + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": + "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg" + } + }, { + "type": "text", + "text": "A fine 19th century piece of furniture." + }] + }] + + with vllm_runner(model_name="google/gemma-3-4b-it", + runner="pooling", + task="classify", + convert="classify", + load_format="auto", + hf_overrides=update_config, + override_pooler_config={"pooling_type": "LAST"}, + max_model_len=512, + enforce_eager=True, + tensor_parallel_size=1, + disable_log_stats=True, + dtype="bfloat16") as vllm_model: + + llm = vllm_model.get_llm() + prompts = llm.preprocess_chat(messages) + + result = llm.classify(prompts) + assert result[0].outputs.probs[0] > 0.95 + assert all(c < 0.05 for c in result[0].outputs.probs[1:]) \ No newline at end of file diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a6174161f115a..94749eb884512 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -703,6 +703,106 @@ class LLM: return outputs + def preprocess_chat( + self, + messages: Union[list[ChatCompletionMessageParam], + list[list[ChatCompletionMessageParam]]], + lora_request: Optional[LoRARequest] = None, + chat_template: Optional[str] = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tools: Optional[list[dict[str, Any]]] = None, + chat_template_kwargs: Optional[dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, + ) -> list[TokensPrompt]: + """ + Generate prompt for a chat conversation. The pre-processed + prompt can then be used as input for the other LLM methods. + + Refer to `chat` for a complete description of the arguments. + Returns: + A list of `TokensPrompts` objects containing the tokenized + prompt after chat template interpolation, and the + pre-processed multi-modal inputs. + """ + list_of_messages: list[list[ChatCompletionMessageParam]] + + # Handle multi and single conversations + if is_list_of(messages, list): + # messages is list[list[...]] + list_of_messages = cast(list[list[ChatCompletionMessageParam]], + messages) + else: + # messages is list[...] 
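A quick sketch of the normalization this branch performs: a single conversation (a list of message dicts) is wrapped into a batch of one, so the rest of the method handles both call styles uniformly. This is a standalone approximation of the `is_list_of` check used above:

```python
from typing import Any

def normalize(messages: list[Any]) -> list[list[Any]]:
    # A batch is a list of conversations; a single conversation is a
    # list of message dicts, so inspect the first element's type.
    if messages and isinstance(messages[0], list):
        return messages
    return [messages]

assert normalize([{"role": "user", "content": "hi"}]) == \
    [[{"role": "user", "content": "hi"}]]
```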
+ list_of_messages = [ + cast(list[ChatCompletionMessageParam], messages) + ] + + tokenizer = self.get_tokenizer(lora_request) + model_config = self.llm_engine.get_model_config() + resolved_content_format = resolve_chat_template_content_format( + chat_template, + tools, + chat_template_content_format, + tokenizer, + model_config=model_config, + ) + + _chat_template_kwargs: dict[str, Any] = dict( + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tools, + ) + _chat_template_kwargs.update(chat_template_kwargs or {}) + + prompts: list[TokensPrompt] = [] + + for msgs in list_of_messages: + # NOTE: _parse_chat_message_content_parts() currently doesn't + # handle mm_processor_kwargs, since there is no implementation in + # the chat message parsing for it. + conversation, mm_data, mm_uuids = parse_chat_messages( + msgs, + model_config, + tokenizer, + content_format=resolved_content_format, + ) + + if isinstance(tokenizer, MistralTokenizer): + prompt_token_ids = apply_mistral_chat_template( + tokenizer, + messages=msgs, + **_chat_template_kwargs, + ) + else: + prompt_str = apply_hf_chat_template( + tokenizer=tokenizer, + conversation=conversation, + model_config=model_config, + **_chat_template_kwargs, + ) + # Special tokens are already included in chat templates so + # should not be added by the tokenizer in this case. + prompt_token_ids = tokenizer.encode(prompt_str, + add_special_tokens=False) + + prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) + + if mm_data is not None: + prompt["multi_modal_data"] = mm_data + + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + + if mm_processor_kwargs is not None: + prompt["mm_processor_kwargs"] = mm_processor_kwargs + + prompts.append(prompt) + + return prompts + def chat( self, messages: Union[list[ChatCompletionMessageParam], @@ -769,80 +869,18 @@ class LLM: A list of `RequestOutput` objects containing the generated responses in the same order as the input messages. """ - list_of_messages: list[list[ChatCompletionMessageParam]] - # Handle multi and single conversations - if is_list_of(messages, list): - # messages is list[list[...]] - list_of_messages = cast(list[list[ChatCompletionMessageParam]], - messages) - else: - # messages is list[...] - list_of_messages = [ - cast(list[ChatCompletionMessageParam], messages) - ] - - tokenizer = self.get_tokenizer(lora_request) - model_config = self.llm_engine.get_model_config() - resolved_content_format = resolve_chat_template_content_format( - chat_template, - tools, - chat_template_content_format, - tokenizer, - model_config=model_config, - ) - - _chat_template_kwargs: dict[str, Any] = dict( + prompts = self.preprocess_chat( + messages=messages, + lora_request=lora_request, chat_template=chat_template, + chat_template_content_format=chat_template_content_format, add_generation_prompt=add_generation_prompt, continue_final_message=continue_final_message, tools=tools, + chat_template_kwargs=chat_template_kwargs, + mm_processor_kwargs=mm_processor_kwargs, ) - _chat_template_kwargs.update(chat_template_kwargs or {}) - - prompts: list[Union[TokensPrompt, TextPrompt]] = [] - - for msgs in list_of_messages: - # NOTE: _parse_chat_message_content_parts() currently doesn't - # handle mm_processor_kwargs, since there is no implementation in - # the chat message parsing for it. 
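The point of factoring `preprocess_chat` out of `chat` shows up in the test above: pooling entrypoints such as `classify`, which accept only prompts, can now consume chat conversations. A minimal usage sketch (model name, message, and engine flags are illustrative):

```python
from vllm import LLM

llm = LLM(model="google/gemma-3-4b-it",
          runner="pooling",
          convert="classify")

messages = [{"role": "user", "content": "Classify: a fine oak chair."}]

# Apply the chat template and tokenize once, producing TokensPrompt
# objects with any pre-processed multi-modal inputs attached...
prompts = llm.preprocess_chat(messages)

# ...then feed them to any prompt-based entrypoint.
results = llm.classify(prompts)
print(results[0].outputs.probs)
```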
- conversation, mm_data, mm_uuids = parse_chat_messages( - msgs, - model_config, - tokenizer, - content_format=resolved_content_format, - ) - - if isinstance(tokenizer, MistralTokenizer): - prompt_token_ids = apply_mistral_chat_template( - tokenizer, - messages=msgs, - **_chat_template_kwargs, - ) - else: - prompt_str = apply_hf_chat_template( - tokenizer=tokenizer, - conversation=conversation, - model_config=model_config, - **_chat_template_kwargs, - ) - # Special tokens are already included in chat templates so - # should not be added by the tokenizer in this case. - prompt_token_ids = tokenizer.encode(prompt_str, - add_special_tokens=False) - - prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) - - if mm_data is not None: - prompt["multi_modal_data"] = mm_data - - if mm_uuids is not None: - prompt["multi_modal_uuids"] = mm_uuids - - if mm_processor_kwargs is not None: - prompt["mm_processor_kwargs"] = mm_processor_kwargs - - prompts.append(prompt) return self.generate( prompts, diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index c82fa5a40aa53..0c2441a6db44d 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -19,10 +19,11 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.model_executor.models.adapters import (as_embedding_model, - as_reward_model, - as_seq_cls_model) -from vllm.model_executor.models.interfaces import SupportsQuant +from vllm.model_executor.models.adapters import ( + as_embedding_model, as_reward_model, as_seq_cls_model, + try_create_mm_pooling_model_cls) +from vllm.model_executor.models.interfaces import (SupportsQuant, + supports_multimodal) from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -183,6 +184,15 @@ def get_model_architecture( "performance may not be optimal.", arch) convert_type = model_config.convert_type + if convert_type != "none" and supports_multimodal(model_cls): + logger.debug_once("Detected conversion of Multi Modal model.") + converted = try_create_mm_pooling_model_cls(model_cls) + if converted is not None: + logger.debug_once("Creating wrapper class to forward pooler.") + return converted, arch + else: + logger.debug_once("Attempting direct conversion.") + if convert_type == "none": pass elif convert_type == "embed": diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 78ad9a433e314..c4328a176a5de 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import inspect from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.models.config import VerifyAndUpdateConfig @@ -129,6 +132,41 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str: return model_name + pooling_suffix +def try_create_mm_pooling_model_cls(orig_cls: _T) -> _T: + + class CallVisitor(ast.NodeVisitor): + + def __init__(self): + self.calls = [] + + def visit_Call(self, node): + if 
isinstance(node.func, ast.Name): + self.calls.append(node.func.id) + self.generic_visit(node) + + visitor = CallVisitor() + visitor.visit(ast.parse(inspect.getsource(orig_cls))) + if "init_vllm_registered_model" not in visitor.calls: + return None + + class ModelForPooling(orig_cls, VllmModelForPooling): + + is_pooling_model = True + + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + self.pooler = self.get_language_model().pooler + + return ModelForPooling # type: ignore + + def _create_pooling_model_cls(orig_cls: _T) -> _T: # Lazy import from .utils import AutoWeightsLoader, WeightsMapper @@ -399,6 +437,7 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.models.utils import AutoWeightsLoader model_config = model.vllm_config.model_config + tokens = getattr(model.config, "classifier_from_token", []) tokens = cast(list[int], tokens) assert len(tokens) == 2 @@ -406,9 +445,10 @@ def load_weights_using_from_2_way_softmax( if model.config.tie_word_embeddings: model.lm_head = model.model.embed_tokens else: + quant_config = model.vllm_config.quant_config model.lm_head = ParallelLMHead(model.config.vocab_size, model.config.hidden_size, - quant_config=model.quant_config) + quant_config=quant_config) loader = AutoWeightsLoader(model) loaded_weights = loader.load_weights(weights) @@ -452,9 +492,10 @@ def load_weights_no_post_processing(model, if model.config.tie_word_embeddings: model.lm_head = model.model.embed_tokens else: + quant_config = model.vllm_config.quant_config model.lm_head = ParallelLMHead(model.config.vocab_size, model.config.hidden_size, - quant_config=model.quant_config) + quant_config=quant_config) loader = AutoWeightsLoader(model) loaded_weights = loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index f3dc7dde46bdf..e652ba2f1c7fe 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -512,7 +512,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, architectures=["Gemma3ForCausalLM"], ) logit_scale = getattr(config, "logit_scale", 1.0) - self.language_model.logits_processor.scale *= logit_scale + + if hasattr(self.language_model, "logits_processor"): + # The logits processor can be unset if we're using + # automatic conversion to pooling model. 
+ self.language_model.logits_processor.scale *= logit_scale self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) From ddcec289c7f3da664aa8b74f122d0330d17da3bc Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Thu, 11 Sep 2025 23:35:48 -0500 Subject: [PATCH 210/584] Fix implementation divergence for BLOOM models between vLLM and HuggingFace when using prompt embeds (#24686) Signed-off-by: Andrew Sansom <andrew@protopia.ai> --- tests/models/language/generation/test_common.py | 6 ++---- vllm/model_executor/models/bloom.py | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 6fc8f1301fdb9..062258930fe15 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os from typing import Optional import pytest @@ -99,9 +98,10 @@ AITER_MODEL_LIST = [ @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +@pytest.mark.parametrize("use_prompt_embeds", [True, False]) def test_models(hf_runner, vllm_runner, example_prompts, model: str, max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, - monkeypatch) -> None: + use_prompt_embeds: bool, monkeypatch) -> None: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -119,8 +119,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") - use_prompt_embeds = os.getenv("VLLM_USE_V1") == "0" - with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 13ecda0122be6..f8ed92314c3d2 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -257,7 +257,7 @@ class BloomModel(nn.Module): config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.word_embeddings_layernorm(self.word_embeddings(input_ids)) + return self.word_embeddings(input_ids) def forward( self, @@ -271,6 +271,7 @@ class BloomModel(nn.Module): hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) + hidden_states = self.word_embeddings_layernorm(hidden_states) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] From 7920de0a2af7c663efae29856f1bac061a553b58 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Fri, 12 Sep 2025 12:56:31 +0800 Subject: [PATCH 211/584] [Bugfix] Fix MRoPE dispatch on CPU (#24712) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/model_executor/layers/rotary_embedding/mrope.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 0acb5ea742455..27ca010cdc3ae 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -300,6 +300,15 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), 
dim=-1).reshape(key_shape) return query, key + def forward_cpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(positions, query, key, offsets) + @classmethod def get_input_positions( cls, From f592b3174b39a7aeac52432d66d76e89ff0a80b4 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Thu, 11 Sep 2025 23:35:04 -0700 Subject: [PATCH 212/584] [BugFix] Fix Qwen3-Next PP (#24709) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/model_executor/models/qwen3_next.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 55c16c462885d..86e26da5b9b86 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Qwen3Next model.""" from collections.abc import Iterable +from itertools import islice from typing import Optional import torch @@ -917,8 +918,11 @@ class Qwen3NextModel(nn.Module): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - self.norm = Qwen3NextRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + if get_pp_group().is_last_rank: + self.norm = Qwen3NextRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -941,7 +945,7 @@ class Qwen3NextModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers: + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, From 561a0baee0503f50a36751568c9653fe3bd5c3eb Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Fri, 12 Sep 2025 00:49:09 -0700 Subject: [PATCH 213/584] [CI] Fix flaky test v1/worker/test_gpu_model_runner.py::test_kv_cache_stride_order (#24640) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- tests/v1/worker/test_gpu_model_runner.py | 39 ++++++++++++------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 6d99029e404ef..8e27c14b7405b 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import random - import numpy as np import pytest import torch @@ -409,29 +407,30 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): model_runner.model_config.get_head_size() ] # TODO mla test - default_stride = list(range(5)) + default_stride = tuple(range(5)) # Permutation that gets you back to expected kv shape - rnd_stride = tuple(random.sample(default_stride, len(default_stride))) + for test_stride in ((1, 4, 0, 2, 3), (0, 1, 2, 3, 4)): - def rnd_stride_order(): - return rnd_stride + def rnd_stride_order(test_stride=test_stride): + return test_stride - # Patch the attention backend class and re-trigger the KV cache creation. 
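The permuted-layout behavior the rewritten test asserts can be reproduced in isolation: allocate in stride order, permute back to the logical shape, and the resulting view is non-contiguous. A small sketch (sizes are illustrative; the stride order is one of the two the test exercises):

```python
import torch

# Logical kv cache shape: (2, num_blocks, block_size, num_kv_heads, head_size)
default_shape = (2, 4, 16, 8, 64)
stride_order = (1, 4, 0, 2, 3)

# Allocate in the permuted physical layout, then permute back so the
# logical shape is unchanged while the memory layout differs.
inv = [stride_order.index(i) for i in range(len(stride_order))]
physical = torch.empty([default_shape[i] for i in stride_order])
logical = physical.permute(inv)

assert tuple(logical.shape) == default_shape
assert not logical.is_contiguous()
```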
- for attn_group in model_runner._attn_group_iterator(): - attn_backend = attn_group.backend - monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", - rnd_stride_order) + # Patch the attention backend class and re-trigger the KV cache creation + for attn_group in model_runner._attn_group_iterator(): + attn_backend = attn_group.backend + monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", + rnd_stride_order) - model_runner.attn_groups = [] - model_runner.initialize_kv_cache(model_runner.kv_cache_config) + model_runner.attn_groups = [] + model_runner.kv_caches = [] + model_runner.initialize_kv_cache(model_runner.kv_cache_config) - # Shape is unchanged, but layout may differ - kv_cache_shape = model_runner.kv_caches[0].shape - assert list(kv_cache_shape) == expected_kv_cache_shape - if default_stride == rnd_stride: - assert all(kv.is_contiguous() for kv in model_runner.kv_caches) - else: - assert all(not kv.is_contiguous() for kv in model_runner.kv_caches) + # Shape is unchanged, but layout may differ + kv_cache_shape = model_runner.kv_caches[0].shape + assert list(kv_cache_shape) == expected_kv_cache_shape + if default_stride == test_stride: + assert all(kv.is_contiguous() for kv in model_runner.kv_caches) + else: + assert all(not kv.is_contiguous() for kv in model_runner.kv_caches) def test_update_config(model_runner): From d21a36f5f9569949dd6c313deed609d77e393850 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Fri, 12 Sep 2025 16:52:25 +0800 Subject: [PATCH 214/584] [CI] Add ci_envs for convenient local testing (#24630) Signed-off-by: wang.yuqi <noooop@126.com> --- tests/ci_envs.py | 45 +++++++++++++++++++ .../language/generation_ppl_test/ppl_utils.py | 19 +++++--- .../language/pooling_mteb_test/mteb_utils.py | 39 ++++++++++++---- vllm/config/__init__.py | 13 ++++-- 4 files changed, 98 insertions(+), 18 deletions(-) create mode 100644 tests/ci_envs.py diff --git a/tests/ci_envs.py b/tests/ci_envs.py new file mode 100644 index 0000000000000..d16ecce1ef8dd --- /dev/null +++ b/tests/ci_envs.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +These envs only work for a small part of the tests, fix what you need! +""" + +import os +from typing import TYPE_CHECKING, Any, Callable, Optional + +if TYPE_CHECKING: + VLLM_CI_NO_SKIP: bool = False + VLLM_CI_DTYPE: Optional[str] = None + VLLM_CI_HEAD_DTYPE: Optional[str] = None + VLLM_CI_HF_DTYPE: Optional[str] = None + +environment_variables: dict[str, Callable[[], Any]] = { + # A model family has many models with the same architecture. + # By default, a model family tests only one model. + # Through this flag, all models can be tested. 
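With this module in place, a test reads the toggles as plain attributes; each access re-evaluates `os.environ`, so exporting a variable before `pytest` takes effect without any re-import. A sketch of typical use (the fallback dtype is illustrative):

```python
import tests.ci_envs as ci_envs

# Falls back to the per-model default when the env var is unset.
dtype = ci_envs.VLLM_CI_DTYPE or "bfloat16"

if ci_envs.is_set("VLLM_CI_NO_SKIP"):
    print("running every model in the family, not just the default one")
```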
+ "VLLM_CI_NO_SKIP": lambda: bool(int(os.getenv("VLLM_CI_NO_SKIP", "0"))), + # Allow changing the dtype used by vllm in tests + "VLLM_CI_DTYPE": lambda: os.getenv("VLLM_CI_DTYPE", None), + # Allow changing the head dtype used by vllm in tests + "VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None), + # Allow changing the head dtype used by transformers in tests + "VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None), +} + + +def __getattr__(name: str): + # lazy evaluation of environment variables + if name in environment_variables: + return environment_variables[name]() + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + return list(environment_variables.keys()) + + +def is_set(name: str): + """Check if an environment variable is explicitly set.""" + if name in environment_variables: + return name in os.environ + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py index 550e874cf8579..6225bbe3377bd 100644 --- a/tests/models/language/generation_ppl_test/ppl_utils.py +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -7,6 +7,7 @@ import pytest import torch from datasets import load_dataset +import tests.ci_envs as ci_envs from tests.models.utils import (GenerateModelInfo, TokensTextLogprobsPromptLogprobs) from vllm.logprobs import Logprob @@ -26,19 +27,26 @@ def wikitext_ppl_test(hf_runner, # A model family has many models with the same architecture, # and we don't need to test each one. - if not model_info.enable_test: + if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test: pytest.skip("Skipping test.") dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") # Allow vllm to test using the given dtype, such as float32 vllm_extra_kwargs = vllm_extra_kwargs or {} - vllm_extra_kwargs["dtype"] = model_info.dtype + vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype # Allow vllm to test using hf_overrides if model_info.hf_overrides is not None: vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + # Allow changing the head dtype used by vllm in tests + if ci_envs.VLLM_CI_HEAD_DTYPE is not None: + if "hf_overrides" not in vllm_extra_kwargs: + vllm_extra_kwargs["hf_overrides"] = {} + vllm_extra_kwargs["hf_overrides"][ + "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE + with vllm_runner(model_info.name, gpu_memory_utilization=0.7, max_model_len=max_length, @@ -46,7 +54,7 @@ def wikitext_ppl_test(hf_runner, enforce_eager=True, **vllm_extra_kwargs) as vllm_model: # Use max_num_seqs=1 to avoid OOM, - # and batch different requests together. + # and avoid batch different requests together. 
model_config = vllm_model.llm.llm_engine.model_config @@ -91,12 +99,13 @@ def wikitext_ppl_test(hf_runner, n_tokens += len(token_log_probs) vllm_ppl = float(torch.exp(nll_sum / n_tokens)) vllm_dtype = model_config.dtype + head_dtype = model_config.head_dtype # Accelerate ppl test by setting Transformers ppl score to a constant if model_info.hf_ppl is None: with hf_runner( model_info.name, - dtype=model_info.hf_dtype, + dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype, ) as hf_model: nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu") n_tokens = 0 @@ -121,7 +130,7 @@ def wikitext_ppl_test(hf_runner, differ = (vllm_ppl - hf_ppl) / hf_ppl print("Model:", model_info.name) - print("VLLM:", vllm_dtype, vllm_ppl) + print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_ppl) print("Transformers:", hf_dtype, hf_ppl) print("Difference (%):", differ * 100) diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index 56a105e96e5ee..7b3c02fbbd9f8 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -11,6 +11,7 @@ import pytest import requests import torch +import tests.ci_envs as ci_envs from tests.models.utils import (EmbedModelInfo, RerankModelInfo, check_embeddings_close) @@ -168,7 +169,7 @@ def mteb_test_embed_models(hf_runner, atol=MTEB_EMBED_TOL): # A model family has many models with the same architecture, # and we don't need to test each one. - if not model_info.enable_test: + if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test: pytest.skip("Skipping test.") # Test embed_dims, isnan and whether to use normalize @@ -176,12 +177,19 @@ def mteb_test_embed_models(hf_runner, # Allow vllm to test using the given dtype, such as float32 vllm_extra_kwargs = vllm_extra_kwargs or {} - vllm_extra_kwargs["dtype"] = model_info.dtype + vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype # Allow vllm to test using hf_overrides if model_info.hf_overrides is not None: vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + # Allow changing the head dtype used by vllm in tests + if ci_envs.VLLM_CI_HEAD_DTYPE is not None: + if "hf_overrides" not in vllm_extra_kwargs: + vllm_extra_kwargs["hf_overrides"] = {} + vllm_extra_kwargs["hf_overrides"][ + "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE + with vllm_runner(model_info.name, runner="pooling", max_model_len=None, @@ -202,6 +210,7 @@ def mteb_test_embed_models(hf_runner, vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype + head_dtype = model_config.head_dtype # Test embed_dims, isnan and whether to use normalize vllm_outputs = vllm_model.embed(example_prompts, @@ -211,9 +220,11 @@ def mteb_test_embed_models(hf_runner, # Accelerate mteb test by setting # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: - with hf_runner(model_info.name, - is_sentence_transformer=True, - dtype=model_info.hf_dtype) as hf_model: + with hf_runner( + model_info.name, + is_sentence_transformer=True, + dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype, + ) as hf_model: # e.g. 
setting default parameters for the encode method of hf_runner if hf_model_callback is not None: @@ -236,7 +247,8 @@ def mteb_test_embed_models(hf_runner, st_dtype = "Constant" print("Model:", model_info.name) - print("VLLM:", vllm_dtype, vllm_main_score) + print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", + vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) @@ -319,17 +331,24 @@ def mteb_test_rerank_models(hf_runner, atol=MTEB_RERANK_TOL): # A model family has many models with the same architecture, # and we don't need to test each one. - if not model_info.enable_test: + if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test: pytest.skip("Skipping test.") # Allow vllm to test using the given dtype, such as float32 vllm_extra_kwargs = vllm_extra_kwargs or {} - vllm_extra_kwargs["dtype"] = model_info.dtype + vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype # Allow vllm to test using hf_overrides if model_info.hf_overrides is not None: vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + # Allow changing the head dtype used by vllm in tests + if ci_envs.VLLM_CI_HEAD_DTYPE is not None: + if "hf_overrides" not in vllm_extra_kwargs: + vllm_extra_kwargs["hf_overrides"] = {} + vllm_extra_kwargs["hf_overrides"][ + "head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE + with vllm_runner(model_info.name, runner="pooling", max_model_len=None, @@ -355,6 +374,7 @@ def mteb_test_rerank_models(hf_runner, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS) vllm_dtype = model_config.dtype + head_dtype = model_config.head_dtype # Accelerate mteb test by setting # SentenceTransformers mteb score to a constant @@ -366,7 +386,8 @@ def mteb_test_rerank_models(hf_runner, st_dtype = "Constant" print("Model:", model_info.name) - print("VLLM:", vllm_dtype, vllm_main_score) + print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", + vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 8026d4c9e202c..ee58802766c4c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1775,16 +1775,21 @@ class ModelConfig: such as the lm_head in a generation model, or the score or classifier in a classification model. - The default head_dtype based on runner_type.\n + `head_dtype` currently only supports pooling models.\n - The pooling model defaults to using fp32 head, - you can use --hf-overrides '{"head_dtype": "model"}' to disable it.\n - - The generate model defaults to not using fp32 head, - you can use --hf-overrides '{"head_dtype": "float32"}' to enable it. + you can use --hf-overrides '{"head_dtype": "model"}' to disable it. """ + head_dtype = _get_head_dtype(config=self.hf_config, dtype=self.dtype, runner_type=self.runner_type) + if self.runner_type != "pooling" and head_dtype != self.dtype: + logger.warning_once( + "`head_dtype` currently only supports pooling models." 
+ "fallback to model dtype [%s].", self.dtype) + return self.dtype + if head_dtype not in current_platform.supported_dtypes: logger.warning_once( "The current platform does not support [%s] head dtype, " From 59d5d2c736028d8a67f230149ad6470adb1e71e5 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Fri, 12 Sep 2025 18:51:01 +0800 Subject: [PATCH 215/584] [CI/Build] Skip prompt embeddings tests on V1-only CPU backend (#24721) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- tests/models/language/generation/test_common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 062258930fe15..a39f24c80f1c5 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -119,6 +119,12 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") + # Note: can be removed when + # https://github.com/vllm-project/vllm/pull/24278 finished + if current_platform.is_cpu() and use_prompt_embeds: + pytest.skip("Skipping use_prompt_embeds=True with " + "V1-only CPU backend.") + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) From 64d90c3e4fe2a0e4395b8c94344dcdf78fa4cd22 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Fri, 12 Sep 2025 03:57:07 -0700 Subject: [PATCH 216/584] [Misc][gpt-oss] Add gpt-oss label to PRs that mention harmony or related to builtin tool call (#24717) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .github/mergify.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index befad23da8664..f2dd2e06214ae 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -124,9 +124,16 @@ pull_request_rules: - or: - files~=^examples/.*gpt[-_]?oss.*\.py - files~=^tests/.*gpt[-_]?oss.*\.py + - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py + - files~=^tests/entrypoints/test_context.py - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py + - files~=^vllm/entrypoints/harmony_utils.py + - files~=^vllm/entrypoints/tool_server.py + - files~=^vllm/entrypoints/tool.py + - files~=^vllm/entrypoints/context.py - title~=(?i)gpt[-_]?oss + - title~=(?i)harmony actions: label: add: From 60a095192499bf10b588b7e34e6d3cba6861f01e Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Fri, 12 Sep 2025 19:12:01 +0800 Subject: [PATCH 217/584] [Bugfix] Fix BNB name match (#24735) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- .../model_loader/bitsandbytes_loader.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 9c34159f9a269..4edf193b54ac5 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -326,7 +326,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): global_tp_size = get_tensor_model_parallel_world_size() global_tp_rank = get_tensor_model_parallel_rank() - + check_match = (lambda weight_name, module_name: weight_name. 
+ removesuffix(".weight") == module_name) for ( org_weight_name, mapped_weight_name, @@ -347,12 +348,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): ) and mapped_weight_name.endswith(".weight"): # Without sharding if any( - mapped_weight_name.startswith(module) + check_match(mapped_weight_name, module) for module in self.unsharded_weights_modules): weight_sub_tensor = weight_tensor # Shard by column elif any( - mapped_weight_name.startswith(module) + check_match(mapped_weight_name, module) for module in self.column_sharded_weights_modules): total_size = weight_tensor.size(-1) start_index = total_size // tp_size * tp_rank @@ -362,14 +363,14 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Weights have fused on disk. In this case, we assume that the # weight and module use same name. elif any( - mapped_weight_name.startswith(module) + check_match(mapped_weight_name, module) for module in self.maybe_fused_weights_modules): # special case for fused weights # get the size of each shard weight tensor total_shard_sizes = next( (sizes for module, sizes in self.maybe_fused_weights_modules.items() - if mapped_weight_name.startswith(module))) + if check_match(mapped_weight_name, module))) total_size = weight_tensor.size(0) assert total_size == sum(total_shard_sizes) # get the start/end index of each shard weight tensor From 7a1c4025f1e2879fa398888d70c596e5818026cb Mon Sep 17 00:00:00 2001 From: Ignacio Sica <mignacio.sica@gmail.com> Date: Fri, 12 Sep 2025 08:23:07 -0300 Subject: [PATCH 218/584] [Kernel] [CPU] refactor `cpu_attn.py:_run_sdpa_forward` for better memory access (#24701) Signed-off-by: ignaciosica <mignacio.sica@gmail.com> --- vllm/v1/attention/backends/cpu_attn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index ab87f3bb4e3cb..6627164c98798 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -641,10 +641,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): attn_metadata: TorchSDPAMetadata, attn_type: str = AttentionType.DECODER, ) -> None: - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, dim=1) - attn_masks = attn_metadata.get_attn_bias(attn_type) if attn_masks is None: if self.alibi_slopes is not None: @@ -665,6 +661,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): key = key.movedim(0, key.dim() - 2) value = value.movedim(0, value.dim() - 2) + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=-3) + value = value.repeat_interleave(self.num_queries_per_kv, dim=-3) + causal_attn = (attn_type == AttentionType.DECODER) seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type) From fdb09c77d6f92fab0ebced332aa79fda86eaf1fc Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Fri, 12 Sep 2025 19:25:19 +0800 Subject: [PATCH 219/584] [sleep mode] save memory for on-the-fly quantization (#24731) Signed-off-by: youkaichao <youkaichao@gmail.com> --- vllm/device_allocator/cumem.py | 37 ++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 7963fb15c4191..af7ca6be1fca8 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -16,8 +16,11 @@ from typing import Any, Callable, Optional, 
Union import torch +from vllm.logger import init_logger from vllm.utils import is_pin_memory_available +logger = init_logger(__name__) + def find_loaded_library(lib_name) -> Optional[str]: """ @@ -165,6 +168,9 @@ class CuMemAllocator: py_d_mem = allocation_handle[2] self.pointer_to_data[py_d_mem] = AllocationData( allocation_handle, self.current_tag) + logger.debug( + "Allocated %s bytes for %s with address %s from cumem allocator", + allocation_handle[1], self.current_tag, py_d_mem) return def _python_free_callback(self, ptr: int) -> HandleType: @@ -174,6 +180,9 @@ class CuMemAllocator: data = self.pointer_to_data.pop(ptr) if data.cpu_backup_tensor is not None: data.cpu_backup_tensor = None + logger.debug( + "Freed %s bytes for %s with address %s from cumem allocator", + data.handle[1], data.tag, ptr) return data.handle def sleep( @@ -197,9 +206,14 @@ class CuMemAllocator: assert isinstance(offload_tags, tuple) + total_bytes = 0 + backup_bytes = 0 + for ptr, data in self.pointer_to_data.items(): handle = data.handle + total_bytes += handle[1] if data.tag in offload_tags: + backup_bytes += handle[1] size_in_bytes = handle[1] cpu_backup_tensor = torch.empty( size_in_bytes, @@ -211,6 +225,12 @@ class CuMemAllocator: data.cpu_backup_tensor = cpu_backup_tensor unmap_and_release(handle) + logger.info( + "CuMemAllocator: sleep freed %.2f GiB memory in total, of which " + "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded " + "directly.", total_bytes / 1024**3, backup_bytes / 1024**3, + (total_bytes - backup_bytes) / 1024**3) + gc.collect() torch.cuda.empty_cache() @@ -267,12 +287,17 @@ class CuMemAllocator: # when using pluggable allocator, see # https://github.com/pytorch/pytorch/issues/145168 . # if we have some memory allocated and then freed, - # the memory will not be released. - # right now it is fine, because we only use this allocator - # during weight loading and kv cache creation, where we only - # allocate memory. - # TODO: we need to find a way to release the memory, - # i.e. calling torch.cuda.empty_cache() + # the memory will not be released, e.g. in online quantization, + # where the model is created in higher precision, and then + # quantized in lower precision. + # Find all unused allocations and manually release them. + # TODO: we should expose `empty_cache` method in the memory pool. + # TODO: ask for help from PyTorch team to expose this method. 
+ allocations = data[0].snapshot() + for allocation in allocations: + if allocation["allocated_size"] == 0: + handle = self._python_free_callback(allocation["address"]) + unmap_and_release(handle) self.current_tag = old_tag def get_current_usage(self) -> int: From 72fc8aa41255a21716cc0f4d8eb81b9572f71bc2 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Fri, 12 Sep 2025 06:27:24 -0700 Subject: [PATCH 220/584] [Multi Modal] Add FA3 in VIT (#24347) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> --- tests/entrypoints/openai/test_vision.py | 4 +- tests/kernels/attention/test_mha_attn.py | 49 ++++++++++---- vllm/attention/layer.py | 75 +++++++++++++++++++--- vllm/model_executor/models/ernie45_vl.py | 23 ++++++- vllm/model_executor/models/glm4_1v.py | 22 ++++++- vllm/model_executor/models/keye.py | 17 ++++- vllm/model_executor/models/qwen2_5_vl.py | 24 ++++++- vllm/model_executor/models/qwen2_vl.py | 24 ++++++- vllm/model_executor/models/siglip2navit.py | 16 ++++- vllm/model_executor/models/vision.py | 10 +-- vllm/platforms/cuda.py | 28 ++++---- vllm/platforms/interface.py | 3 +- vllm/platforms/rocm.py | 18 +++--- 13 files changed, 247 insertions(+), 66 deletions(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 29a3b40d2d865..72819f31de206 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -34,11 +34,11 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "The image shows a Venn diagram with three intersect", + "This image shows a Venn diagram with three over", ], [ "This image displays a gradient of colors ranging from", - "The image displays a gradient of colors ranging from", + "This image displays a gradient of colors forming a spectrum", ], ] diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index c01ea32994da0..d37b968ed9792 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -36,31 +36,52 @@ def test_mha_attn_platform(device: str): torch.set_default_dtype(torch.float16) if device == "cpu": - with patch("vllm.attention.selector.current_platform", - CpuPlatform()), \ - patch("vllm.platforms.current_platform", CpuPlatform()): + with patch("vllm.attention.layer.current_platform", CpuPlatform()), \ + patch("vllm.model_executor.models.vision.current_platform", + CpuPlatform()): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == _Backend.TORCH_SDPA_VLLM_V1 + assert attn.attn_backend == _Backend.TORCH_SDPA elif device == "hip": - with patch("vllm.attention.selector.current_platform", - RocmPlatform()), \ - patch("vllm.platforms.current_platform", RocmPlatform()), \ - patch("vllm.attention.layer.current_platform", RocmPlatform()): + with patch("vllm.attention.layer.current_platform", RocmPlatform()), \ + patch("vllm.model_executor.models.vision.current_platform", + RocmPlatform()): attn = MultiHeadAttention(16, 64, scale=1) assert attn.attn_backend == _Backend.TORCH_SDPA else: - with patch("vllm.attention.selector.current_platform", - CudaPlatform()), \ - patch("vllm.platforms.current_platform", CudaPlatform()): + # Test CUDA with head_size=64 (divisible by 32) + # - should use vLLM's FlashAttention + with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ + patch("vllm.model_executor.models.vision.current_platform", + CudaPlatform()): attn = MultiHeadAttention(16, 64, scale=1) - assert 
attn.attn_backend == _Backend.XFORMERS + assert attn.attn_backend == _Backend.FLASH_ATTN - with patch("vllm.attention.selector.current_platform", + # Test CUDA with head_size=72 (not divisible by 32) + # - with upstream FA not available + # - should use xformers + with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ + patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), \ - patch("vllm.platforms.current_platform", CudaPlatform()): + patch("vllm.attention.layer.check_upstream_fa_availability", + return_value=False): attn = MultiHeadAttention(16, 72, scale=1) assert attn.attn_backend == _Backend.XFORMERS + # Test CUDA with head_size=72 (not divisible by 32) + # - with upstream FA available + # - should use upstream FA + with patch("vllm.attention.layer.current_platform", CudaPlatform()), \ + patch("vllm.model_executor.models.vision.current_platform", + CudaPlatform()), \ + patch("vllm.attention.layer.check_upstream_fa_availability", + return_value=True), \ + patch.dict('sys.modules', {'flash_attn': type('MockFlashAttn', (), + { + 'flash_attn_varlen_func': lambda *args, **kwargs: None + })()}): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.FLASH_ATTN + def ref_attention( query: torch.Tensor, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index bb05b468fd102..44cb2c7c6b642 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.platforms import _Backend, current_platform from vllm.utils import direct_register_custom_op @@ -55,6 +56,14 @@ def check_xformers_availability(): return USE_XFORMERS_OPS +def check_upstream_fa_availability(dtype: torch.dtype): + if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda( + ) and current_platform.has_device_capability(80): + from transformers.utils import is_flash_attn_2_available + return is_flash_attn_2_available() + return False + + class Attention(nn.Module, AttentionLayerBase): """Attention layer. @@ -349,29 +358,55 @@ class MultiHeadAttention(nn.Module): f"divisible by num_kv_heads ({self.num_kv_heads})" self.num_queries_per_kv = self.num_heads // self.num_kv_heads + # During model initialization, the default dtype is set as the model + # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(head_size, - dtype, - kv_cache_dtype=None, - block_size=16, - is_attention_free=False) - backend = backend_name_to_enum(attn_backend.get_name()) + + # Determine the attention backend + backend = get_vit_attn_backend(head_size=head_size, dtype=dtype) + + # Some auto-selected backends can be upgraded + # to upstream flash attention if available. + # If vllm native fa is selected, we use it directly. 
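Condensed, the upgrade rule added here is: a vLLM-native FlashAttention selection is kept as-is, while any other auto-selected backend is promoted to upstream flash-attn when it is available for the current dtype. A sketch under those assumptions:

```python
from vllm.platforms import _Backend

def resolve_vit_backend(selected: _Backend,
                        upstream_fa_available: bool) -> tuple[_Backend, bool]:
    use_upstream_fa = False
    if selected != _Backend.FLASH_ATTN and upstream_fa_available:
        selected = _Backend.FLASH_ATTN
        use_upstream_fa = True
    return selected, use_upstream_fa

# e.g. an XFORMERS pick is upgraded when upstream flash-attn is installed
print(resolve_vit_backend(_Backend.XFORMERS, True))
```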
+ use_upstream_fa = False + if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability( + dtype): + backend = _Backend.FLASH_ATTN + use_upstream_fa = True + if current_platform.is_rocm(): # currently, only torch_sdpa is supported on rocm self.attn_backend = _Backend.TORCH_SDPA else: + self.attn_backend = backend if backend in { _Backend.TORCH_SDPA, _Backend.TORCH_SDPA_VLLM_V1, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1, _Backend.ROCM_AITER_FA, - } else current_platform.get_vit_attn_backend() + _Backend.FLASH_ATTN, + _Backend.FLASH_ATTN_VLLM_V1, + } else _Backend.TORCH_SDPA if (self.attn_backend == _Backend.XFORMERS and not check_xformers_availability()): self.attn_backend = _Backend.TORCH_SDPA + if self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1 + }: + if use_upstream_fa: + from flash_attn import flash_attn_varlen_func + self._flash_attn_varlen_func = flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func + self._flash_attn_varlen_func = flash_attn_varlen_func + + logger.info_once( + f"MultiHeadAttention attn_backend: {self.attn_backend}, " + f"use_upstream_fa: {use_upstream_fa}") + def forward( self, query: torch.Tensor, @@ -392,7 +427,31 @@ class MultiHeadAttention(nn.Module): key = torch.repeat_interleave(key, num_repeat, dim=2) value = torch.repeat_interleave(value, num_repeat, dim=2) - if self.attn_backend == _Backend.XFORMERS: + if self.attn_backend in { + _Backend.FLASH_ATTN, + _Backend.FLASH_ATTN_VLLM_V1, + }: + + cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len, + step=q_len, + dtype=torch.int32, + device=query.device) + cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len, + step=kv_len, + dtype=torch.int32, + device=key.device) + + out = self._flash_attn_varlen_func( + query.flatten(0, 1), + key.flatten(0, 1), + value.flatten(0, 1), + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=q_len, + max_seqlen_k=kv_len, + softmax_scale=self.scale, + ) + elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops out = xops.memory_efficient_attention_forward(query, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 97aace5a20c3a..bcff65a717ab1 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -34,6 +34,7 @@ import torch.nn.functional as F from einops import rearrange, repeat from transformers import BatchFeature +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -170,7 +171,16 @@ class Ernie4_5_VisionAttention(nn.Module): prefix=f"{prefix}.proj") # Detect attention implementation. 
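The `cu_seqlens` tensors built for the varlen call above are cumulative sequence boundaries; for a batch of equal-length sequences they reduce to an arithmetic progression:

```python
import torch

bsz, q_len = 3, 5
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
                            step=q_len,
                            dtype=torch.int32)
print(cu_seqlens_q)  # tensor([ 0,  5, 10, 15], dtype=torch.int32)
```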
- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.hidden_size_per_attention_head, + dtype=torch.get_default_dtype()) + + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.ROCM_AITER_FA @@ -233,7 +243,10 @@ class Ernie4_5_VisionAttention(nn.Module): if self.attn_backend == _Backend.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) @@ -457,7 +470,11 @@ class Ernie4_5_VisionTransformer(nn.Module): ), "vit's config.hidden must be equal to config.embed_dim" self.ln = nn.LayerNorm(hidden_size, eps=1e-6) - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 539381b618000..279f458dfa6cf 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -44,6 +44,7 @@ from transformers.models.glm4v.video_processing_glm4v import ( Glm4vVideoProcessor) from transformers.video_utils import VideoMetadata +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import (get_tensor_model_parallel_world_size, parallel_state) @@ -260,7 +261,15 @@ class Glm4vVisionAttention(nn.Module): ) # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.hidden_size_per_attention_head, + dtype=torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, @@ -310,7 +319,10 @@ class Glm4vVisionAttention(nn.Module): if self.attn_backend == _Backend.FLASH_ATTN: # from vllm_flash_attn.flash_attn_interface import ( # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) @@ -715,7 +727,11 @@ class Glm4vVisionTransformer(nn.Module): self.post_layernorm = RMSNorm(vision_config.hidden_size, eps=vision_config.rms_norm_eps) - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 710b805acb3ea..04824db1b6dd9 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -17,6 +17,7 @@ from transformers.modeling_outputs import (BaseModelOutput, BaseModelOutputWithPooling) from transformers.utils import torch_int +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -374,7 +375,16 @@ class KeyeSiglipAttention(nn.Module): ) # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.head_dim, dtype=torch.get_default_dtype()) + + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in {_Backend.FLASH_ATTN, _Backend.XFORMERS}: raise RuntimeError( f"Keye-VL does not support {self.attn_backend} backend now.") @@ -428,7 +438,10 @@ class KeyeSiglipAttention(nn.Module): ) if self.attn_backend == _Backend.FLASH_ATTN: - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8aa7775570297..98f9c0cf4c16b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -38,6 +38,7 @@ from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -298,7 +299,16 @@ class Qwen2_5_VisionAttention(nn.Module): disable_tp=use_data_parallel) # Detect attention implementation. 
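The `"b s ... -> (b s) ..."` rearrange repeated across these vision encoders flattens batch and sequence into the single varlen dimension that `flash_attn_varlen_func` expects:

```python
import torch
from einops import rearrange

x = torch.randn(2, 7, 16, 64)  # (batch, seq, heads, head_dim)
flat = rearrange(x, "b s ... -> (b s) ...")
assert flat.shape == (2 * 7, 16, 64)
```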
- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.hidden_size_per_attention_head, + dtype=torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.ROCM_AITER_FA @@ -359,7 +369,10 @@ class Qwen2_5_VisionAttention(nn.Module): if self.attn_backend == _Backend.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) @@ -628,7 +641,12 @@ class Qwen2_5_VisionTransformer(nn.Module): prefix=f"{prefix}.merger", use_data_parallel=use_data_parallel, ) - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 90a1ad2a658ab..89af79c3b5fd5 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -41,6 +41,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from transformers.models.qwen2_vl.video_processing_qwen2_vl import ( Qwen2VLVideoProcessor) +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import VllmConfig from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils @@ -314,7 +315,16 @@ class Qwen2VisionAttention(nn.Module): prefix=f"{prefix}.proj") # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.hidden_size_per_attention_head, + dtype=torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.ROCM_AITER_FA @@ -374,7 +384,10 @@ class Qwen2VisionAttention(nn.Module): if self.attn_backend == _Backend.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) @@ -628,7 +641,12 @@ class Qwen2VisionTransformer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.merger", ) - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index c6244fb3b3e6a..a86700fe68dd3 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -13,6 +13,7 @@ from torch.nn import functional as F from transformers import Siglip2VisionConfig from transformers.configuration_utils import PretrainedConfig +from vllm.attention.layer import check_upstream_fa_availability from vllm.config import QuantizationConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -236,7 +237,15 @@ class Siglip2Attention(nn.Module): self.use_rope = config.use_rope # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + self.attn_backend = get_vit_attn_backend( + head_size=self.head_dim, dtype=torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.ROCM_AITER_FA @@ -280,7 +289,10 @@ class Siglip2Attention(nn.Module): if self.attn_backend == _Backend.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from flash_attn import flash_attn_varlen_func + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.vllm_flash_attn import flash_attn_varlen_func attn_output = flash_attn_varlen_func( queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(seq_length, -1) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index c16aa5ac608f9..81f86db7e1875 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -7,7 +7,6 @@ from typing import Final, Generic, Optional, Protocol, TypeVar, Union import torch from transformers import PretrainedConfig -from vllm.attention.selector import get_env_variable_attn_backend from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform @@ -68,17 +67,18 @@ def get_vision_encoder_info( raise NotImplementedError(msg) -def get_vit_attn_backend(support_fa: bool = False) -> _Backend: +def get_vit_attn_backend(head_size: int, dtype: torch.dtype) -> _Backend: """ Get the available attention backend for Vision Transformer. """ - # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. 
+ # Lazy import to avoid circular dependency + from vllm.attention.selector import get_env_variable_attn_backend selected_backend: Optional[_Backend] = get_env_variable_attn_backend() if selected_backend is not None: return selected_backend - return current_platform.get_vit_attn_backend(support_fa) + return current_platform.get_vit_attn_backend(head_size, dtype) def resolve_visual_encoder_outputs( @@ -122,4 +122,4 @@ def resolve_visual_encoder_outputs( uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) if post_layer_norm is not None and uses_last_layer: hs_pool[-1] = post_layer_norm(encoder_outputs) - return torch.cat(hs_pool, dim=-1) \ No newline at end of file + return torch.cat(hs_pool, dim=-1) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index e40b6eb2b5a4d..77c9a012b2d3d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -209,18 +209,24 @@ class CudaPlatformBase(Platform): return torch.cuda.max_memory_allocated(device) @classmethod - def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: - if cls.has_device_capability(80) and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): + def get_vit_attn_backend(cls, head_size: int, + dtype: torch.dtype) -> _Backend: + if dtype not in (torch.float16, torch.bfloat16): + return _Backend.XFORMERS + + if cls.has_device_capability(80): + FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 + from vllm.attention.selector import is_attn_backend_supported + is_default_fa_supported = is_attn_backend_supported( + FLASH_ATTN_V1, head_size, dtype, allow_import_error=False) + if is_default_fa_supported: return _Backend.FLASH_ATTN - logger.warning_once( - "Current `vllm-flash-attn` has a bug inside vision " - "module, so we use xformers backend instead. You can " - "run `pip install flash-attn` to use flash-attention " - "backend.") - # Fallback for Volta/Turing GPUs or FA not supported - return _Backend.XFORMERS + else: + # Fallback to XFORMERS + return _Backend.XFORMERS + else: + # Fallback for Volta/Turing GPUs or FA not supported + return _Backend.XFORMERS @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 59aa468185698..054d08c3a85be 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -192,7 +192,8 @@ class Platform: return device_id @classmethod - def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: + def get_vit_attn_backend(cls, head_size: int, + dtype: torch.dtype) -> _Backend: return _Backend.TORCH_SDPA @classmethod diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f4d136c5e0aaa..bb8bff48c7b95 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -175,15 +175,15 @@ class RocmPlatform(Platform): ] @classmethod - def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: - if support_fa: - if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA - and on_gfx9()): - # Note: AITER FA is only supported for Qwen-VL models. - # TODO: Add support for other VL models in their model class. - return _Backend.ROCM_AITER_FA - if on_gfx9(): - return _Backend.FLASH_ATTN + def get_vit_attn_backend(cls, head_size: int, + dtype: torch.dtype) -> _Backend: + if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA + and on_gfx9()): + # Note: AITER FA is only supported for Qwen-VL models. 
+ # TODO: Add support for other VL models in their model class. + return _Backend.ROCM_AITER_FA + if on_gfx9(): + return _Backend.FLASH_ATTN return _Backend.TORCH_SDPA @classmethod From 0377802c20df2528d895add1f9918b2c7b868eb0 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Fri, 12 Sep 2025 06:42:23 -0700 Subject: [PATCH 221/584] [Multimodal] Remove legacy multimodal fields in favor of MultiModalFeatureSpec (#24548) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- tests/v1/core/test_encoder_cache_manager.py | 12 +++++- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 +- tests/v1/worker/test_gpu_input_batch.py | 4 +- tests/v1/worker/test_gpu_model_runner.py | 4 +- .../v1/shared_storage_connector.py | 41 ++++++++++--------- vllm/v1/core/encoder_cache_manager.py | 10 ++--- vllm/v1/core/kv_cache_utils.py | 36 ++++++++-------- vllm/v1/core/sched/output.py | 18 +++----- vllm/v1/core/sched/scheduler.py | 22 +++++----- vllm/v1/request.py | 9 +--- vllm/v1/worker/gpu_input_batch.py | 10 ++--- vllm/v1/worker/gpu_model_runner.py | 28 +++++++------ vllm/v1/worker/tpu_model_runner.py | 20 ++++----- 13 files changed, 102 insertions(+), 116 deletions(-) diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py index ae5b751f45a4b..4e3cace86be6a 100644 --- a/tests/v1/core/test_encoder_cache_manager.py +++ b/tests/v1/core/test_encoder_cache_manager.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange from vllm.v1.core.encoder_cache_manager import EncoderCacheManager @@ -9,8 +10,17 @@ class MockRequest: def __init__(self, request_id, mm_hashes, token_counts): self.request_id = request_id - self.mm_hashes = mm_hashes self._token_counts = token_counts + self.mm_features = [] + for i, mm_hash in enumerate(mm_hashes): + feature = MultiModalFeatureSpec( + data=None, + modality="image", + identifier=mm_hash, + mm_position=PlaceholderRange(offset=0, + length=self._token_counts[i]), + ) + self.mm_features.append(feature) def get_num_encoder_tokens(self, input_id: int) -> int: return self._token_counts[input_id] diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 941aa0a77692c..c719e44acc9c2 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -64,9 +64,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: NewRequestData( req_id=req_id, prompt_token_ids=[1, 2, 3], - mm_kwargs=[], - mm_hashes=[], - mm_positions=[], + mm_features=[], sampling_params=SamplingParams(), pooling_params=PoolingParams(), block_ids=([0], ), # block_ids should be tuple[list[int]] diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 7031859078264..38f543c784866 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -203,9 +203,7 @@ def _construct_cached_request_state(req_id_suffix: int): prompt_token_ids=prompt_token_ids, sampling_params=_create_sampling_params(), pooling_params=None, - mm_kwargs=[], - mm_positions=[], - mm_hashes=[], + mm_features=[], block_ids=([], ), generator=None, num_computed_tokens=len(output_token_ids), diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 8e27c14b7405b..5ebc00d573030 100644 --- a/tests/v1/worker/test_gpu_model_runner.py 
+++ b/tests/v1/worker/test_gpu_model_runner.py @@ -118,9 +118,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: NewRequestData( req_id=req_id, prompt_token_ids=[1, 2, 3], - mm_kwargs=[], - mm_hashes=[], - mm_positions=[], + mm_features=[], sampling_params=SamplingParams(), pooling_params=None, block_ids=([0], ), diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 9f4da613f8b62..25460ed295d30 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -300,11 +300,12 @@ class SharedStorageConnector(KVConnectorBase_V1): total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: - meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids[0], - block_size=self._block_size, - is_store=False, - mm_hashes=new_req.mm_hashes) + meta.add_request( + token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids[0], + block_size=self._block_size, + is_store=False, + mm_hashes=[f.identifier for f in new_req.mm_features]) total_need_load += 1 else: # NOTE: here, we set the store and load being exclusive, @@ -312,11 +313,12 @@ class SharedStorageConnector(KVConnectorBase_V1): # NOTE(rob): for this debug implementation, we only cache # the original prompt tokens. if not self._found_match_for_request(new_req): - meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids[0], - block_size=self._block_size, - is_store=True, - mm_hashes=new_req.mm_hashes) + meta.add_request( + token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids[0], + block_size=self._block_size, + is_store=True, + mm_hashes=[f.identifier for f in new_req.mm_features]) cached_reqs = scheduler_output.scheduled_cached_reqs for i, req_id in enumerate(cached_reqs.req_ids): @@ -341,11 +343,12 @@ class SharedStorageConnector(KVConnectorBase_V1): # of the block_ids for the request. 
block_ids = new_block_ids[0] - meta.add_request(token_ids=token_ids, - block_ids=block_ids, - block_size=self._block_size, - is_store=False, - mm_hashes=request.mm_hashes) + meta.add_request( + token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + is_store=False, + mm_hashes=[f.identifier for f in request.mm_features]) total_need_load += 1 assert total_need_load == len(self._requests_need_load) @@ -364,10 +367,10 @@ class SharedStorageConnector(KVConnectorBase_V1): """ num_tokens_to_check = align_to_block_size( len(request.prompt_token_ids) - 1, self._block_size) - foldername = self._generate_foldername_debug(torch.tensor( - request.prompt_token_ids)[:num_tokens_to_check], - request.mm_hashes, - create_folder=False) + foldername = self._generate_foldername_debug( + torch.tensor(request.prompt_token_ids)[:num_tokens_to_check], + [f.identifier for f in request.mm_features], + create_folder=False) return os.path.exists(foldername) def _generate_foldername_debug( diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index bd2ec036834b2..eadea15a2e5e3 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -86,7 +86,7 @@ class EncoderCacheManager: Returns: True if the encoder output for this input is already cached """ - mm_hash = request.mm_hashes[input_id] + mm_hash = request.mm_features[input_id].identifier # Not cached at all if mm_hash not in self.cached: return False @@ -167,7 +167,7 @@ class EncoderCacheManager: This method assumes can_allocate() returned True for the same input. """ - mm_hash = request.mm_hashes[input_id] + mm_hash = request.mm_features[input_id].identifier request_id = request.request_id if mm_hash not in self.cached: self.cached[mm_hash] = set() @@ -193,8 +193,8 @@ class EncoderCacheManager: """ return { input_id - for input_id in range(len(request.mm_hashes)) - if request.mm_hashes[input_id] in self.cached + for input_id in range(len(request.mm_features)) + if request.mm_features[input_id].identifier in self.cached } def free_encoder_input(self, request: Request, input_id: int) -> None: @@ -208,7 +208,7 @@ class EncoderCacheManager: `can_allocate`). """ req_id = request.request_id - mm_hash = request.mm_hashes[input_id] + mm_hash = request.mm_features[input_id].identifier # The mm_hash not in cache or the req_id set is empty if not self.cached.get(mm_hash, None): return diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 2c0eac3ddd79d..f939da8c5b5c3 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -418,9 +418,9 @@ def need_extra_keys(request: Request) -> bool: # Multimodal requests need to include the MM hash. # LoRA requests need to include the LoRA ID. # Request with provided cache salt need to include the salt. 
- return bool(request.mm_hashes) or (request.lora_request - is not None) or (request.cache_salt - is not None) + return bool(request.mm_features) or (request.lora_request + is not None) or (request.cache_salt + is not None) def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, @@ -442,32 +442,28 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, """ extra_keys: list[Any] = [] - mm_positions, mm_hashes = request.mm_positions, request.mm_hashes - if not mm_positions: + mm_features = request.mm_features + if not mm_features: return extra_keys, start_mm_idx - if mm_positions and len(mm_positions) != len(mm_hashes): - raise ValueError( - "The number of multi-modal positions and hashes must match. This " - "is likely because you did not enable MM hashing. " - "Please set `mm_processor_cache_gb > 0`.") - - # Note that we assume mm_positions is sorted by offset. + # Note that we assume mm_features are sorted by mm_position.offset. # We do not need to check all mm inputs if the start token index is out of # range. This usually happens in the late prefill phase and decoding phase. - if mm_positions[-1].offset + mm_positions[-1].length < start_token_idx: + last_pos = mm_features[-1].mm_position + if last_pos.offset + last_pos.length < start_token_idx: return extra_keys, start_mm_idx # Support start_mm_idx == -1 to indicate the last mm input. if start_mm_idx < 0: - assert -start_mm_idx <= len(mm_positions) - start_mm_idx = len(mm_positions) + start_mm_idx + assert -start_mm_idx <= len(mm_features) + start_mm_idx = len(mm_features) + start_mm_idx curr_mm_idx = start_mm_idx - while mm_positions and curr_mm_idx < len(mm_positions): - assert mm_hashes[curr_mm_idx] is not None - offset = mm_positions[curr_mm_idx].offset - length = mm_positions[curr_mm_idx].length + while mm_features and curr_mm_idx < len(mm_features): + mm_feature = mm_features[curr_mm_idx] + assert mm_feature.identifier is not None + offset = mm_feature.mm_position.offset + length = mm_feature.mm_position.length if end_token_idx > offset: if start_token_idx > offset + length: # This block has passed the current mm input. @@ -475,7 +471,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, continue # The block contains the current mm input. - extra_keys.append(mm_hashes[curr_mm_idx]) + extra_keys.append(mm_feature.identifier) if end_token_idx >= offset + length: # If this block contains the end of the current mm input, diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 9888f25735753..56ab396d6d937 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata) from vllm.lora.request import LoRARequest - from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange + from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.request import Request @@ -27,9 +27,7 @@ class NewRequestData: req_id: str prompt_token_ids: list[int] - mm_kwargs: list[MultiModalKwargsItem] - mm_hashes: list[str] - mm_positions: list[PlaceholderRange] + mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] block_ids: tuple[list[int], ...] 
@@ -45,9 +43,7 @@ class NewRequestData: return cls( req_id=request.request_id, prompt_token_ids=request.prompt_token_ids, - mm_kwargs=request.mm_kwargs, - mm_hashes=request.mm_hashes, - mm_positions=request.mm_positions, + mm_features=request.mm_features, sampling_params=request.sampling_params, pooling_params=request.pooling_params, block_ids=block_ids, @@ -59,9 +55,7 @@ class NewRequestData: return (f"NewRequestData(" f"req_id={self.req_id}," f"prompt_token_ids={self.prompt_token_ids}," - f"mm_kwargs={self.mm_kwargs}," - f"mm_hashes={self.mm_hashes}," - f"mm_positions={self.mm_positions}," + f"mm_features={self.mm_features}," f"sampling_params={self.sampling_params}," f"block_ids={self.block_ids}," f"num_computed_tokens={self.num_computed_tokens}," @@ -73,9 +67,7 @@ class NewRequestData: return (f"NewRequestData(" f"req_id={self.req_id}," f"prompt_token_ids_len={len(self.prompt_token_ids)}," - f"mm_kwargs={self.mm_kwargs}," - f"mm_hashes={self.mm_hashes}," - f"mm_positions={self.mm_positions}," + f"mm_features={self.mm_features}," f"sampling_params={self.sampling_params}," f"block_ids={self.block_ids}," f"num_computed_tokens={self.num_computed_tokens}," diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index aa45f6669207d..c1e59423e9a18 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -736,18 +736,18 @@ class Scheduler(SchedulerInterface): if num_new_tokens == 0 or not request.has_encoder_inputs: return [], num_new_tokens, encoder_compute_budget encoder_inputs_to_schedule: list[int] = [] - mm_positions = request.mm_positions - assert mm_positions is not None - assert len(mm_positions) > 0 + mm_features = request.mm_features + assert mm_features is not None + assert len(mm_features) > 0 # NOTE: since scheduler operates on the request level (possibly with # multiple encoder inputs per request), we need to create temporary # trackers for accounting at the encoder input level. mm_hashes_to_schedule = set() num_tokens_to_schedule = 0 - for i, pos_info in enumerate(mm_positions): - start_pos = pos_info.offset - num_encoder_tokens = pos_info.length + for i, mm_feature in enumerate(mm_features): + start_pos = mm_feature.mm_position.offset + num_encoder_tokens = mm_feature.mm_position.length # The encoder output is needed if the two ranges overlap: # [num_computed_tokens, num_computed_tokens + num_new_tokens) and @@ -778,7 +778,7 @@ class Scheduler(SchedulerInterface): if not self.is_encoder_decoder: # We are not using the encoder cache for encoder-decoder models, # yet. - if request.mm_hashes[i] in mm_hashes_to_schedule: + if request.mm_features[i].identifier in mm_hashes_to_schedule: # The same encoder input has already been scheduled in the # current step. continue @@ -820,7 +820,7 @@ class Scheduler(SchedulerInterface): num_tokens_to_schedule += num_encoder_tokens encoder_compute_budget -= num_encoder_tokens - mm_hashes_to_schedule.add(request.mm_hashes[i]) + mm_hashes_to_schedule.add(request.mm_features[i].identifier) encoder_inputs_to_schedule.append(i) return ( @@ -1048,9 +1048,9 @@ class Scheduler(SchedulerInterface): # Here, we use list(set) to avoid modifying the set while iterating # over it. 
for input_id in list(cached_encoder_input_ids): - mm_positions = request.mm_positions[input_id] - start_pos = mm_positions.offset - num_tokens = mm_positions.length + mm_feature = request.mm_features[input_id] + start_pos = mm_feature.mm_position.offset + num_tokens = mm_feature.mm_position.length if self.is_encoder_decoder and request.num_computed_tokens > 0: # With Whisper, as soon as we've generated a single token, # we know we're done with the encoder input. Cross Attention diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 64cce3e9efc51..4e3e581235cce 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -91,11 +91,6 @@ class Request: self.mm_features = mm_features or [] self.num_encoder_inputs = len(self.mm_features) self.has_encoder_inputs = self.num_encoder_inputs > 0 - # TODO(sfeng33): Remove these legacy fields after clearing out all - # references in scheduler and model runner - self.mm_positions = [f.mm_position for f in self.mm_features] - self.mm_kwargs = [f.data for f in self.mm_features] - self.mm_hashes = [f.identifier for f in self.mm_features] # Read-only views # Prevent directly appending to these lists since @@ -180,8 +175,8 @@ class Request: return RequestStatus.get_finished_reason(self.status) def get_num_encoder_tokens(self, input_id: int) -> int: - assert input_id < len(self.mm_positions) - num_tokens = self.mm_positions[input_id].length + assert input_id < len(self.mm_features) + num_tokens = self.mm_features[input_id].mm_position.length return num_tokens def record_event( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 1cf56656d7adf..339b9937b73f4 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -10,8 +10,7 @@ import torch from typing_extensions import deprecated from vllm.lora.request import LoRARequest -from vllm.multimodal.inputs import (MultiModalKwargsItem, - MultiModalKwargsItems, PlaceholderRange) +from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values @@ -31,9 +30,7 @@ class CachedRequestState: req_id: str prompt_token_ids: list[int] - mm_kwargs: list[MultiModalKwargsItem] - mm_positions: list[PlaceholderRange] - mm_hashes: list[str] + mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] generator: Optional[torch.Generator] @@ -60,7 +57,8 @@ class CachedRequestState: "removed in v0.13. 
Please use `mm_kwargs` instead.") def mm_inputs(self) -> list[MultiModalKwargsItems]: return [ - MultiModalKwargsItems.from_seq([item]) for item in self.mm_kwargs + MultiModalKwargsItems.from_seq([f.data]) for f in self.mm_features + if f.data is not None ] def get_token_id(self, idx: int) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ebb18e81c38a8..a17c8783d48fa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -555,9 +555,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, - mm_kwargs=new_req_data.mm_kwargs, - mm_positions=new_req_data.mm_positions, - mm_hashes=new_req_data.mm_hashes, + mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=pooling_params, generator=generator, @@ -698,7 +696,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): second_per_grid_ts = [] audio_feature_lengths = [] use_audio_in_video = False - for mm_item in req_state.mm_kwargs: + for mm_feature in req_state.mm_features: + mm_item = mm_feature.data + if mm_item is None: + continue mm_input = mm_item.get_data() if (t := mm_input.get("image_grid_thw")) is not None: image_grid_thw.append(t.tolist()) @@ -731,7 +732,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_kwargs = list[MultiModalKwargsItem]() for req in scheduler_output.scheduled_new_reqs: - mm_kwargs.extend(req.mm_kwargs) + for feature in req.mm_features: + if feature.data is not None: + mm_kwargs.append(feature.data) # Input all modalities at once mm_kwargs_combined: BatchedTensorInputs = {} @@ -1361,10 +1364,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] for mm_input_id in encoder_input_ids: - mm_hash = req_state.mm_hashes[mm_input_id] - mm_kwargs.append(req_state.mm_kwargs[mm_input_id]) - mm_hashes_pos.append( - (mm_hash, req_state.mm_positions[mm_input_id])) + mm_feature = req_state.mm_features[mm_input_id] + mm_hash = mm_feature.identifier + mm_kwargs.append(mm_feature.data) + mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) return mm_kwargs, mm_hashes_pos @@ -1426,9 +1429,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] num_computed_tokens = \ req_state.num_computed_tokens + shift_computed_tokens - mm_positions = req_state.mm_positions - mm_hashes = req_state.mm_hashes - for i, pos_info in enumerate(mm_positions): + for mm_feature in req_state.mm_features: + pos_info = mm_feature.mm_position start_pos = pos_info.offset num_encoder_tokens = pos_info.length @@ -1451,7 +1453,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) assert start_idx < end_idx - mm_hash = mm_hashes[i] + mm_hash = mm_feature.identifier encoder_output = self.encoder_cache.get(mm_hash, None) assert encoder_output is not None,\ f"Encoder cache miss for {mm_hash}." 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 15af7ffac8095..43f12912707f1 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -387,9 +387,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, - mm_kwargs=new_req_data.mm_kwargs, - mm_positions=new_req_data.mm_positions, - mm_hashes=new_req_data.mm_hashes, + mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=None, generator=None, @@ -822,10 +820,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] for mm_input_id in encoder_input_ids: - mm_hash = req_state.mm_hashes[mm_input_id] - mm_kwargs.append(req_state.mm_kwargs[mm_input_id]) - mm_hashes_pos.append( - (mm_hash, req_state.mm_positions[mm_input_id])) + mm_feature = req_state.mm_features[mm_input_id] + mm_hash = mm_feature.identifier + mm_kwargs.append(mm_feature.data) + mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) # Batch mm inputs as much as we can: if a request in the batch has # multiple modalities or a different modality than the previous one, @@ -883,13 +881,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_id] req_state = self.requests[req_id] num_computed_tokens = req_state.num_computed_tokens - mm_positions = req_state.mm_positions - mm_hashes = req_state.mm_hashes # TODO unroll loop and assume/enforce --disable_chunked_mm_input # NOTE (NickLucche) here we diverge from logic in other runners, as # we assume to only have whole mm items to process. Hence we avoid # the intrinsic dynamism that `gather_mm_placeholders` introduces. - for i, pos_info in enumerate(mm_positions): + for mm_feature in req_state.mm_features: + pos_info = mm_feature.mm_position start_pos = pos_info.offset num_encoder_tokens = pos_info.length @@ -904,8 +901,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # The encoder output is already processed and stored # in the decoder's KV cache. continue - - mm_hash = mm_hashes[i] + mm_hash = mm_feature.identifier encoder_output = self.encoder_cache.get(mm_hash, None) assert encoder_output is not None,\ f"Encoder cache miss for {mm_hash}." From bcb06d7baf22a80383bcc25251a1dcebfe9a3919 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Fri, 12 Sep 2025 15:43:12 +0200 Subject: [PATCH 222/584] [Doc]: fix typos in various files (#24726) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- benchmarks/kernels/benchmark_w8a8_block_fp8.py | 2 +- csrc/cpu/cpu_types_vxe.hpp | 2 +- csrc/cpu/sgl-kernels/moe.cpp | 2 +- docs/design/multiprocessing.md | 2 +- vllm/attention/backends/flash_attn.py | 2 +- vllm/benchmarks/datasets.py | 2 +- vllm/entrypoints/openai/protocol.py | 2 +- vllm/model_executor/layers/mamba/mamba_mixer2.py | 2 +- vllm/model_executor/models/minicpmv.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_input_batch.py | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 98bde9d83c82d..df2b713e46dc4 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -56,7 +56,7 @@ def w8a8_block_matmul( Bs: The per-block quantization scale for `B`. 
block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. - output_dytpe: The dtype of the returned tensor. + output_dtype: The dtype of the returned tensor. Returns: torch.Tensor: The result of matmul. diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp index ab8cbbbf4ec4f..51bca37e699b9 100644 --- a/csrc/cpu/cpu_types_vxe.hpp +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -12,7 +12,7 @@ namespace vec_op { #define vec_sub(a, b) ((a) - (b)) #define vec_mul(a, b) ((a) * (b)) #define vec_div(a, b) ((a) / (b)) -#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left // FIXME: FP16 is not fully supported in Torch-CPU diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp index beeccff783ea0..94b24c2f13a06 100644 --- a/csrc/cpu/sgl-kernels/moe.cpp +++ b/csrc/cpu/sgl-kernels/moe.cpp @@ -215,7 +215,7 @@ int moe_align_block_size( offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); } }); - // TODO: do we need to vecterize this ? + // TODO: do we need to vectorize this ? for (int mb = 0; mb < num_token_blocks; ++mb) { offsets[mb + 1] += offsets[mb]; } diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md index 247072d1cb275..6e92b20d267b4 100644 --- a/docs/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -8,7 +8,7 @@ page for information on known issues and how to solve them. ## Introduction !!! important - The source code references are to the state of the code at the time of writing in December, 2024. + The source code references are to the state of the code at the time of writing in December 2024. The use of Python multiprocessing in vLLM is complicated by: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index d8cb208c4f2ea..78c768f92d3c2 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -901,7 +901,7 @@ def _get_query_key_seq_metadata( attn_metadata.encoder_seq_start_loc, attn_metadata.max_encoder_seq_len) elif attn_type == AttentionType.ENCODER: - # For encoder attention both the query and the key are same i.e the + # For encoder attention both the query and the key are same i.e. the # encoder sequence. return (attn_metadata.encoder_seq_start_loc, attn_metadata.max_encoder_seq_len, diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 32820b026b6f6..bf9e87198bcf1 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset): [6880, 6881] -> ['Ġcalls', 'here'] -> [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] To avoid uncontrolled change of the prompt length, - the encoded sequence is truncated before being decode again. + the encoded sequence is truncated before being decoded again. """ # Build the inner sequence by sampling sequentially from the vocab inner_seq = ((offset + index + np.arange(input_len)) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index c8ecbd28e7db5..4dcb1f3f1c89f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors], elif processors: raise ValueError( "The `logits_processors` argument is not supported by this " - "server. See --logits-processor-pattern engine argugment " + "server. 
See --logits-processor-pattern engine argument " "for more information.") return None diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 04ebdbca85e5d..1623a2fd562c7 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp): # - the weight already has a "weight_loader" attribute # which set_weight_attrs will raise if we do not # delete before trying to override it - # - ditto for the otther two weights below + # - ditto for the other two weights below delattr(self.conv1d.bias, "weight_loader") set_weight_attrs( self.conv1d.bias, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 04176c5589ed6..9b2d84e32151a 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def _process_multimodal_inputs(self, modalities: dict): # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a17c8783d48fa..6572e421b65b9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens += num_pad # If cudagraph_mode.decode_mode() == FULL and - # cudagraph_mode.seperate_routine(). This means that we are using + # cudagraph_mode.separate_routine(). This means that we are using # different graphs and/or modes for mixed prefill-decode batches vs. # uniform decode batches. A uniform decode batch means that all # requests have identical query length, except a potential virtual diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index 81c798685cb3a..dfa54d0ad83b6 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -392,7 +392,7 @@ class InputBatch: # NOTE: the following is unsafe # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\ # self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...] - # instead, we need to temporiarily copy the data for one of the indices + # instead, we need to temporarily copy the data for one of the indices # TODO(lucas): optimize this by only copying valid indices tmp = self.token_ids_cpu[i1, ...].copy() self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] 
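For readers tracking the PATCH 221 refactor above: the change collapses the three parallel per-request lists (mm_kwargs, mm_hashes, mm_positions) into a single list of MultiModalFeatureSpec objects. A minimal sketch of the before/after access pattern follows — illustrative only, not part of the patch series; the "hash-0" identifier and the length of 16 are placeholder values, and the old-style lines are shown as comments because they no longer exist after the diffs above:

    from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange

    # One spec now carries what the three parallel lists used to hold.
    feature = MultiModalFeatureSpec(
        data=None,            # former mm_kwargs entry (encoder input kwargs)
        modality="image",
        identifier="hash-0",  # former mm_hashes entry
        mm_position=PlaceholderRange(offset=0, length=16),  # former mm_positions entry
    )

    # Old consumer pattern (removed by the diffs above):
    #     mm_hash = request.mm_hashes[i]
    #     pos = request.mm_positions[i]
    # New consumer pattern:
    mm_hash = feature.identifier
    start, num_tokens = feature.mm_position.offset, feature.mm_position.length

One practical consequence, visible in the kv_cache_utils.py hunk above: the old code had to raise a ValueError when len(mm_positions) != len(mm_hashes), whereas a single list of specs cannot drift out of sync, so that check is deleted outright rather than migrated.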
From 41f17bf2900611211f9a2145e11008f0c8cb45b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Fri, 12 Sep 2025 22:43:15 +0900 Subject: [PATCH 223/584] [Docs] Fix warnings in mkdocs build (continued) (#24740) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- .../layers/quantization/torchao.py | 10 +- .../layers/quantization/utils/int8_utils.py | 2 +- .../layers/rotary_embedding/mrope.py | 4 +- .../model_executor/model_loader/tensorizer.py | 89 +++++++-------- vllm/model_executor/models/aria.py | 16 +-- vllm/model_executor/models/bart.py | 104 +++++++----------- vllm/model_executor/models/blip2.py | 1 - vllm/model_executor/models/donut.py | 18 +-- vllm/model_executor/models/florence2.py | 38 +++---- vllm/model_executor/models/glm4_1v.py | 15 +-- 10 files changed, 121 insertions(+), 176 deletions(-) diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 3498d2994c2ab..2efb605f203fd 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -144,9 +144,9 @@ def torchao_quantize_param_data(param: torch.Tensor, """Quantize a Tensor with torchao quantization specified by torchao_config Args: - `param`: weight parameter of the linear module - `torchao_config`: type of quantization and their arguments we want to - use to quantize the Tensor + param: weight parameter of the linear module + torchao_config: type of quantization and their arguments we want to + use to quantize the Tensor """ from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ @@ -172,8 +172,8 @@ class TorchAOLinearMethod(LinearMethodBase): """Linear method for torchao. Args: - torchao_config: The torchao quantization config, a string - that encodes the type of quantization and all relevant arguments. + quant_config: The torchao quantization config, a string that encodes + the type of quantization and all relevant arguments. """ def __init__(self, quant_config: TorchAOConfig): diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 6840cabbf1ae3..62e458ec3c93e 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -423,7 +423,7 @@ def w8a8_block_int8_matmul( Bs: The per-block quantization scale for `B`. block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. - output_dytpe: The dtype of the returned tensor. + output_dtype: The dtype of the returned tensor. Returns: torch.Tensor: The result of matmul. diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 27ca010cdc3ae..786a6e1b3e129 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -135,8 +135,8 @@ def triton_mrope( """Qwen2VL mrope kernel. 
Args: - query: [num_tokens, num_heads * head_size] - key: [num_tokens, num_kv_heads * head_size] + q: [num_tokens, num_heads * head_size] + k: [num_tokens, num_kv_heads * head_size] cos: [3, num_tokens, head_size //2 ] (T/H/W positions with multimodal inputs) sin: [3, num_tokens, head_size //2 ] diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 3d491be3156b6..58296131fadb9 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -171,51 +171,52 @@ class TensorizerConfig(MutableMapping): _is_sharded: bool = field(init=False, default=False) _fields: ClassVar[tuple[str, ...]] _keys: ClassVar[frozenset[str]] - """ - Args for the TensorizerConfig class. These are used to configure the - behavior of model serialization and deserialization using Tensorizer. + """Configuration class for Tensorizer settings. - Args: - tensorizer_uri: Path to serialized model tensors. Can be a local file - path or a S3 URI. This is a required field unless lora_dir is - provided and the config is meant to be used for the - `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or - `lora_dir` is passed to this object's initializer, this is a required - argument. - tensorizer_dir: Path to a directory containing serialized model tensors, - and all other potential model artifacts to load the model, such as - configs and tokenizer files. Can be passed instead of `tensorizer_uri` - where the `model.tensors` file will be assumed to be in this - directory. - vllm_tensorized: If True, indicates that the serialized model is a - vLLM model. This is used to determine the behavior of the - TensorDeserializer when loading tensors from a serialized model. - It is far faster to deserialize a vLLM model as it utilizes - tensorizer's optimized GPU loading. Note that this is now - deprecated, as serialized vLLM models are now automatically - inferred as vLLM models. - verify_hash: If True, the hashes of each tensor will be verified against - the hashes stored in the metadata. A `HashMismatchError` will be - raised if any of the hashes do not match. - num_readers: Controls how many threads are allowed to read concurrently - from the source file. Default is `None`, which will dynamically set - the number of readers based on the number of available - resources and model size. This greatly increases performance. - encryption_keyfile: File path to a binary file containing a - binary key to use for decryption. `None` (the default) means - no decryption. See the example script in - examples/others/tensorize_vllm_model.py. - s3_access_key_id: The access key for the S3 bucket. Can also be set via - the S3_ACCESS_KEY_ID environment variable. - s3_secret_access_key: The secret access key for the S3 bucket. Can also - be set via the S3_SECRET_ACCESS_KEY environment variable. - s3_endpoint: The endpoint for the S3 bucket. Can also be set via the - S3_ENDPOINT_URL environment variable. - lora_dir: Path to a directory containing LoRA adapter artifacts for - serialization or deserialization. When serializing LoRA adapters - this is the only necessary parameter to pass to this object's - initializer. - """ + These settings configure the behavior of model serialization and + deserialization using Tensorizer. + + Attributes: + tensorizer_uri: Path to serialized model tensors. Can be a local file + path or a S3 URI. 
This is a required field unless lora_dir is + provided and the config is meant to be used for the + `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or + `lora_dir` is passed to this object's initializer, this is + a required argument. + tensorizer_dir: Path to a directory containing serialized model tensors, + and all other potential model artifacts to load the model, such as + configs and tokenizer files. Can be passed instead of + `tensorizer_uri` where the `model.tensors` file will be assumed + to be in this directory. + vllm_tensorized: If True, indicates that the serialized model is a + vLLM model. This is used to determine the behavior of the + TensorDeserializer when loading tensors from a serialized model. + It is far faster to deserialize a vLLM model as it utilizes + tensorizer's optimized GPU loading. Note that this is now + deprecated, as serialized vLLM models are now automatically + inferred as vLLM models. + verify_hash: If True, the hashes of each tensor will be verified + against the hashes stored in the metadata. A `HashMismatchError` + will be raised if any of the hashes do not match. + num_readers: Controls how many threads are allowed to read concurrently + from the source file. Default is `None`, which will dynamically set + the number of readers based on the number of available + resources and model size. This greatly increases performance. + encryption_keyfile: File path to a binary file containing a + binary key to use for decryption. `None` (the default) means + no decryption. See the example script in + examples/others/tensorize_vllm_model.py. + s3_access_key_id: The access key for the S3 bucket. Can also be set via + the S3_ACCESS_KEY_ID environment variable. + s3_secret_access_key: The secret access key for the S3 bucket. Can also + be set via the S3_SECRET_ACCESS_KEY environment variable. + s3_endpoint: The endpoint for the S3 bucket. Can also be set via the + S3_ENDPOINT_URL environment variable. + lora_dir: Path to a directory containing LoRA adapter artifacts for + serialization or deserialization. When serializing LoRA adapters + this is the only necessary parameter to pass to this object's + initializer. + """ def __post_init__(self): # check if the configuration is for a sharded vLLM model diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 1c7960fa3e0a5..db262447d7fa8 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -143,16 +143,8 @@ class AriaProjector(nn.Module): projects ViT's outputs into MoE's inputs. Args: - patch_to_query_dict (dict): Maps patch numbers to their corresponding - query numbers, - e.g., {1225: 128, 4900: 256}. This allows for different query sizes - based on image resolution. - embed_dim (int): Embedding dimension. - num_heads (int): Number of attention heads. - kv_dim (int): Dimension of key and value. - ff_dim (int): Hidden dimension of the feed-forward network. - output_dim (int): Output dimension. - norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. + config: [AriaConfig](https://huggingface.co/docs/transformers/main/model_doc/aria#transformers.AriaConfig) + containing projector configuration parameters. Outputs: A tensor with the shape of (batch_size, query_number, output_dim) @@ -282,8 +274,8 @@ class AriaTextMoELayer(nn.Module): Forward pass of the MoE Layer. Args: - hidden_states (torch.Tensor): Input tensor of shape (batch_size, - sequence_length, hidden_size). 
+ hidden_states: Input tensor of shape + (batch_size, sequence_length, hidden_size). Returns: torch.Tensor: Output tensor after passing through the MoE layer. diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index fd4d820a01e9d..242530817c642 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -401,8 +401,7 @@ class BartEncoderLayer(nn.Module): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: r""" Args: - hidden_states - torch.Tensor of *encoder* input embeddings. + hidden_states: torch.Tensor of *encoder* input embeddings. Returns: Encoder layer output torch.Tensor """ @@ -490,10 +489,8 @@ class BartDecoderLayer(nn.Module): ) -> torch.Tensor: r""" Args: - decoder_hidden_states - torch.Tensor of *decoder* input embeddings. - encoder_hidden_states - torch.Tensor of *encoder* input embeddings. + decoder_hidden_states: torch.Tensor of *decoder* input embeddings. + encoder_hidden_states: torch.Tensor of *encoder* input embeddings. Returns: Decoder layer output torch.Tensor """ @@ -584,12 +581,10 @@ class BartEncoder(nn.Module): ) -> torch.Tensor: r""" Args: - input_ids - Indices of *encoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - positions - Positions of *encoder* input sequence tokens. + input_ids: Indices of *encoder* input sequence tokens in the + vocabulary. + Padding will be ignored by default should you provide it. + positions: Positions of *encoder* input sequence tokens. Returns: Decoder output torch.Tensor """ @@ -663,14 +658,11 @@ class BartDecoder(nn.Module): ) -> torch.Tensor: r""" Args: - decoder_input_ids - Indices of *decoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - decoder_positions - Positions of *decoder* input sequence tokens. - encoder_hidden_states: - Tensor of encoder output embeddings + decoder_input_ids: Indices of *decoder* input sequence tokens + in the vocabulary. + Padding will be ignored by default should you provide it. + decoder_positions: Positions of *decoder* input sequence tokens. + encoder_hidden_states: Tensor of encoder output embeddings. Returns: Decoder output torch.Tensor """ @@ -732,16 +724,13 @@ class BartModel(nn.Module, SupportsQuant): encoder_positions: torch.Tensor) -> torch.Tensor: r""" Args: - input_ids - Indices of *decoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - positions - Positions of *decoder* input sequence tokens. - encoder_input_ids - Indices of *encoder* input sequence tokens in the vocabulary. - encoder_positions: - Positions of *encoder* input sequence tokens. + input_ids: Indices of *decoder* input sequence tokens + in the vocabulary. + Padding will be ignored by default should you provide it. + positions: Positions of *decoder* input sequence tokens. + encoder_input_ids: Indices of *encoder* input sequence tokens + in the vocabulary. + encoder_positions: Positions of *encoder* input sequence tokens. Returns: Model output torch.Tensor """ @@ -848,14 +837,10 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): ) -> torch.Tensor: r""" Args: - input_ids - torch.Tensor of *decoder* input token ids. - positions - torch.Tensor of *decoder* position indices. - encoder_input_ids - torch.Tensor of *encoder* input token ids. 
- encoder_positions - torch.Tensor of *encoder* position indices + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. + encoder_input_ids: torch.Tensor of *encoder* input token ids. + encoder_positions: torch.Tensor of *encoder* position indices. Returns: Output torch.Tensor """ @@ -912,8 +897,7 @@ class MBartEncoderLayer(BartEncoderLayer): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: r""" Args: - hidden_states - torch.Tensor of *encoder* input embeddings. + hidden_states: torch.Tensor of *encoder* input embeddings. Returns: Encoder layer output torch.Tensor """ @@ -1035,12 +1019,10 @@ class MBartEncoder(nn.Module): ) -> torch.Tensor: r""" Args: - input_ids - Indices of *encoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - positions - Positions of *encoder* input sequence tokens. + input_ids: Indices of *encoder* input sequence tokens in the + vocabulary. + Padding will be ignored by default should you provide it. + positions: Positions of *encoder* input sequence tokens. Returns: Decoder output torch.Tensor """ @@ -1116,14 +1098,11 @@ class MBartDecoder(nn.Module): ) -> torch.Tensor: r""" Args: - decoder_input_ids - Indices of *decoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - decoder_positions - Positions of *decoder* input sequence tokens. - encoder_hidden_states: - Tensor of encoder output embeddings + decoder_input_ids: Indices of *decoder* input sequence tokens + in the vocabulary. + Padding will be ignored by default should you provide it. + decoder_positions: Positions of *decoder* input sequence tokens. + encoder_hidden_states: Tensor of encoder output embeddings. Returns: Decoder output torch.Tensor """ @@ -1185,16 +1164,13 @@ class MBartModel(nn.Module, SupportsQuant): encoder_positions: torch.Tensor) -> torch.Tensor: r""" Args: - input_ids - Indices of *decoder* input sequence tokens in the vocabulary. - Padding will be ignored by default should you - provide it. - positions - Positions of *decoder* input sequence tokens. - encoder_input_ids - Indices of *encoder* input sequence tokens in the vocabulary. - encoder_positions: - Positions of *encoder* input sequence tokens. + input_ids: Indices of *decoder* input sequence tokens + in the vocabulary. + Padding will be ignored by default should you provide it. + positions: Positions of *decoder* input sequence tokens. + encoder_input_ids: Indices of *encoder* input sequence tokens + in the vocabulary. + encoder_positions: Positions of *encoder* input sequence tokens. Returns: Model output torch.Tensor """ diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index ed98a3008c567..c1e7a7d498b11 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -678,7 +678,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. 
Info: [Blip2ImageInputs][] diff --git a/vllm/model_executor/models/donut.py b/vllm/model_executor/models/donut.py index c00db52371b68..23f4c6a4f93fc 100644 --- a/vllm/model_executor/models/donut.py +++ b/vllm/model_executor/models/donut.py @@ -79,10 +79,8 @@ class DonutLanguageForConditionalGeneration(nn.Module, SupportsV0Only): ) -> torch.Tensor: r""" Args: - input_ids - torch.Tensor of *decoder* input token ids. - positions - torch.Tensor of *decoder* position indices. + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. Returns: Output torch.Tensor """ @@ -351,14 +349,10 @@ class DonutForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: r""" Args: - input_ids - torch.Tensor of *decoder* input token ids. - positions - torch.Tensor of *decoder* position indices. - encoder_input_ids - torch.Tensor of *encoder* input token ids. - encoder_positions - torch.Tensor of *encoder* position indices + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. + encoder_input_ids: torch.Tensor of *encoder* input token ids. + encoder_positions: torch.Tensor of *encoder* position indices Returns: Output torch.Tensor """ diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index d0881231fb1e7..5e05e0c60f41c 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -631,16 +631,14 @@ class Florence2LanguageModel(nn.Module): ) -> torch.Tensor: r""" Args: - input_ids - Indices of *decoder* input sequence tokens in the vocabulary. + input_ids: Indices of *decoder* input sequence tokens + in the vocabulary. Padding will be ignored by default should you provide it. - positions - Positions of *decoder* input sequence tokens. - encoder_input_ids - Indices of *encoder* input sequence tokens in the vocabulary. - encoder_positions: - Positions of *encoder* input sequence tokens. + positions: Positions of *decoder* input sequence tokens. + encoder_input_ids: Indices of *encoder* input sequence tokens + in the vocabulary. + encoder_positions: Positions of *encoder* input sequence tokens. Returns: Model output torch.Tensor """ @@ -699,14 +697,10 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): ) -> torch.Tensor: r""" Args: - input_ids - torch.Tensor of *decoder* input token ids. - positions - torch.Tensor of *decoder* position indices. - encoder_input_ids - torch.Tensor of *encoder* input token ids. - encoder_positions - torch.Tensor of *encoder* position indices + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. + encoder_input_ids: torch.Tensor of *encoder* input token ids. + encoder_positions: torch.Tensor of *encoder* position indices Returns: Output torch.Tensor """ @@ -1068,14 +1062,10 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: r""" Args: - input_ids - torch.Tensor of *decoder* input token ids. - positions - torch.Tensor of *decoder* position indices. - encoder_input_ids - torch.Tensor of *encoder* input token ids. - encoder_positions - torch.Tensor of *encoder* position indices + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. + encoder_input_ids: torch.Tensor of *encoder* input token ids. 
+ encoder_positions: torch.Tensor of *encoder* position indices Returns: Output torch.Tensor """ diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 279f458dfa6cf..869287fc0268a 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1599,17 +1599,10 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, **NOTE**: If mrope is enabled (default setting for GLM-4V opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. - second_per_grid_ts: Tensor `(num_videos)` of video time interval ( - in seconds) for each grid along the temporal dimension in the - 3D position IDs. `None` if no videos are passed. + intermediate_tensors: Optional intermediate tensors for pipeline + parallelism. + inputs_embeds: Optional pre-computed input embeddings. + **kwargs: Additional keyword arguments. """ if intermediate_tensors is not None: inputs_embeds = None From 4d7c1d531b89b400b31c9d2e369299d8205980e5 Mon Sep 17 00:00:00 2001 From: Yan Ma <yan.ma@intel.com> Date: Fri, 12 Sep 2025 21:43:56 +0800 Subject: [PATCH 224/584] [Bugfix] Fix MRoPE dispatch on XPU (#24724) Signed-off-by: Yan Ma <yan.ma@intel.com> --- vllm/model_executor/layers/rotary_embedding/mrope.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 786a6e1b3e129..69849fdac0277 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -300,6 +300,15 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return self.forward_native(positions, query, key, offsets) + def forward_cpu( self, positions: torch.Tensor, From 9f04d9d55f29c31dcc623abbf74edd0d2fc1252b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com> Date: Fri, 12 Sep 2025 16:54:04 +0200 Subject: [PATCH 225/584] [Qwen3-Next] MoE configs for H100 TP=1,2 and TP2/EP (#24739) Signed-off-by: elvircrn <elvircrn@gmail.com> --- ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json 
b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..cc853947c19f5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..cc1427c139e3e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..564ff499d43c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From a5b84f1cbfeb4030fd7fab4a91d4264b356e20b0 Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Fri, 12 Sep 2025 10:54:17 -0400 Subject: [PATCH 226/584] [Core] Shared memory based object store for Multimodal data caching and IPC (#20452) Signed-off-by: donglu <donglu@cohere.com> --- .buildkite/test-pipeline.yaml | 2 + docs/configuration/optimization.md | 31 +- tests/distributed/test_shm_buffer.py | 172 +++++ tests/distributed/test_shm_storage.py | 327 +++++++++ tests/multimodal/test_cache.py | 8 +- tests/tensorizer_loader/conftest.py | 1 + tools/check_pickle_imports.py | 1 + vllm/config/__init__.py | 20 + .../shm_object_storage.py | 635 ++++++++++++++++++ vllm/engine/arg_utils.py | 17 +- vllm/envs.py | 7 + vllm/executor/uniproc_executor.py | 10 + vllm/multimodal/cache.py | 220 +++++- vllm/utils/__init__.py | 6 + vllm/v1/engine/core.py | 4 +- vllm/v1/executor/multiproc_executor.py | 28 +- vllm/v1/executor/utils.py | 25 + 17 files changed, 1487 insertions(+), 27 deletions(-) create mode 100644 tests/distributed/test_shm_buffer.py create mode 100644 tests/distributed/test_shm_storage.py create mode 100644 vllm/distributed/device_communicators/shm_object_storage.py create mode 100644 vllm/v1/executor/utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b0f5fe418dcf3..b633625a77e30 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -789,6 +789,8 @@ steps: commands: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py - label: 2 Node Tests (4 GPUs in total) # 16min timeout_in_minutes: 30 diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index c853fcf92941e..5807d787cf531 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -230,6 +230,20 @@ 
 Multi-modal IPC caching is automatically enabled when there is a one-to-one
 correspondence between API (`P0`) and engine core (`P1`) processes, to avoid
 repeatedly transferring the same multi-modal inputs between them.
 
+#### Key-Replicated Cache
+
+By default, IPC caching uses a **key-replicated cache**, where cache keys exist
+in both the API (`P0`) and engine core (`P1`) processes, but the actual cache
+data resides only in `P1`.
+
+#### Shared Memory Cache
+
+When multiple worker processes are involved (e.g., when TP > 1), a
+**shared-memory cache** is more efficient. This can be enabled by setting
+`mm_processor_cache_type="shm"`. In this mode, cache keys are stored
+on `P0`, while the cache data itself lives in shared memory accessible by all
+processes.
+
 ### Configuration
 
 You can adjust the size of the cache by setting the value of
 `mm_processor_cache_gb` (default 4 GiB).
@@ -244,6 +258,12 @@
 Examples:
 
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
           mm_processor_cache_gb=8)
 
+# Use a shared-memory based IPC cache
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          tensor_parallel_size=2,
+          mm_processor_cache_type="shm",
+          mm_processor_cache_gb=8)
+
 # Disable the cache
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
           mm_processor_cache_gb=0)
@@ -253,11 +273,12 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
 
 Based on the configuration, the content of the multi-modal caches on `P0` and
 `P1` are as follows:
 
-| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
-|-------------------|-------------|------------|------------|-------------|
-| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
-| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
-| ❌ | ❌ | N/A | N/A | `0` |
+| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
+|-------------------|-------------|------------|------------|-------------|-------------|
+| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * api_server_count` |
+| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * data_parallel_size` |
+| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
+| N/A | Disabled | N/A | N/A | N/A | `0` |
 
 K: Stores the hashes of multi-modal items
 V: Stores the processed tensor data of multi-modal items
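+
+A hedged sketch of additionally tuning the per-object size limit (the values
+here are illustrative, not recommendations; `mm_shm_cache_max_object_size_mb`
+is only read when `mm_processor_cache_type="shm"`):
+
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          tensor_parallel_size=2,
+          mm_processor_cache_type="shm",
+          mm_processor_cache_gb=8,
+          mm_shm_cache_max_object_size_mb=64)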
diff --git a/tests/distributed/test_shm_buffer.py b/tests/distributed/test_shm_buffer.py
new file mode 100644
index 0000000000000..f70028b879609
--- /dev/null
+++ b/tests/distributed/test_shm_buffer.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import traceback
+import unittest
+
+from vllm.distributed.device_communicators.shm_object_storage import (
+    SingleWriterShmRingBuffer)
+
+
+class TestSingleWriterShmRingBuffer(unittest.TestCase):
+    """Test suite for the ring buffer implementation"""
+
+    def setUp(self):
+        """Set up test fixtures"""
+        self.buffer_size = 4096
+        self.ring_buffer = None
+
+    def tearDown(self):
+        """Clean up after tests"""
+        if self.ring_buffer:
+            del self.ring_buffer
+
+    def test_buffer_opening(self):
+        """Test opening an existing buffer"""
+        # First create a buffer
+        self.ring_buffer = SingleWriterShmRingBuffer(
+            data_buffer_size=self.buffer_size, create=True)
+
+        # Then open it with another instance
+        reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
+        self.assertFalse(reader_buffer.is_writer)
+        self.assertEqual(reader_buffer.shared_memory.name,
+                         self.ring_buffer.shared_memory.name)
+
+    def test_buffer_access(self):
+        """Test accessing allocated buffers"""
+        self.ring_buffer = SingleWriterShmRingBuffer(
+            data_buffer_size=self.buffer_size, create=True)
+
+        size = 100
+        address, monotonic_id = self.ring_buffer.allocate_buf(size)
+
+        # Write some test data
+        test_data = b"Hello, World!"
* 7 # 91 bytes + with self.ring_buffer.access_buf(address) as (data_buf, metadata): + data_buf[0:len(test_data)] = test_data + + # Read it back + with self.ring_buffer.access_buf(address) as (data_buf2, metadata2): + read_data = bytes(data_buf2[0:len(test_data)]) + read_id = metadata2[0] + + self.assertEqual(read_data, test_data) + self.assertEqual(read_id, monotonic_id) + + def test_memory_error_on_full_buffer(self): + """Test that MemoryError is raised when buffer is full""" + small_buffer_size = 200 + self.ring_buffer = SingleWriterShmRingBuffer( + data_buffer_size=small_buffer_size, create=True) + + # Fill up the buffer + self.ring_buffer.allocate_buf(100) + self.ring_buffer.allocate_buf(80) # Total: 196 bytes used + + # This should fail + with self.assertRaises(MemoryError): + self.ring_buffer.allocate_buf(1) # Would exceed buffer capacity + + def test_allocation_and_free(self): + """Test allocation and freeing of buffers""" + small_buffer_size = 200 + self.ring_buffer = SingleWriterShmRingBuffer( + data_buffer_size=small_buffer_size, create=True) + + size = 80 + # Write some data + test_data = b"Repeated test data" + for i in range(5): + address, monotonic_id = self.ring_buffer.allocate_buf(size) + with self.ring_buffer.access_buf(address) as (data_buf, metadata): + data_buf[0:4] = (0).to_bytes(4, "little") # 0 for not in-use + data_buf[4:len(test_data) + 4] = test_data + print(self.ring_buffer.metadata) + freed_ids = self.ring_buffer.free_buf(lambda *args: True) + print(f" Freed IDs: {freed_ids}") + self.assertEqual(freed_ids[0], i) + + def test_clear_buffer(self): + """Test clearing the buffer""" + self.ring_buffer = SingleWriterShmRingBuffer( + data_buffer_size=self.buffer_size, create=True) + + # Allocate some buffers + for _ in range(3): + self.ring_buffer.allocate_buf(100) + + # Clear the buffer + self.ring_buffer.clear() + + # Check that metadata is empty and IDs reset + self.assertEqual(len(self.ring_buffer.metadata), 0) + self.assertEqual(self.ring_buffer.monotonic_id_start, 0) + self.assertEqual(self.ring_buffer.monotonic_id_end, 0) + self.assertEqual(self.ring_buffer.data_buffer_start, 0) + self.assertEqual(self.ring_buffer.data_buffer_end, 0) + + +def main(): + """Main function demonstrating usage and running tests""" + print("=== SingleWriterShmRingBuffer Test Suite ===\n") + + # Run unit tests + print("Running unit tests...") + unittest.main(argv=[""], exit=False, verbosity=2) + + print("\n" + "=" * 50) + print("=== Manual Demo ===\n") + + # Manual demonstration + try: + print("Creating ring buffer...") + writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, + create=True) + reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle()) + + print(f"Buffer created with name: {writer_buffer.shared_memory.name}") + + # Allocate some buffers + print("\nAllocating buffers...") + address_array = [] + for i in range(3): + size = 100 + i * 50 + try: + writer_buffer.free_buf(lambda *args: True) + address, monotonic_id = writer_buffer.allocate_buf(size) + address_array.append((address, size, monotonic_id)) + + # Write some test data + with writer_buffer.access_buf(address) as (data_buf, metadata): + test_message = f"Test message {i}".encode() + data_buf[0:len(test_message)] = test_message + + except MemoryError as e: + print(f" Failed to allocate {size} bytes: {e}") + + print("\nBuffer state:") + print(f" Data buffer start: {writer_buffer.data_buffer_start}") + print(f" Data buffer end: {writer_buffer.data_buffer_end}") + print(f" Monotonic ID start: 
{writer_buffer.monotonic_id_start}")
+        print(f"  Monotonic ID end: {writer_buffer.monotonic_id_end}")
+        print(f"  Metadata entries: {len(writer_buffer.metadata)}")
+
+        # Try to read back the data
+        print("\nReading back data...")
+        for address, size, monotonic_id in address_array:
+            with reader_buffer.access_buf(address) as (data_buf, metadata):
+                # Decode the stored bytes for display (messages are shorter
+                # than the allocation; trailing bytes are zero-filled)
+                data_bytes = bytes(data_buf[0:size])
+                message = data_bytes.decode()
+                print(f"  ID {monotonic_id}: '{message}'")
+
+    except Exception as e:
+        print(f"Demo error: {e}")
+        traceback.print_exc()
+
+    print("\n=== Demo Complete ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/distributed/test_shm_storage.py b/tests/distributed/test_shm_storage.py
new file mode 100644
index 0000000000000..03495222bc1b8
--- /dev/null
+++ b/tests/distributed/test_shm_storage.py
@@ -0,0 +1,327 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+import random
+import time
+import traceback
+import unittest
+from multiprocessing import Lock
+
+import torch
+
+# Assuming these are imported from your module
+from vllm.distributed.device_communicators.shm_object_storage import (
+    MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
+                                    MultiModalSharedField)
+
+
+def _dummy_elem(modality: str, key: str, size: int):
+    return MultiModalFieldElem(
+        modality=modality,
+        key=key,
+        data=torch.empty((size, ), dtype=torch.int8),
+        field=MultiModalSharedField(1),
+    )
+
+
+def _dummy_item(modality: str, size_by_key: dict[str, int]):
+    return MultiModalKwargsItem.from_elems([
+        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
+    ])
+
+
+class TestSingleWriterShmObjectStorage(unittest.TestCase):
+
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        ring_buffer = SingleWriterShmRingBuffer(
+            data_buffer_size=1024 * 100,
+            create=True,  # 100 KB buffer
+        )
+        self.storage = SingleWriterShmObjectStorage(
+            max_object_size=1024 * 10,  # 10KB max object
+            n_readers=2,
+            ring_buffer=ring_buffer,
+            serde_class=MsgpackSerde,
+            reader_lock=Lock(),
+        )
+
+    def tearDown(self):
+        """Clean up after each test."""
+        if self.storage:
+            del self.storage
+
+    def test_minimal_put_get_cycle(self):
+        """Test basic put and get operations."""
+        key = "test_key"
+        value = _dummy_item("text", {"field1": 10, "field2": 20})
+
+        # Put operation
+        address, monotonic_id = self.storage.put(key, value)
+
+        # Verify key is in index
+        self.assertIn(key, self.storage.key_index)
+        self.assertEqual(self.storage.key_index[key], (address, monotonic_id))
+        self.assertEqual(self.storage.id_index[monotonic_id], key)
+
+        # Get operation
+        result = self.storage.get(address, monotonic_id)
+
+        # Verify result
+        self.assertEqual(result, value)
+
+    def test_put_same_key_twice(self):
+        """Test behavior when putting the same key multiple times."""
+        key = "duplicate_key"
+        value1 = "first value"
+        value2 = "second value"
+
+        # First put
+        address1, id1 = self.storage.put(key, value1)
+        retrieved1 = self.storage.get(address1, id1)
+        self.assertEqual(retrieved1, value1)
+
+        # should raise an error on second put
+        with self.assertRaises(ValueError) as context:
+            self.storage.put(key, value2)
+
+        self.assertIn("already exists in the storage", str(context.exception))
+
+    def test_large_object_rejection(self):
+        """Test that objects exceeding max_object_size
are rejected."""
+        # Create an object larger than max_object_size
+        large_data = "x" * (self.storage.max_object_size + 100)
+
+        with self.assertRaises(ValueError) as context:
+            self.storage.put("large_key", large_data)
+
+        self.assertIn("exceeds max object size", str(context.exception))
+
+    def test_buffer_overflow_and_cleanup(self):
+        """Test behavior when buffer fills up and needs cleanup."""
+        # Fill up the buffer with many small objects
+        stored_items = []
+
+        try:
+            for i in range(1000):  # Try to store many items
+                key = f"item_{i}"
+                value = f"data_{i}" * 100  # Make it reasonably sized
+                address, monotonic_id = self.storage.put(key, value)
+                stored_items.append((key, value, address, monotonic_id))
+        except MemoryError:
+            print(f"Buffer filled after {len(stored_items)} items")
+
+        # Verify that all stored items are still accessible
+        accessible_count = 0
+        for key, original_value, address, monotonic_id in stored_items:
+            for i in range(self.storage.n_readers):
+                retrieved = self.storage.get(address, monotonic_id)
+            if retrieved == original_value:
+                accessible_count += 1
+
+        self.assertEqual(accessible_count, len(stored_items))
+
+        try:
+            for i in range(len(stored_items), 1000):  # Try to store many items
+                key = f"item_{i}"
+                value = f"data_{i}" * 100  # Make it reasonably sized
+                address, monotonic_id = self.storage.put(key, value)
+                stored_items.append((key, value, address, monotonic_id))
+        except MemoryError:
+            print(f"Buffer filled after {len(stored_items)} items")
+
+        # Verify that some items are still accessible
+        for key, original_value, address, monotonic_id in stored_items:
+            try:
+                for i in range(self.storage.n_readers):
+                    retrieved = self.storage.get(address, monotonic_id)
+                if retrieved == original_value:
+                    accessible_count += 1
+            except ValueError as e:
+                print(f"Error retrieving {key}: {e}")
+
+        # some items from the first batch may still be accessible
+        self.assertGreaterEqual(accessible_count, len(stored_items))
+
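+    # Note on the read loops above: each stored chunk is read once per
+    # configured reader (n_readers=2 in setUp), because the writer only
+    # reclaims a chunk once its reader count reaches
+    # writer_count * n_readers (see default_is_free_check). Sketch of the
+    # bookkeeping for one chunk (illustrative):
+    #   put()            -> writer_count = 1, reader_count = 0
+    #   get() by reader0 -> reader_count = 1
+    #   get() by reader1 -> reader_count = 2  -> reclaimable
+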
+    def test_blocking_unread_object(self):
+        """Test that an unread object blocks reclamation of its space."""
+        # Fill up the buffer with many small objects
+        stored_items = []
+
+        try:
+            for i in range(1000):  # Try to store many items
+                key = f"item_{i}"
+                value = f"data_{i}" * 100  # Make it reasonably sized
+                address, monotonic_id = self.storage.put(key, value)
+                stored_items.append((key, value, address, monotonic_id))
+        except MemoryError:
+            print(f"Buffer filled after {len(stored_items)} items")
+
+        # read all items except the first one
+        # to simulate a blocking situation
+        accessible_count = 0
+        for key, original_value, address, monotonic_id in stored_items[1:]:
+            for i in range(self.storage.n_readers):
+                retrieved = self.storage.get(address, monotonic_id)
+            if retrieved == original_value:
+                accessible_count += 1
+
+        self.assertEqual(accessible_count, len(stored_items) - 1)
+
+        try:
+            key = f"item_{len(stored_items)}"
+            value = f"data_{len(stored_items)}" * 100
+            address, monotonic_id = self.storage.put(key, value)
+        except MemoryError:
+            print(f"Buffer filled after {len(stored_items)} items")
+
+        # read the first item
+        for i in range(self.storage.n_readers):
+            key, original_value, address, monotonic_id = stored_items[0]
+            retrieved = self.storage.get(address, monotonic_id)
+            self.assertEqual(retrieved, original_value)
+
+        try:
+            for i in range(len(stored_items), 1000):  # Try to store many items
+                key = f"item_{i}"
+                value = f"data_{i}" * 100  # Make it reasonably sized
+                address, monotonic_id = self.storage.put(key, value)
+                stored_items.append((key, value, address, monotonic_id))
+        except MemoryError:
+            print(f"Buffer filled after {len(stored_items)} items")
+
+        # once the first item has been read, its space can be reclaimed,
+        # so the second fill pass fits noticeably more items
+        self.assertGreaterEqual(len(stored_items), accessible_count + 10)
+
+    def test_invalid_get_operations(self):
+        """Test various invalid get operations."""
+        # Test with non-existent address
+        with self.assertRaises(ValueError):  # Could be various exceptions
+            self.storage.get(99999, 1)
+
+        # Store something first
+        address, monotonic_id = self.storage.put("test", "value")
+
+        # Test with wrong monotonic_id
+        with self.assertRaises(ValueError) as context:
+            self.storage.get(address, monotonic_id + 100)
+
+        self.assertIn("has been modified or is invalid", \
+                      str(context.exception))
+
+    def test_clear_storage(self):
+        """Test clearing the storage."""
+        # Store some items
+        for i in range(5):
+            self.storage.put(f"item_{i}", f"value_{i}")
+
+        # Clear the storage
+        self.storage.clear()
+
+        # Verify that all indices are empty
+        self.assertEqual(len(self.storage.key_index), 0)
+        self.assertEqual(len(self.storage.id_index), 0)
+        self.assertEqual(len(self.storage.ring_buffer.metadata), 0)
+
+        # Verify that new items can be added after clearing
+        address, monotonic_id = self.storage.put("new_item", "new_value")
+        self.assertIn("new_item", self.storage.key_index)
+        self.assertEqual((address, monotonic_id), (0, 0))
+
+
+# Reader process function
+def reader_process(process_id, storage_handle, items_to_read):
+    """Reader process that connects to existing shared memory and reads data."""
+    reader_storage = SingleWriterShmObjectStorage.create_from_handle(
+        storage_handle)
+
+    print(f"Reader {process_id} started")
+
+    errors = []
+
+    for key, original_value, address, monotonic_id in items_to_read:
+        time.sleep(random.random() / 100)
+        try:
+            # Read data from shared memory
+            retrieved_value = reader_storage.get(address, monotonic_id)
+
+            # Verify data integrity
+            assert retrieved_value == original_value
+            print(f"Reader {process_id} retrieved {key}: {retrieved_value}")
+        except Exception as e:
+            errors.append((key, str(e), type(e).__name__))
+
+
+def run_multiprocess_example():
+    """Run a minimal working example with real shared memory."""
+    print("=== Minimal Object Storage Example ===")
+
+    try:
+        # Create storage instance
+        ring_buffer = SingleWriterShmRingBuffer(
+            data_buffer_size=1024 * 100,
+            create=True,  # 100 KB buffer
+        )
+        storage = SingleWriterShmObjectStorage(
+            max_object_size=1024,
+            n_readers=3,
+            ring_buffer=ring_buffer,
+            serde_class=MsgpackSerde,
+            reader_lock=Lock(),
+        )
+
+        print(f"Created storage (writer: {storage.is_writer})")
+
+        # Test basic data types
+        test_data = [
+            ("user_data", {
+                "name": "Alice",
+                "age": 30,
+                "scores": [95, 87, 92]
+            }),
+            ("simple_string", "Hello, World!"),
+            ("number", 42),
+            ("list_data", [1, 2, 3, "four", 5.0]),
+        ]
+
+        stored_items = []
+
+        # Store all data
+        for key, value in test_data:
+            print(f"Storing {key}: {value}")
+            address, monotonic_id = storage.put(key, value)
+            stored_items.append((key, value, address, monotonic_id))
+            print(f"  -> Stored at address {address}, ID {monotonic_id}")
+
+        print("\n--- Retrieving Data ---")
+        processes = []
+        handle = storage.handle()
+        # initialize lock for reader processes
+        handle.reader_lock = Lock()
+        for i in range(storage.n_readers):
+            p = multiprocessing.Process(target=reader_process,
+                                        args=(i, handle, stored_items))
+            processes.append(p)
+            p.start()
+
+        for p in processes:
+            p.join(timeout=10)
+            if p.is_alive():
+                p.terminate()
+                p.join()
+
+    
except Exception as e: + print(f"Error in minimal example: {e}") + traceback.print_exc() + + +if __name__ == "__main__": + # Run the minimal example first + run_multiprocess_example() + print("\n" + "=" * 50 + "\n") + + # Run the test suite + print("Running comprehensive test suite...") + unittest.main(verbosity=2, exit=False) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 44c05db2278f7..3c61ee26e092e 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -10,8 +10,8 @@ from vllm.config import ModelConfig, ParallelConfig, VllmConfig from vllm.multimodal.cache import (MultiModalCache, MultiModalProcessorCacheItem, MultiModalProcessorCacheItemMetadata, - processor_cache_from_config, - receiver_cache_from_config) + engine_receiver_cache_from_config, + processor_cache_from_config) from vllm.multimodal.hasher import MultiModalHasher from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, @@ -115,9 +115,9 @@ def _compare_caches( ): mm_registry = MultiModalRegistry() cache_0_p0 = processor_cache_from_config(config_0, mm_registry) - cache_0_p1 = receiver_cache_from_config(config_0, mm_registry) + cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry) cache_1_p0 = processor_cache_from_config(config_1, mm_registry) - cache_1_p1 = receiver_cache_from_config(config_1, mm_registry) + cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry) cache_size_gb = max( config_0.model_config.mm_processor_cache_gb, diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 18aa4c88c0338..571dc2e0eb50f 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -90,6 +90,7 @@ class DummyExecutor(UniProcExecutor): distributed_init_method=distributed_init_method, is_driver_worker=is_driver_worker, ) + self.mm_receiver_cache = None self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index ad0ae45d1d465..fe717121db40d 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -39,6 +39,7 @@ ALLOWED_FILES = set([ 'vllm/engine/multiprocessing/client.py', 'vllm/distributed/device_communicators/all_reduce_utils.py', 'vllm/distributed/device_communicators/shm_broadcast.py', + 'vllm/distributed/device_communicators/shm_object_storage.py', 'vllm/engine/multiprocessing/engine.py', 'benchmarks/kernels/graph_machete_bench.py', 'benchmarks/kernels/benchmark_lora.py', diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ee58802766c4c..85e58c290b792 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -262,6 +262,7 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] class LogprobsMode(enum.Enum): @@ -450,6 +451,13 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. 
If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" mm_encoder_tp_mode: MMEncoderTPMode = "weights" """Indicates how to optimize multi-modal encoder inference using tensor parallelism (TP). @@ -881,6 +889,9 @@ class ModelConfig: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_processor_cache_type=self.mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=self. + mm_shm_cache_max_object_size_mb, mm_encoder_tp_mode=self.mm_encoder_tp_mode, interleave_mm_strings=self.interleave_mm_strings, skip_mm_profiling=self.skip_mm_profiling, @@ -2448,6 +2459,15 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" """ Indicates how to optimize multi-modal encoder inference using diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py new file mode 100644 index 0000000000000..3fac104bda1e8 --- /dev/null +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -0,0 +1,635 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pickle +from abc import ABC, abstractmethod +from collections.abc import Iterable +from contextlib import contextmanager +from dataclasses import dataclass +from itertools import chain +from multiprocessing import shared_memory +from multiprocessing.synchronize import Lock as LockType +from typing import Any, Callable, Optional, Union +from unittest.mock import patch + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class SingleWriterShmRingBuffer: + """ + A single-writer, multiple-reader ring buffer implementation using shared + memory. This class provides a thread-safe ring buffer where one process + can write data while multiple processes/threads can read from it. + + Architecture: + - Uses shared memory for cross-process communication + - Maintains metadata for each allocated buffer chunk in the writer process + - Supports custom "is_free_fn" functions to determine when buffers can be + reused + - Each buffer chunk contains: [4-byte id][4-byte size][actual_data] + + Key Concepts: + - monotonic_id_start/end: Track the range of active buffer IDs + - data_buffer_start/end: Track the physical memory range in use + - Automatic wraparound when reaching buffer end + - Lazy garbage collection based on is_free_fn checks + + Example Usage Scenarios: + + Scenario 1: Simple Linear Allocation + ``` + Buffer size: 100 bytes + Initial state: [................................................. ] + ^start=end(0) + + After allocating 20 bytes (id=0): + [id:0|size:20|data........][...................................] 
+ ^start(0) ^end(28) + + After allocating 30 bytes (id=1): + [id:0|size:20|data........][id:1|size:30|data..............][..] + ^start(0) ^end(66) + ``` + + Scenario 2: Memory Reclamation + ``` + Before freeing (both buffers still in use): + [id:0|size:20|data........][id:1|size:30|data..............][..] + ^start(0) ^end(66) + + After id:0 is marked free by readers: + [FREED.................... ][id:1|size:30|data..............][..] + ^start(28) ^end(66) + + After both are freed: + [FREED..............................................][..] + ^start=end(66) + ``` + + Scenario 3: Wraparound Allocation (continuing from Scenario 2) + ``` + Starting from after memory reclamation in Scenario 2: + [FREED..............................................][..] + ^start=end(66) + + Allocate 40 bytes (id=2) - only 34 bytes available at end, so wraparound: + [id:2|size:40|data........................][FREED.............][..] + ^end(148) ^start(66) + ``` + + Scenario 4: Error Handling - Out of Space + ``` + Starting from after wraparound allocation in Scenario 3: + [id:2|size:40|data........................][FREED.............][..] + ^end(148) ^start(66) + + Trying to allocate 20 more bytes: + occupied_size_new = end + size - start = 148 + 28 - 66 > buffer_size(100) + -> Raises MemoryError: "Not enough space in the data buffer" + ``` + + Thread Safety: + - Single writer: Only one process/thread should write (allocate_buf) + - Multiple readers: Multiple processes/threads can read (access_buf) + - Reader synchronization handled by is_free_fn callback + - Writer handles garbage collection (free_buf) based on reader feedback + + Memory Layout per Buffer Chunk: + [4-byte monotonic_id][4-byte chunk_size][actual_data...] + ^metadata_start ^data_start + + The monotonic_id ensures data integrity - readers can verify they're + accessing the correct data even after buffer wraparound or reuse. + """ + + def __init__( + self, + data_buffer_size: int, + name: Optional[str] = None, + create: bool = False, + ): + self.data_buffer_size = data_buffer_size + self.is_writer = create + + self.ID_NBYTES = 4 + self.ID_MAX = 2**31 # exclusive, so 2**31 - 1 is the max value + self.SIZE_NBYTES = 4 + # 4 bytes for id, 4 bytes for buffer size + self.MD_SIZE = self.ID_NBYTES + self.SIZE_NBYTES + self.monotonic_id_end = 0 + self.monotonic_id_start = 0 + self.data_buffer_start = 0 + self.data_buffer_end = 0 + + if create: + # we are creating a buffer + self.metadata = { + self.monotonic_id_end: self.data_buffer_end + } # monotonic_id -> start address + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.data_buffer_size, name=name) + else: + # we are opening an existing buffer + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + self.shared_memory = shared_memory.SharedMemory(name=name) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block. 
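+            # (Illustrative: with a typical 4 KiB page size, a request for
+            # 100 bytes can be backed by a 4096-byte segment, which is why
+            # the check below uses `>=` rather than `==`.)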
+ assert self.shared_memory.size >= self.data_buffer_size + + logger.debug("Shared memory created/opened with name: %s, size: %d", + self.shared_memory.name, self.data_buffer_size) + + def handle(self): + return ( + self.data_buffer_size, + self.shared_memory.name, + ) + + def clear(self) -> None: + """Clear the ring buffer.""" + assert self.is_writer, "Only the writer can clear the buffer." + self.metadata.clear() + self.monotonic_id_end = 0 + self.monotonic_id_start = 0 + self.data_buffer_start = 0 + self.data_buffer_end = 0 + + def __del__(self): + if hasattr(self, "shared_memory"): + self.shared_memory.close() + if self.is_writer: + self.shared_memory.unlink() + + def int2byte(self, integer: int) -> bytes: + """Convert an integer to bytes.""" + return integer.to_bytes(self.ID_NBYTES, "little", signed=True) + + def byte2int(self, byte_data: bytes) -> int: + """Convert bytes back to an integer.""" + return int.from_bytes(byte_data, "little", signed=True) + + def allocate_buf(self, size: int) -> tuple[int, int]: + ''' + Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory. + Memory layout: + [4-byte monotonic_id][4-byte size][buffer data...] + ''' + assert self.is_writer, "Only the writer can allocate buffers." + assert size > 0, "Size must be greater than 0" + size += self.MD_SIZE # add metadata size to the buffer size + # reset to beginning if the buffer does have enough contiguous space + buffer_end_reset = self.data_buffer_end % self.data_buffer_size + if buffer_end_reset + size > self.data_buffer_size: + buffer_end_reset = (self.data_buffer_end // self.data_buffer_size + + 1) * self.data_buffer_size + else: # no reset needed + buffer_end_reset = self.data_buffer_end + + # check if we have enough space in the data buffer + # i.e. if the new end (self.data_buffer_end + size) + # exceeds the start of the data buffer + occupied_size_new = buffer_end_reset + size - self.data_buffer_start + if occupied_size_new > self.data_buffer_size: + raise MemoryError("Not enough space in the data buffer, " + "try calling free_buf() to free up space") + self.data_buffer_end = buffer_end_reset + + # first 4 bytes as the monotonic id + buf_idx = self.data_buffer_end % self.data_buffer_size + self.shared_memory.buf[buf_idx:buf_idx + self.ID_NBYTES] = \ + self.int2byte(self.monotonic_id_end) + # next 4 bytes as the size of the data buffer + self.shared_memory.buf[buf_idx + self.ID_NBYTES: \ + buf_idx + self.MD_SIZE] = self.int2byte(size) + + # record metadata + self.metadata[self.monotonic_id_end % + self.ID_MAX] = self.data_buffer_end + # update buffer and monotonic id indices + current_buffer_end = self.data_buffer_end + current_id_end = self.monotonic_id_end + self.data_buffer_end += size + self.monotonic_id_end = (self.monotonic_id_end + 1) % self.ID_MAX + return current_buffer_end, current_id_end + + @contextmanager + def access_buf(self, address: int): + buf_idx = address % self.data_buffer_size + + # read metadata + metadata_buff = self.shared_memory.buf[buf_idx:buf_idx + self.MD_SIZE] + id = self.byte2int(metadata_buff[:self.ID_NBYTES]) + size = self.byte2int(metadata_buff[self.ID_NBYTES:self.MD_SIZE]) + + # yield the data buffer and metadata + data_buff = self.shared_memory.buf[buf_idx + self.MD_SIZE:buf_idx + + size] + with (memoryview(data_buff) as data_view, ): + yield data_view, (id, size) + + def free_buf(self, + is_free_fn: Callable[[int, memoryview], bool], + nbytes: Optional[int] = None) -> Iterable[int]: + ''' + Free a buffer of the given size. 
This is a no-op in shared memory, + but we need to keep track of the metadata. + + If freed memory spreads across the end and start of the ring buffer, + the actual freed memory will be in two segments. In this case there + still might not be a contiguous space of `nbytes` available. + + Args: + nbytes (int, optional): The size of the buffer to free. If None, + frees the maximum size of the ring buffer. + ''' + + assert self.is_writer, "Only the writer can free buffers." + logger.debug( + "Freeing up space in the ring buffer, " + "monotonic_id_start: %d, monotonic_id_end: %d", + self.monotonic_id_start, self.monotonic_id_end) + monotonic_id_before = self.monotonic_id_start + # if nbytes is None, free up the maximum size of the ring buffer + if nbytes is None: + nbytes = self.data_buffer_size + freed_bytes = 0 + while self.monotonic_id_start in self.metadata and freed_bytes < nbytes: + address = self.metadata[self.monotonic_id_start] + with self.access_buf(address) as (data_buff, metadata): + if is_free_fn(self.monotonic_id_start, data_buff): + # check passed, we can free the buffer + del self.metadata[self.monotonic_id_start] + self.monotonic_id_start = ((self.monotonic_id_start + 1) % + self.ID_MAX) + self.data_buffer_start = address + freed_bytes += metadata[1] + else: + # there are still readers, we cannot free the buffer + break + + logger.debug( + "Freed %d bytes from the ring buffer, " + "monotonic_id_start: %d, monotonic_id_end: %d", freed_bytes, + self.monotonic_id_start, self.monotonic_id_end) + + # buffer wrap around + if self.data_buffer_start >= self.data_buffer_size: + self.data_buffer_start -= self.data_buffer_size + self.data_buffer_end -= self.data_buffer_size + + monotonic_id_after = self.monotonic_id_start + # id wrap around + if monotonic_id_after >= monotonic_id_before: + return range(monotonic_id_before, monotonic_id_after) + else: + return chain(range(monotonic_id_before, self.ID_MAX), + range(0, monotonic_id_after)) + + +class ObjectSerde(ABC): + + @abstractmethod + def serialize(self, value: Any) -> tuple[Any, int, bytes, int]: + """Serialize an object to bytes.""" + raise NotImplementedError + + @abstractmethod + def deserialize(self, data: memoryview) -> Any: + """Deserialize bytes back to an object.""" + raise NotImplementedError + + +class MsgpackSerde(ObjectSerde): + + def __init__(self): + # Delayed import to avoid circular dependency + from vllm.multimodal.inputs import MultiModalKwargsItem + from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder + + self.encoder = MsgpackEncoder() + self.tensor_decoder = MsgpackDecoder(torch.Tensor) + self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem) + self._mm_kwargs_item_cls = MultiModalKwargsItem + + def serialize( + self, + value: Any) -> tuple[Union[bytes, list[bytes]], int, bytes, int]: + len_arr = None + if isinstance(value, (torch.Tensor, self._mm_kwargs_item_cls)): + type_name = type(value).__name__ + value = self.encoder.encode(value) + len_arr = [len(s) for s in value] + nbytes = sum(len_arr) + else: + value = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL) + type_name = type(value).__name__ + nbytes = len(value) + + object_metadata = (type_name, nbytes, len_arr) + serialized_metadata = pickle.dumps(object_metadata, + protocol=pickle.HIGHEST_PROTOCOL) + return value, nbytes, serialized_metadata, len(serialized_metadata) + + def deserialize(self, data_view: memoryview) -> Any: + # pickle.loads do not read past the end of a pickled object + # within a large buffer, so we can skip storing the 
metadata size + type_name, nbytes, len_arr = pickle.loads(data_view) + serialized_data = bytearray(data_view[-nbytes:]) + + if type_name == torch.Tensor.__name__: + obj = [] + start_idx = 0 + for length in len_arr: + item_bytes = serialized_data[start_idx:start_idx + length] + obj.append(item_bytes) + start_idx += length + obj = self.tensor_decoder.decode(obj) + elif type_name == self._mm_kwargs_item_cls.__name__: + obj = [] + start_idx = 0 + for length in len_arr: + item_bytes = serialized_data[start_idx:start_idx + length] + obj.append(item_bytes) + start_idx += length + obj = self.mm_decoder.decode(obj) + elif type_name == bytes.__name__: + obj = pickle.loads(serialized_data) + else: + raise ValueError( + f"Unsupported object type '{type_name}' in metadata") + + return obj + + +@dataclass +class ShmObjectStorageHandle: + max_object_size: int + n_readers: int + ring_buffer_handle: tuple[int, str] + serde_class: type[ObjectSerde] + reader_lock: Optional[LockType] + + +class SingleWriterShmObjectStorage: + """ + A single-writer, multiple-reader object storage system built on top of a + shared memory ring buffer. Provides key-value storage with automatic memory + management and cross-process serialization support. + + This storage system follows a FIFO (First-In-First-Out) eviction policy + where the oldest objects are automatically freed when memory runs low. + Memory is reclaimed based on reader reference counting - objects are only + freed when all readers have finished accessing them. + + Architecture: + - Single writer process can put(key, value) objects + - Multiple reader processes can get(address, monotonic_id) objects + - Built on SingleWriterShmRingBuffer for efficient shared memory management + - Thread-safe operations with reader synchronization via locks + + Key Features: + - FIFO Eviction: Oldest objects are evicted first when memory is full + - Reference Counting: Objects are only freed when no readers are + accessing them + - Duplicate Key Handling: Existing keys are not overwritten, just + re-referenced + - Customized Serialization: By default uses Msgpack for efficient + serialization of Python objects, but can be extended for custom types + - Cross-Process Safety: Uses shared memory with proper synchronization + - Automatic Cleanup: Garbage collection happens transparently during + allocation + + Memory Layout per Object: + [4-byte reference_count][metadata_size][serialized_object_data] + + Thread Safety: + - Writer operations (put, clear) are single-threaded by design + - Reader operations (get) are thread-safe with lock-based reference + counting + - Memory reclamation is handled exclusively by the writer process + """ + + def __init__( + self, + max_object_size: int, + n_readers: int, + ring_buffer: SingleWriterShmRingBuffer, + serde_class: type[ObjectSerde] = MsgpackSerde, + reader_lock: Optional[LockType] = None, + ): + """ + Initialize the object storage. + + Args: + max_object_size: Maximum size for a single object in bytes. + n_readers: Number of reader processes that can access the storage. + ring_buffer: The shared memory ring buffer for storing objects. + serde_class: Serializer/deserializer for objects. + reader_lock: Optional lock for synchronizing reader access. + Raises: + ValueError: If reader_lock is None for readers. 
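+
+        Example (illustrative sketch; assumes `storage` was created by the
+        writer process, as in the module tests):
+
+            address, monotonic_id = storage.put("key", obj)
+            reader = SingleWriterShmObjectStorage.create_from_handle(
+                storage.handle())
+            assert reader.get(address, monotonic_id) == obj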
+ """ + + self.max_object_size = max_object_size + self.n_readers = n_readers + self.serde_class = serde_class + self.ser_de = serde_class() + self.ring_buffer = ring_buffer + self.is_writer = self.ring_buffer.is_writer + + self.flag_bytes = 4 # for in-use flag + + if self.is_writer: + # Key-value mapping: key -> (address, monotonic_id) + self.key_index: dict[str, tuple[int, int]] = {} + # Reverse mapping: monotonic_id -> key + self.id_index: dict[int, str] = {} + # Writer flag to track in-use status: monotonic_id -> count + self.writer_flag: dict[int, int] = {} + else: + if reader_lock is None: + raise ValueError("Lock must be provided for readers.") + + self._reader_lock = reader_lock + + def clear(self) -> None: + """Clear the object storage.""" + if self.is_writer: + self.ring_buffer.clear() + self.key_index.clear() + self.id_index.clear() + self.writer_flag.clear() + logger.debug("Object storage cleared and reinitialized.") + + def copy_to_buffer( + self, + data: Union[bytes, list[bytes]], + data_bytes: int, + metadata: bytes, + md_bytes: int, + data_view: memoryview, + ) -> None: + data_view[self.flag_bytes:self.flag_bytes + md_bytes] = metadata + if isinstance(data, bytes): + data_view[-data_bytes:] = data + elif isinstance(data, list): + start_idx = self.flag_bytes + md_bytes + for item_bytes in data: + item_size = len(item_bytes) + data_view[start_idx:start_idx + item_size] = item_bytes + start_idx += item_size + else: + raise ValueError( + f"Unsupported data type for serialization: {type(data)}") + + def increment_writer_flag(self, id: int) -> None: + """Set the in-use flag for the writer.""" + self.writer_flag[id] = self.writer_flag.get(id, 0) + 1 + + def increment_reader_flag(self, data_view: memoryview) -> None: + """Set the in-use flag for the reader.""" + # >0 for in-use flag + reader_count = self.ring_buffer.byte2int(data_view) + data_view[:] = self.ring_buffer.int2byte(reader_count + 1) + + def free_unused(self) -> None: + """Free unused buffers in the ring buffer.""" + # try to free up 2*max_object_size bytes of space in the ring buffer, + # since the buffer might be fragmented + freed_ids = self.ring_buffer.free_buf(self.default_is_free_check, + 2 * self.max_object_size) + # update the metadata after freeing up space + for freed_id in freed_ids: + key_to_free = self.id_index[freed_id] + del self.key_index[key_to_free] + del self.id_index[freed_id] + del self.writer_flag[freed_id] + + def is_cached(self, key: str) -> bool: + """ + Check if the object with the given key is cached. + """ + return key in self.key_index + + def get_cached(self, key: str) -> tuple[int, int]: + """ + Get the cached object by key if it exists. + """ + address, monotonic_id = self.key_index[key] + self.increment_writer_flag(monotonic_id) + return address, monotonic_id + + def put(self, key: str, value: Any) -> tuple[int, int]: + """ + Store a key-value pair in the object storage. + Attempts to free max_object_size bytes using FIFO order + when the ring buffer runs out of space during a put() operation. 
+
+        Args:
+            key: String key to identify the object
+            value: Any serializable Python object
+
+        Raises:
+            MemoryError: If there's not enough space in the buffer
+            ValueError: If the serialized object is too large
+            ValueError: If the key already exists in the storage
+        """
+        if key in self.key_index:
+            raise ValueError(f"Key '{key}' already exists in the storage.")
+
+        object_data, data_bytes, object_metadata, md_bytes = \
+            self.ser_de.serialize(value)
+        buffer_size = self.flag_bytes + data_bytes + md_bytes
+
+        # Sanity checks
+        if buffer_size > self.max_object_size:
+            raise ValueError(
+                f"Serialized object size ({buffer_size} bytes) exceeds "
+                f"max object size ({self.max_object_size} bytes)")
+
+        # Allocate new buffer
+        try:
+            address, monotonic_id = self.ring_buffer.allocate_buf(buffer_size)
+        except MemoryError:
+            self.free_unused()
+            # try again after freeing up space
+            address, monotonic_id = self.ring_buffer.allocate_buf(buffer_size)
+
+        # Write data to buffer
+        with self.ring_buffer.access_buf(address) as (data_view, metadata):
+            data_view[:self.flag_bytes] = self.ring_buffer.int2byte(0)
+            self.copy_to_buffer(object_data, data_bytes, object_metadata,
+                                md_bytes, data_view)
+        self.increment_writer_flag(monotonic_id)
+
+        # Update key index
+        self.key_index[key] = (address, monotonic_id)
+        self.id_index[monotonic_id] = key
+        return address, monotonic_id
+
+    def get(self, address: int, monotonic_id: int) -> Any:
+        # Read data from buffer
+        with self.ring_buffer.access_buf(address) as (data_view, buf_metadata):
+            # check id from metadata
+            if buf_metadata[0] != monotonic_id:
+                raise ValueError(
+                    f"Data for address:id '{address}:{monotonic_id}'"
+                    " has been modified or is invalid.")
+
+            obj = self.ser_de.deserialize(data_view[self.flag_bytes:])
+
+            # increment the in-use flag to record this reader's read
+            if self._reader_lock is not None:
+                with self._reader_lock:
+                    self.increment_reader_flag(data_view[:self.flag_bytes])
+            else:
+                # if self._reader_lock is None, it means we are the writer;
+                # the writer does not update the reader count
+                assert self.is_writer
+
+        return obj
+
+    def handle(self):
+        """Get handle for sharing across processes."""
+        return ShmObjectStorageHandle(
+            max_object_size=self.max_object_size,
+            n_readers=self.n_readers,
+            ring_buffer_handle=self.ring_buffer.handle(),
+            serde_class=self.serde_class,
+            reader_lock=self._reader_lock,
+        )
+
+    @staticmethod
+    def create_from_handle(
+            handle: ShmObjectStorageHandle) -> "SingleWriterShmObjectStorage":
+        logger.debug("Creating storage from handle: %s", handle)
+        ring_buffer = SingleWriterShmRingBuffer(*handle.ring_buffer_handle)
+        return SingleWriterShmObjectStorage(
+            max_object_size=handle.max_object_size,
+            n_readers=handle.n_readers,
+            ring_buffer=ring_buffer,
+            serde_class=handle.serde_class,
+            reader_lock=handle.reader_lock,
+        )
+
+    def default_is_free_check(self, id: int, buf: memoryview) -> bool:
+        """
+        Default is_free function: reads the 4-byte reader count at the start
+        of the chunk and reports the chunk as free once every reader has
+        consumed every recorded write of this entry
+        (reader_count >= writer_count * n_readers).
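+
+        Illustrative example (values assumed): with n_readers=2 and a single
+        put() of the entry (writer_count == 1), the chunk becomes reclaimable
+        once two reads have bumped the reader count to 2.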
+ """ + reader_count = int.from_bytes(buf[0:4], "little", signed=True) + writer_count = self.writer_flag[id] + return reader_count >= writer_count * self.n_readers diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9841876bc8e8c..bc7ec0f065db2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,8 +27,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, - ModelDType, ModelImpl, MultiModalConfig, + LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode, + ModelConfig, ModelDType, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, SchedulerPolicy, SpeculativeConfig, TaskOption, @@ -373,6 +373,10 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb + mm_processor_cache_type: Optional[MMCacheType] = \ + MultiModalConfig.mm_processor_cache_type + mm_shm_cache_max_object_size_mb: int = \ + MultiModalConfig.mm_shm_cache_max_object_size_mb mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode io_processor_plugin: Optional[str] = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling @@ -782,6 +786,12 @@ class EngineArgs: multimodal_group.add_argument("--disable-mm-preprocessor-cache", action="store_true", deprecated=True) + multimodal_group.add_argument( + "--mm-processor-cache-type", + **multimodal_kwargs["mm_processor_cache_type"]) + multimodal_group.add_argument( + "--mm-shm-cache-max-object-size-mb", + **multimodal_kwargs["mm_shm_cache_max_object_size_mb"]) multimodal_group.add_argument( "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]) multimodal_group.add_argument( @@ -998,6 +1008,9 @@ class EngineArgs: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_processor_cache_type=self.mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=self. + mm_shm_cache_max_object_size_mb, mm_encoder_tp_mode=self.mm_encoder_tp_mode, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, diff --git a/vllm/envs.py b/vllm/envs.py index 8ca7e0f19428d..f1f404594baa5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -175,6 +175,7 @@ if TYPE_CHECKING: VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True + VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" def get_default_cache_root(): @@ -1241,6 +1242,12 @@ environment_variables: dict[str, Callable[[], Any]] = { # raw bytes. Defaults to True for backward compatibility. "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))), + + # Name of the shared memory buffer used for object storage. + # Only effective when mm_config.mm_processor_cache_type == "shm". 
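+    # The P0 sender creates the buffer under this name (create=True) and the
+    # worker readers attach to the same name (create=False), so the value
+    # must be identical across all processes of an instance.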
+ "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": + lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", + "VLLM_OBJECT_STORAGE_SHM_BUFFER"), } # --8<-- [end:env-vars-definition] diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index f45a94f3151b6..2e456ecd6de08 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from multiprocessing import Lock from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -10,9 +11,12 @@ import torch.distributed as dist import vllm.envs as envs from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import worker_receiver_cache_from_config from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, run_method) from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType +from vllm.v1.executor.utils import get_and_update_mm_cache from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -44,6 +48,8 @@ class UniProcExecutor(ExecutorBase): distributed_init_method=distributed_init_method, is_driver_worker=is_driver_worker, ) + self.mm_receiver_cache = worker_receiver_cache_from_config( + self.vllm_config, MULTIMODAL_REGISTRY, Lock()) self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") self.collective_rpc("load_model") @@ -55,6 +61,8 @@ class UniProcExecutor(ExecutorBase): kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} + if self.mm_receiver_cache is not None and method == "execute_model": + get_and_update_mm_cache(self.mm_receiver_cache, args) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] @@ -128,6 +136,8 @@ class ExecutorWithExternalLauncher(UniProcExecutor): distributed_init_method=distributed_init_method, is_driver_worker=is_driver_worker, ) + self.mm_receiver_cache = worker_receiver_cache_from_config( + self.vllm_config, MULTIMODAL_REGISTRY, Lock()) self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") self.collective_rpc("load_model") diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 35b743ed21d92..31ae450f4c2ff 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -3,19 +3,24 @@ import sys from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union +from multiprocessing.synchronize import Lock as LockType +from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union, cast import torch from typing_extensions import TypeAlias, override +from vllm.distributed.device_communicators.shm_object_storage import ( + MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer) +from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME from vllm.logger import init_logger -from vllm.utils import GiB_bytes, LRUCache +from vllm.utils import GiB_bytes, LRUCache, MiB_bytes from vllm.utils.jsontree import (json_count_leaves, json_map_leaves, json_reduce_leaves) -from .inputs import (MultiModalFeatureSpec, MultiModalFieldElem, - MultiModalKwargs, MultiModalKwargsItem, - MultiModalKwargsItems, NestedTensors) +from .inputs import (MultiModalBatchedField, MultiModalFeatureSpec, + MultiModalFieldElem, MultiModalKwargs, + MultiModalKwargsItem, 
MultiModalKwargsItems, + NestedTensors) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -389,6 +394,106 @@ class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache): self._cache.clear() +class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is enabled. + + How to update each item: + + - If the item is already in the cache, clear the input to avoid + unnecessary IPC. + + - If the item is not in the cache, store the data in shared memory. + """ + + def __init__(self, vllm_config: "VllmConfig") -> None: + super().__init__() + + self.world_size = vllm_config.parallel_config.world_size + mm_config = vllm_config.model_config.get_multimodal_config() + + ring_buffer = SingleWriterShmRingBuffer( + data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), + name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + create=True, # sender is the writer + ) + self._shm_cache = SingleWriterShmObjectStorage( + max_object_size=mm_config.mm_shm_cache_max_object_size_mb * + MiB_bytes, + n_readers=self.world_size, + ring_buffer=ring_buffer, + serde_class=MsgpackSerde, + ) + # cache (prompt_updates, modality) for P0 only + self._p0_cache: dict[str, tuple[Sequence[ResolvedPromptUpdate], + str]] = {} + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return self._shm_cache.is_cached(mm_hash) + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + + if self._shm_cache.is_cached(mm_hash): + address, monotonic_id = self._shm_cache.get_cached(mm_hash) + prompt_updates, modality = self._p0_cache[mm_hash] + return self.address_as_item(address, monotonic_id, + modality), prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + try: + address, monotonic_id = self._shm_cache.put(mm_hash, mm_item[0]) + # Try to remove dangling items if p0 cache is too large. + if len(self._p0_cache) >= 2 * len(self._shm_cache.key_index): + self.remove_dangling_items() + self._p0_cache[mm_hash] = mm_item[1], mm_item[0].modality + address_item = self.address_as_item(address, monotonic_id, + mm_item[0].modality) + return address_item, mm_item[1] + except (ValueError, MemoryError) as e: + # put may fail if the object is too large or + # the cache is full. + # In this case we log the error and keep the original mm_input. 
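+            # Keeping the original item means it is serialized and sent
+            # over the regular IPC path instead of through shared memory.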
+ logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash, + e) + return mm_item + + @override + def clear_cache(self) -> None: + self._shm_cache.clear() + self._p0_cache.clear() + + def remove_dangling_items(self) -> None: + """Remove items that are no longer in the shared memory cache.""" + cached_hashes = self._shm_cache.key_index.keys() + dangling_hashes = set(self._p0_cache.keys()) - cached_hashes + for mm_hash in dangling_hashes: + del self._p0_cache[mm_hash] + + def address_as_item(self, address: int, monotonic_id: int, + modality: str) -> MultiModalKwargsItem: + addr_elem = MultiModalFieldElem( + modality=modality, + key="address", + data=address, + field=MultiModalBatchedField(), + ) + id_elem = MultiModalFieldElem( + modality=modality, + key="monotonic_id", + data=monotonic_id, + field=MultiModalBatchedField(), + ) + mm_item = MultiModalKwargsItem.from_elems([addr_elem, id_elem]) + return mm_item + + def _enable_processor_cache( model_config: "ModelConfig", mm_registry: "MultiModalRegistry", @@ -408,6 +513,17 @@ def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: return supports_ipc_cache +def _enable_mm_input_shm_cache(vllm_config: "VllmConfig") -> bool: + """Whether the shared memory based cache should be enabled.""" + + if not _enable_ipc_cache(vllm_config): + return False + + mm_config = vllm_config.model_config.get_multimodal_config() + + return mm_config.mm_processor_cache_type == "shm" + + def processor_cache_from_config( vllm_config: "VllmConfig", mm_registry: "MultiModalRegistry", @@ -421,7 +537,9 @@ def processor_cache_from_config( if not _enable_ipc_cache(vllm_config): return MultiModalProcessorOnlyCache(model_config) - return MultiModalProcessorSenderCache(model_config) + if not _enable_mm_input_shm_cache(vllm_config): + return MultiModalProcessorSenderCache(model_config) + return ShmObjectStoreSenderCache(vllm_config) def processor_only_cache_from_config( @@ -491,11 +609,68 @@ class MultiModalReceiverCache(BaseMultiModalReceiverCache): self._cache.clear() -def receiver_cache_from_config( +class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache): + """ + The cache which is used on P1 Worker Process when IPC caching is enabled. + + How to update each item: + + - If the item has an address, replace the input with the cached item. + - If not, return the input. 
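+
+    Each successful fetch increments the buffer's reader count, which is what
+    lets the writer's free_unused() reclaim a slot once all readers have
+    consumed it.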
+ """ + + def __init__( + self, + vllm_config: "VllmConfig", + shared_worker_lock: LockType, + ) -> None: + super().__init__() + + self.world_size = vllm_config.parallel_config.world_size + mm_config = vllm_config.model_config.get_multimodal_config() + + ring_buffer = SingleWriterShmRingBuffer( + data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), + name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + create=False, # Server is a reader + ) + self._shm_cache = SingleWriterShmObjectStorage( + max_object_size=mm_config.mm_shm_cache_max_object_size_mb * + MiB_bytes, + n_readers=self.world_size, + ring_buffer=ring_buffer, + serde_class=MsgpackSerde, + reader_lock=shared_worker_lock, + ) + + @override + def get_and_update_item( + self, + mm_item: Optional[MultiModalKwargsItem], + mm_hash: str, + ) -> MultiModalKwargsItem: + assert mm_item is not None, f"Expected an address item for {mm_hash=}" + if "address" in mm_item: + address = cast(int, mm_item["address"].data) + monotonic_id = cast(int, mm_item["monotonic_id"].data) + return self._shm_cache.get(address, monotonic_id) + + return mm_item + + @override + def clear_cache(self) -> None: + self._shm_cache.clear() + + +def engine_receiver_cache_from_config( vllm_config: "VllmConfig", mm_registry: "MultiModalRegistry", ) -> Optional[BaseMultiModalReceiverCache]: - """Return a `BaseMultiModalReceiverCache`, if enabled.""" + """ + This is used in the engine process. + Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and + mm_processor_cache_type=="lru". + """ model_config = vllm_config.model_config if not _enable_processor_cache(model_config, mm_registry): @@ -504,4 +679,31 @@ def receiver_cache_from_config( if not _enable_ipc_cache(vllm_config): return None - return MultiModalReceiverCache(model_config) + if not _enable_mm_input_shm_cache(vllm_config): + return MultiModalReceiverCache(model_config) + + return None + + +def worker_receiver_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", + shared_worker_lock: LockType, +) -> Optional[BaseMultiModalReceiverCache]: + """ + This is used in the worker process. + Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and + mm_processor_cache_type=="shm". 
+ """ + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return None + + if not _enable_mm_input_shm_cache(vllm_config): + return None + + return ShmObjectStoreReceiverCache(vllm_config, shared_worker_lock) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index ea860ca10b3a8..7984f1958ece2 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -163,6 +163,12 @@ STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" STR_DUAL_CHUNK_FLASH_ATTN_VAL: str = "DUAL_CHUNK_FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" +MB_bytes = 1_000_000 +"""The number of bytes in one megabyte (MB).""" + +MiB_bytes = 1 << 20 +"""The number of bytes in one mebibyte (MiB).""" + GB_bytes = 1_000_000_000 """The number of bytes in one gigabyte (GB).""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b46ae72ccdf1b..27ee146c4f6ff 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,7 @@ from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.cache import receiver_cache_from_config +from vllm.multimodal.cache import engine_receiver_cache_from_config from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -131,7 +131,7 @@ class EngineCore: self.use_spec_decode = vllm_config.speculative_config is not None self.mm_registry = mm_registry = MULTIMODAL_REGISTRY - self.mm_receiver_cache = receiver_cache_from_config( + self.mm_receiver_cache = engine_receiver_cache_from_config( vllm_config, mm_registry) # Setup batch queue for pipeline parallelism. 
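Taken together, the changes above form a single-writer/multi-reader object store: the P0 sender serializes each multimodal input once into a named shared-memory ring buffer, and the workers fetch it by its (address, monotonic_id) pair, so only that small pair travels over the regular IPC channel. The sketch below shows the round trip in isolation, roughly what ShmObjectStoreSenderCache and ShmObjectStoreReceiverCache do internally. It is a minimal sketch that assumes the constructor signatures shown in the diffs above; the buffer name, sizes, and payload are illustrative placeholders, not values vLLM uses.

    from multiprocessing import Lock

    from vllm.distributed.device_communicators.shm_object_storage import (
        MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)

    # Writer (P0 sender) side: create the named buffer and store one object.
    writer = SingleWriterShmObjectStorage(
        max_object_size=128 << 20,  # cf. mm_shm_cache_max_object_size_mb
        n_readers=1,                # vLLM passes parallel_config.world_size
        ring_buffer=SingleWriterShmRingBuffer(
            data_buffer_size=1 << 30,  # cf. mm_processor_cache_gb
            name="DEMO_SHM_BUFFER",    # vLLM uses the name from the
                                       # VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME env
            create=True,
        ),
        serde_class=MsgpackSerde,
    )
    address, monotonic_id = writer.put("mm_hash_0", {"pixel_values": [1, 2, 3]})

    # Reader (worker) side: attach to the same buffer by name and fetch the
    # object using the pair shipped over the lightweight IPC path.
    reader = SingleWriterShmObjectStorage(
        max_object_size=128 << 20,
        n_readers=1,
        ring_buffer=SingleWriterShmRingBuffer(
            data_buffer_size=1 << 30,
            name="DEMO_SHM_BUFFER",
            create=False,  # attach instead of create
        ),
        serde_class=MsgpackSerde,
        reader_lock=Lock(),  # vLLM shares one lock across all workers
    )
    assert reader.get(address, monotonic_id) == {"pixel_values": [1, 2, 3]}

If the retried allocate_buf inside put() still raises after free_unused(), the sender cache catches the error and falls back to sending the original item over plain IPC, so an oversized or temporarily unreclaimable object degrades gracefully instead of failing the request.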
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index bcf6dda9c1e91..2f0e5aa383b13 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -14,6 +14,7 @@ from enum import Enum, auto from functools import partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess +from multiprocessing.synchronize import Lock as LockType from threading import Thread from typing import Any, Callable, Optional, Union, cast @@ -31,10 +32,13 @@ from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, from vllm.executor.multiproc_worker_utils import ( set_multiprocessing_worker_envs) from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import worker_receiver_cache_from_config from vllm.utils import (decorate_logs, get_distributed_init_method, get_loopback_ip, get_mp_context, get_open_port, set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback +from vllm.v1.executor.utils import get_and_update_mm_cache from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput) from vllm.worker.worker_base import WorkerWrapperBase @@ -81,6 +85,8 @@ class MultiprocExecutor(Executor): scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers + context = get_mp_context() + shared_worker_lock = context.Lock() unready_workers: list[UnreadyWorkerProcHandle] = [] success = False try: @@ -92,6 +98,7 @@ class MultiprocExecutor(Executor): rank=rank, distributed_init_method=distributed_init_method, input_shm_handle=scheduler_output_handle, + shared_worker_lock=shared_worker_lock, )) # Workers must be created before wait_for_ready to avoid @@ -380,6 +387,7 @@ class WorkerProc: rank: int, distributed_init_method: str, input_shm_handle: Handle, + shared_worker_lock: LockType, ): self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) @@ -416,6 +424,10 @@ class WorkerProc: name="WorkerAsyncOutputCopy") self.async_output_copy_thread.start() + # Initialize multimodal receiver cache if needed + self.mm_receiver_cache = worker_receiver_cache_from_config( + vllm_config, MULTIMODAL_REGISTRY, shared_worker_lock) + # Initialize device self.worker.init_device() @@ -428,11 +440,12 @@ class WorkerProc: @staticmethod def make_worker_process( - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - input_shm_handle, # Receive SchedulerOutput + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + input_shm_handle, # Receive SchedulerOutput + shared_worker_lock: LockType, ) -> UnreadyWorkerProcHandle: context = get_mp_context() # (reader, writer) @@ -449,6 +462,7 @@ class WorkerProc: "input_shm_handle": input_shm_handle, "ready_pipe": (reader, writer), "death_pipe": death_reader, + "shared_worker_lock": shared_worker_lock, } # Run EngineCore busy loop in background process. 
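+        # NOTE: shared_worker_lock is created once by the parent process in
+        # _init_executor and handed to every WorkerProc, so all workers
+        # serialize their shared-memory cache reads on the same lock.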
proc = context.Process(target=WorkerProc.worker_main, @@ -646,6 +660,10 @@ class WorkerProc: func = getattr(self.worker, method) elif isinstance(method, bytes): func = partial(cloudpickle.loads(method), self.worker) + # retrieve from shm cache if available + if self.mm_receiver_cache is not None \ + and func.__name__ == "execute_model": + get_and_update_mm_cache(self.mm_receiver_cache, args) output = func(*args, **kwargs) except Exception as e: # Notes have been introduced in python 3.11 diff --git a/vllm/v1/executor/utils.py b/vllm/v1/executor/utils.py new file mode 100644 index 0000000000000..e6c107692a5b9 --- /dev/null +++ b/vllm/v1/executor/utils.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.multimodal.cache import ShmObjectStoreReceiverCache +from vllm.v1.core.sched.output import SchedulerOutput + + +def get_and_update_mm_cache( + receiver_cache: ShmObjectStoreReceiverCache, + args: tuple[SchedulerOutput], +) -> None: + """ + For each MultiModalKwargsItem in SchedulerOutput, fetch from shared memory + cache as needed. + + Args: + receiver_cache: The receiver cache to update. + args: According to the collective_rpc call of execute_model method in + executor, args is a tuple of only one SchedulerOutput element. + """ + scheduler_output = args[0] + for request_data in scheduler_output.scheduled_new_reqs: + for i in range(len(request_data.mm_kwargs)): + mm_input = request_data.mm_kwargs[i] + request_data.mm_kwargs[i] = \ + receiver_cache.get_and_update_item(mm_input, None) From 684b6870e1af43257ecac2bd0ed869bae163c89a Mon Sep 17 00:00:00 2001 From: Kebe <mail@kebe7jun.com> Date: Sat, 13 Sep 2025 00:01:24 +0900 Subject: [PATCH 227/584] [Bugfix][Frontend] Fix `--enable-log-outputs` does not match the documentation (#24626) Signed-off-by: Kebe <mail@kebe7jun.com> --- vllm/entrypoints/openai/cli_args.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a6db97e55d704..1c2a6f58197d8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -171,8 +171,8 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" """Enable the /get_tokenizer_info endpoint. May expose chat templates and other tokenizer configuration.""" enable_log_outputs: bool = False - """If set to True, enable logging of model outputs (generations) - in addition to the input logging that is enabled by default.""" + """If True, log model outputs (generations). + Requires --enable-log-requests.""" h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT """Maximum size (bytes) of an incomplete HTTP event (header or body) for h11 parser. Helps mitigate header abuse. 
Default: 4194304 (4 MB).""" @@ -273,6 +273,9 @@ def validate_parsed_serve_args(args: argparse.Namespace): if args.enable_auto_tool_choice and not args.tool_call_parser: raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") + if args.enable_log_outputs and not args.enable_log_requests: + raise TypeError("Error: --enable-log-outputs requires " + "--enable-log-requests") def create_parser_for_docs() -> FlexibleArgumentParser: From 57f94e88ea1ed2e48ea8ea9b01e9591f0d79557b Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Fri, 12 Sep 2025 16:37:37 +0100 Subject: [PATCH 228/584] [Models] Optimise and simplify `_validate_and_reshape_mm_tensor` (#24742) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/model_executor/models/ernie45_vl.py | 2 +- vllm/model_executor/models/glm4_1v.py | 2 +- vllm/model_executor/models/keye.py | 4 ++-- vllm/model_executor/models/keye_vl1_5.py | 4 ++-- vllm/model_executor/models/midashenglm.py | 2 +- vllm/model_executor/models/qwen2_5_omni_thinker.py | 2 ++ vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/qwen2_audio.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 2 +- 9 files changed, 12 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index bcff65a717ab1..f49dd36b0ab94 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1338,7 +1338,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, mm_input.shape[-1]) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 869287fc0268a..4ed07bd060cf9 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1378,7 +1378,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, mm_input.shape[-1]) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 04824db1b6dd9..cb4cd60a89177 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1611,12 +1611,12 @@ class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal, raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, mm_input.shape[-1]) elif is_list_of(mm_input, torch.Tensor): if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2 for p in mm_input): return mm_input - return torch.concat(list(mm_input)) + return torch.concat(mm_input) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeImageInputs]: diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index 605c6d3eaf643..b3c44d132435c 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -491,14 +491,14 @@ class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal, if mm_input.ndim == expected_dim: return mm_input elif mm_input.ndim == expected_dim + 1: - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, *mm_input.shape[2:]) else: raise ValueError( f"{name} should be {expected_dim}D or " f"batched {expected_dim}D tensor." f"Got ndim: {mm_input.ndim} (shape={mm_input.shape})") else: - return torch.concat(list(mm_input)) + return torch.concat(mm_input) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeVL1_5ImageInputs]: diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 858d4e7e34cf1..e314ae357ecd4 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -669,7 +669,7 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError(f"Incorrect type of {name}. " f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, *mm_input.shape[2:]) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index f8a943d4cab33..a7e71309b6074 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -551,6 +551,8 @@ class Qwen2_5OmniConditionalGenerationMixin: raise ValueError(f"Incorrect type of {name}. " f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): + if dim == 0: + return mm_input.reshape(-1, *mm_input.shape[2:]) return torch.concat(list(mm_input), dim=dim) else: return torch.concat(mm_input, dim=dim) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 98f9c0cf4c16b..5d35a70546597 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -986,7 +986,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, mm_input.shape[-1]) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 54ec7b8627488..c797b71b5d2e1 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -342,7 +342,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError(f"Incorrect type of {name}. 
" f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, *mm_input.shape[2:]) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 89af79c3b5fd5..cf15dfa67743f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1167,7 +1167,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError(f"{name} should be 2D or batched 3D tensor. " f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") - return torch.concat(list(mm_input)) + return mm_input.reshape(-1, mm_input.shape[-1]) else: return torch.concat(mm_input) From b0d1213ac3952227834f7d075fa1a2879b468c69 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Fri, 12 Sep 2025 17:03:55 +0100 Subject: [PATCH 229/584] [Models] Prevent CUDA sync in Qwen2.5-VL (#24741) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/model_executor/models/qwen2_5_vl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5d35a70546597..fc028aa2287a2 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -64,6 +64,7 @@ from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.utils import is_pin_memory_available from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -737,7 +738,7 @@ class Qwen2_5_VisionTransformer(nn.Module): @staticmethod def invert_permutation(perm: torch.Tensor) -> torch.Tensor: # building the inverse permutation in O(n) time - inv = torch.empty_like(perm) + inv = torch.empty_like(perm, pin_memory=is_pin_memory_available()) inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype) @@ -808,6 +809,8 @@ class Qwen2_5_VisionTransformer(nn.Module): non_blocking=True) window_index = window_index.to(device=hidden_states.device, non_blocking=True) + reverse_indices = reverse_indices.to(device=hidden_states.device, + non_blocking=True) hidden_states = hidden_states.reshape( seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) From f17c075884009a3bfb7c66cb0897719b8e18f196 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Sat, 13 Sep 2025 00:12:23 +0800 Subject: [PATCH 230/584] [Model] Switch to Fused RMSNorm in GLM-4.1V model (#24733) Signed-off-by: SamitHuang <285365963@qq.com> --- vllm/model_executor/models/glm4_1v.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 4ed07bd060cf9..22386a5e819ab 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -419,15 +419,16 @@ class Glm4vVisionBlock(nn.Module): max_seqlen: Optional[int] = None, # Only used for Flash Attention seqlens: Optional[list[int]] = None, # Only used for xFormers ) -> torch.Tensor: - x = x + self.attn( + x_attn = self.attn( self.norm1(x), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb, max_seqlen=max_seqlen, seqlens=seqlens, ) + x_fused_norm, residual = self.norm2(x, residual=x_attn) + x = residual + self.mlp(x_fused_norm) - x = x + 
self.mlp(self.norm2(x)) return x From 9d2a44606d6b95b4e6aa7b6c75ae3a87f0560ede Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Fri, 12 Sep 2025 13:08:44 -0400 Subject: [PATCH 231/584] [UX] Remove AsyncLLM torch profiler disabled log (#24609) Signed-off-by: mgoin <mgoin64@gmail.com> --- vllm/v1/engine/async_llm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e467f40f6eab1..a9ced402b974f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -177,9 +177,6 @@ class AsyncLLM(EngineClient): worker_name=worker_name, use_gzip=True)) else: - logger.info( - "Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501 - ) self.profiler = None @classmethod From c8c42597abd5c6843fb8b6b02c0f62a5d3f284c8 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:36:50 -0400 Subject: [PATCH 232/584] [CI] Speed up model unit tests in CI (#24253) Signed-off-by: Andrew Feldman <afeldman@redhat.com> --- .buildkite/test-pipeline.yaml | 78 ++++++++++++++++--- pyproject.toml | 1 + .../models/language/generation/test_common.py | 14 +++- .../language/pooling/test_classification.py | 5 +- .../models/language/pooling/test_embedding.py | 7 +- tests/models/test_initialization.py | 39 +++++++++- 6 files changed, 123 insertions(+), 21 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b633625a77e30..adb5c862eecd9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -571,36 +571,85 @@ steps: ##### models test ##### -- label: Basic Models Test # 57min - timeout_in_minutes: 75 +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - - tests/models + - tests/models/test_initialization.py commands: - - pytest -v -s models/test_transformers.py - - pytest -v -s models/test_registry.py - - pytest -v -s models/test_utils.py - - pytest -v -s models/test_vision.py - - pytest -v -s models/test_initialization.py + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset -- label: Language Models Test (Standard) # 35min +- label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) 
Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + - tests/models/test_utils.py + - tests/models/test_vision.py + commands: + - pytest -v -s models/test_transformers.py \ + models/test_registry.py \ + models/test_utils.py \ + models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: - vllm/ - tests/models/language commands: + # Test standard language models, excluding a subset of slow tests - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m core_model + - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Test (Hybrid) # 35 min +- label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: - vllm/ - tests/models/language/generation commands: @@ -608,7 +657,12 @@ steps: # Note: also needed to run plamo2 model in vLLM - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m hybrid_model + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 - label: Language Models Test (Extended Generation) # 80min timeout_in_minutes: 110 diff --git a/pyproject.toml b/pyproject.toml index 416423abcad88..f5a44f56f416e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,6 +145,7 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ + "slow_test", "skip_global_cleanup", "core_model: enable this model test in each PR instead of only nightly", "hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)", diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index a39f24c80f1c5..a5aa1e3f49743 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -38,7 +38,7 @@ AITER_MODEL_LIST = [ [ pytest.param( "bigscience/bloom-560m", # bloom - testing alibi slopes - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, 
pytest.mark.slow_test], ), pytest.param( "openai-community/gpt2", # gpt2 @@ -49,7 +49,10 @@ AITER_MODEL_LIST = [ pytest.param("EleutherAI/pythia-70m"), # gpt_neox pytest.param( "google/gemma-1.1-2b-it", # gemma - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.slow_test + ], ), pytest.param( "zai-org/chatglm3-6b", # chatglm (text-only) @@ -70,14 +73,17 @@ AITER_MODEL_LIST = [ ), pytest.param( "microsoft/phi-2", # phi - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.slow_test], ), pytest.param( "Qwen/Qwen-7B-Chat", # qwen (text-only) ), pytest.param( "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.slow_test + ], ), pytest.param( "Qwen/Qwen3-8B", # qwen (text-only) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index c71fa96275335..8e398830d39df 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -11,7 +11,10 @@ from vllm.platforms import current_platform "model", [ pytest.param("jason9693/Qwen2.5-1.5B-apeach", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.slow_test + ]), ], ) @pytest.mark.parametrize("dtype", diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 41574b844a668..d61ac08475e3c 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -19,7 +19,7 @@ from ...utils import check_embeddings_close # model code with bidirectional attention. # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", - marks=[pytest.mark.core_model]), + marks=[pytest.mark.core_model, pytest.mark.slow_test]), pytest.param( "intfloat/e5-mistral-7b-instruct", # CPU v1 doesn't support sliding window @@ -29,7 +29,10 @@ from ...utils import check_embeddings_close # [Encoder-only] pytest.param( "BAAI/bge-base-en-v1.5", - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.slow_test + ], ), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 3b13a12276f5d..c22d94948d249 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -18,6 +18,26 @@ from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels) from .utils import dummy_hf_overrides +# This minimal list of model architectures is smaller than the total list of +# supported models. The intention is that in the "typical" regression testing +# scenario, we only test initializing these models. 
This subset was chosen +# to include representative examples of model varieties/workloads (conditional +# generation, sequence classification, causal LM, ranking, chat, reward model, +# multimodal, geospatial, voice, embedding, MTP) +MINIMAL_MODEL_ARCH_LIST = [ + "LlavaForConditionalGeneration", "Llama4ForConditionalGeneration", + "BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking", + "InternVLChatModel", "InternLM2ForRewardModel", + "TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel", + "DeepSeekMTPModel", "XLMRobertaModel" +] + +# This list is the complement of the minimal list above. The intention is that +# this list of models is only tested in a "special case" i.e. most PRs should +# not test these models +OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) - + set(MINIMAL_MODEL_ARCH_LIST)) + @create_new_process_for_each_test() def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, @@ -101,8 +121,23 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, max_num_seqs=model_info.max_num_seqs) -@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) -def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST) +def test_can_initialize_small_subset(model_arch: str, + monkeypatch: pytest.MonkeyPatch): + """Test initializing small subset of supported models""" + if model_arch == "Lfm2ForCausalLM": + pytest.skip("Skipping until test supports V1-only models") + can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) + + +@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST) +def test_can_initialize_large_subset(model_arch: str, + monkeypatch: pytest.MonkeyPatch): + """Test initializing large subset of supported models + + This test covers the complement of the tests covered in the "small subset" + test. 
+ """ if model_arch == "Lfm2ForCausalLM": pytest.skip("Skipping until test supports V1-only models") can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) From 010acc6e1ea1d59ef530459e1078e8fde80f2082 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 13 Sep 2025 02:17:29 +0800 Subject: [PATCH 233/584] [Bugfix] Fix incompatibility between #20452 and #24548 (#24754) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/v1/executor/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/executor/utils.py b/vllm/v1/executor/utils.py index e6c107692a5b9..1855bc9963817 100644 --- a/vllm/v1/executor/utils.py +++ b/vllm/v1/executor/utils.py @@ -19,7 +19,5 @@ def get_and_update_mm_cache( """ scheduler_output = args[0] for request_data in scheduler_output.scheduled_new_reqs: - for i in range(len(request_data.mm_kwargs)): - mm_input = request_data.mm_kwargs[i] - request_data.mm_kwargs[i] = \ - receiver_cache.get_and_update_item(mm_input, None) + request_data.mm_features = receiver_cache.get_and_update_features( + request_data.mm_features) From 017354c0ef06c0d3bdec88ebf94e52f1c79edf97 Mon Sep 17 00:00:00 2001 From: Zhewen Li <zhewenli@meta.com> Date: Fri, 12 Sep 2025 11:44:36 -0700 Subject: [PATCH 234/584] [CI] Trigger BC Linter when labels are added/removed (#24767) --- .github/workflows/bc-lint.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml index 3795b046d7808..823695a921321 100644 --- a/.github/workflows/bc-lint.yml +++ b/.github/workflows/bc-lint.yml @@ -6,6 +6,8 @@ on: - opened - synchronize - reopened + - labeled + - unlabeled jobs: bc_lint: From bc636f21a697a8391ee2767fbf09b9981a0f9604 Mon Sep 17 00:00:00 2001 From: Clayton Coleman <smarterclayton@gmail.com> Date: Fri, 12 Sep 2025 16:57:53 -0400 Subject: [PATCH 235/584] [Benchmark] Allow arbitrary headers to be passed to benchmarked endpoints (#23937) Signed-off-by: Clayton Coleman <smarterclayton@gmail.com> --- vllm/benchmarks/lib/endpoint_request_func.py | 7 +++++ vllm/benchmarks/serve.py | 29 +++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 9d67580be26ad..e640630476630 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -68,6 +68,7 @@ class RequestFuncInput: model: str model_name: Optional[str] = None logprobs: Optional[int] = None + extra_headers: Optional[dict] = None extra_body: Optional[dict] = None multi_modal_content: Optional[Union[dict, list[dict]]] = None ignore_eos: bool = False @@ -129,6 +130,8 @@ async def async_request_openai_completions( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id @@ -258,6 +261,8 @@ async def async_request_openai_chat_completions( "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers if request_func_input.request_id: headers["x-request-id"] = request_func_input.request_id @@ -364,6 +369,8 @@ async def async_request_openai_audio( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if 
request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
 
     if request_func_input.request_id:
         headers["x-request-id"] = request_func_input.request_id
 
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index a98eb2a78f103..33e831e54bbc9 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -389,6 +389,7 @@ async def benchmark(
     goodput_config_dict: dict[str, float],
     max_concurrency: Optional[int],
     lora_modules: Optional[Iterable[str]],
+    extra_headers: Optional[dict],
     extra_body: Optional[dict],
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
@@ -452,6 +453,7 @@ async def benchmark(
         logprobs=logprobs,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
+        extra_headers=extra_headers,
         extra_body=extra_body,
     )
@@ -484,6 +486,7 @@ async def benchmark(
             logprobs=logprobs,
             multi_modal_content=test_mm_content,
             ignore_eos=ignore_eos,
+            extra_headers=extra_headers,
             extra_body=extra_body)
         profile_output = await request_func(
             request_func_input=profile_input, session=session)
@@ -568,6 +571,7 @@
                 logprobs=logprobs,
                 multi_modal_content=mm_content,
                 ignore_eos=ignore_eos,
+                extra_headers=extra_headers,
                 extra_body=extra_body,
                 request_id=request_id,)
             tasks.append(
@@ -815,6 +819,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
         default="/v1/completions",
         help="API endpoint.",
     )
+    parser.add_argument(
+        "--header",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g., --header x-additional-info=0.3.3) "
+        "for headers to be passed with each request. These headers override " \
+        "per-backend constants and values set via environment variable, and " \
+        "will be overridden by other arguments (such as request ids)."
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -1104,6 +1117,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         api_url = f"http://{args.host}:{args.port}{args.endpoint}"
     base_url = f"http://{args.host}:{args.port}"
 
+    # Headers
+    headers = None
+    if args.header:
+        headers = {}
+        for item in args.header:
+            if "=" in item:
+                kvstring = item.split("=", 1)
+                headers[kvstring[0].strip()] = kvstring[1].strip()
+            else:
+                raise ValueError(
+                    "Invalid header format. Please use KEY=VALUE format."
+ ) + tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) @@ -1161,6 +1187,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_headers=headers, extra_body=sampling_params, ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, @@ -1184,7 +1211,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: if args.metadata: for item in args.metadata: if "=" in item: - kvstring = item.split("=") + kvstring = item.split("=", 1) result_json[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( From 3beadc2f25d1111db6a5efee52d5cfe2ccf28d45 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 12 Sep 2025 17:23:05 -0400 Subject: [PATCH 236/584] [Compilation Bug] Fix Inductor Graph Output with Shape Issue (#24772) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/model_executor/models/qwen3_moe.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index a7e0a00350e6b..85429b3a01f92 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -170,8 +170,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module): return quant_config def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # NOTE: hidden_states can have either 1D or 2D shape. - orig_shape = hidden_states.shape + assert hidden_states.dim( + ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs" + is_input_1d = hidden_states.dim() == 1 hidden_dim = hidden_states.shape[-1] hidden_states = hidden_states.view(-1, hidden_dim) @@ -180,7 +181,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module): final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - return final_hidden_states.view(orig_shape) + # return to 1d if input is 1d + return final_hidden_states.squeeze(0) if is_input_1d else \ + final_hidden_states class Qwen3MoeAttention(nn.Module): From c89ed8de43e9ce513cdbb683862b07984cf51aae Mon Sep 17 00:00:00 2001 From: Alexandre Marques <almarque@redhat.com> Date: Fri, 12 Sep 2025 17:45:29 -0400 Subject: [PATCH 237/584] Invert pattern order to make sure that out_proj layers are identified (#24781) Signed-off-by: Alexandre Marques <almarque@redhat.com> --- vllm/model_executor/models/voxtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 1ea317c2f95f9..16a97389cd21b 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -585,12 +585,12 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, r"language_model.model.layers.\1.mlp.down_proj"), (r"layers\.(\d+)\.feed_forward\.w3", r"language_model.model.layers.\1.mlp.up_proj"), - (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo", - r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj" - ), (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)", r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj" ), + (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo", + r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj" 
+ ), (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)", r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"), (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0", From 7ba32aa60b7bc1a08e9340d041014cc153f7a441 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni001@gmail.com> Date: Fri, 12 Sep 2025 17:45:53 -0400 Subject: [PATCH 238/584] [Attention][FlashInfer] Enable FP8 FlashInfer (TRTLLM) MLA decode (#24705) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> --- tests/v1/attention/test_attention_backends.py | 1 + tests/v1/tpu/test_pallas.py | 2 ++ vllm/attention/backends/abstract.py | 1 + vllm/envs.py | 2 ++ .../layers/quantization/kv_cache.py | 2 ++ vllm/platforms/cuda.py | 5 ++++- vllm/v1/attention/backends/mla/common.py | 2 -- .../attention/backends/mla/flashinfer_mla.py | 18 +++++++++++------- 8 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 1ae8b91c347a2..0b7e103beca63 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -178,6 +178,7 @@ class MockAttentionLayer: self._k_scale = torch.tensor(1.0, device=device) self._v_scale = torch.tensor(1.0, device=device) # Add float versions for flashinfer + self._q_scale_float = 1.0 self._k_scale_float = 1.0 self._v_scale_float = 1.0 diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py index bfba3af57f715..1bc8dff317a74 100644 --- a/tests/v1/tpu/test_pallas.py +++ b/tests/v1/tpu/test_pallas.py @@ -33,10 +33,12 @@ def test_ragged_paged_attention(): ) class FakeAttentionLayer: + _q_scale_float: float _k_scale_float: float _v_scale_float: float layer = FakeAttentionLayer() + layer._q_scale_float = 1.0 layer._k_scale_float = 1.0 layer._v_scale_float = 1.0 diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 0217bff6adafa..75bcdc4bbcf0d 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -240,6 +240,7 @@ class AttentionLayer(Protocol): _q_scale: torch.Tensor _k_scale: torch.Tensor _v_scale: torch.Tensor + _q_scale_float: float _k_scale_float: float _v_scale_float: float _prob_scale: torch.Tensor diff --git a/vllm/envs.py b/vllm/envs.py index f1f404594baa5..bb10c7cc2ac27 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -476,6 +476,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA # - "FLASH_ATTN_MLA": use FlashAttention for MLA + # - "FLASHINFER_MLA": use FlashInfer for MLA + # - "CUTLASS_MLA": use CUTLASS for MLA "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e5604670fb4c1..4c6fcda893a03 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -88,6 +88,7 @@ class BaseKVCacheMethod(QuantizeMethodBase): "Setting it to k_scale. 
This only matters for " "the flash-attn backend.") layer._q_scale.copy_(k_scale) + layer._q_scale_float = k_scale # These are used in the final Attention.forward() layer._k_scale.copy_(k_scale) @@ -124,6 +125,7 @@ class BaseKVCacheMethod(QuantizeMethodBase): # These are used in the final Attention.forward() layer._q_scale.copy_(q_scale) + layer._q_scale_float = q_scale layer._prob_scale.copy_(prob_scale) if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0): diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 77c9a012b2d3d..42fff2f07a0a4 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -179,6 +179,7 @@ class CudaPlatformBase(Platform): cache_config.block_size = 128 logger.info("Forcing kv cache block size to 128 for " "CUTLASS_MLA backend.") + if use_flashinfer_mla and cache_config.block_size not in [32, 64]: cache_config.block_size = 64 logger.info( @@ -541,7 +542,9 @@ class CudaPlatformBase(Platform): attention_backend = "FLASHMLA" # Only FlashMLA and CUTLASS_MLA support fp8 - if attention_backend in ["FLASHMLA", "CUTLASS_MLA"]: + if attention_backend in [ + "FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA" + ]: supported = True else: supported = (not fp8_attention) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 036a281f1d26e..a990cb2f1a972 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -584,7 +584,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, q_data_type=self.model_config.dtype, - kv_data_type=self.kv_cache_spec.dtype, ) # Prepare context prefills @@ -605,7 +604,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): logits_soft_cap=self._global_hyperparameters. 
logits_soft_cap, q_data_type=self.model_config.dtype, - kv_data_type=self.kv_cache_spec.dtype, ) prefill.prefill_main = self._fi_prefill_main diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 71eb9e0ce70e6..701248670f72e 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -6,8 +6,7 @@ from typing import Optional, Union import torch from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla -from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, - is_quantized_kv_cache) +from vllm.attention.backends.abstract import AttentionLayer, AttentionType from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, @@ -69,11 +68,9 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): "are not implemented for " "FlashInferMLAImpl") - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "FlashInferMLA V1 with FP8 KV cache not yet supported") - self._workspace_buffer = g_fi_workspace + self.bmm1_scale: Optional[float] = None + self.bmm2_scale: Optional[float] = None def _forward_decode( self, @@ -92,6 +89,12 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): # trtllm API requires extra dimension q_len_per_request for MTP q = q.unsqueeze(1) + if self.bmm1_scale is None: + self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float * + self.scale) + if self.bmm2_scale is None: + self.bmm2_scale = layer._v_scale_float + o = trtllm_batch_decode_with_kv_cache_mla( query=q, kv_cache=kv_c_and_k_pe_cache.unsqueeze(1), @@ -102,7 +105,8 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]): block_tables=attn_metadata.decode.block_table, seq_lens=attn_metadata.decode.seq_lens, max_seq_len=attn_metadata.max_seq_len, - bmm1_scale=self.scale, + bmm1_scale=self.bmm1_scale, + bmm2_scale=self.bmm2_scale, ) # TODO: Return LSE pending support from Flashinfer API: From 5fe643fc265633b6f38752321b7883899a6a8903 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni001@gmail.com> Date: Fri, 12 Sep 2025 18:30:07 -0400 Subject: [PATCH 239/584] Add FLASHINFER_MLA to backend selector test (#24753) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> --- .../attention/test_attention_selector.py | 60 +++++++++++++------ tests/v1/attention/utils.py | 2 + 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 3c2aaabacae8c..4d969cf992d23 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -22,7 +22,10 @@ def clear_cache(): # Define MLA and non-MLA backends separately DEVICE_MLA_BACKENDS = { - "cuda": ["TRITON_MLA", "FLASHMLA", "FLASH_ATTN_MLA", "CUTLASS_MLA"], + "cuda": [ + "TRITON_MLA", "FLASHMLA", "FLASHINFER_MLA", "FLASH_ATTN_MLA", + "CUTLASS_MLA" + ], "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], "cpu": [], } @@ -90,8 +93,8 @@ def test_env( with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - block_size, False) + backend = get_attn_backend(16, torch.float16, None, block_size, + False) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "hip": @@ -109,7 +112,7 @@ def test_env( with pytest.raises(ValueError) as exc_info: get_attn_backend(16, torch.float16, - torch.float16, + 
None, block_size, False, use_mla=use_mla) @@ -120,7 +123,7 @@ def test_env( with pytest.raises(ValueError) as exc_info: get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -130,7 +133,7 @@ def test_env( # Valid backend-block_size combination backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -139,7 +142,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -153,6 +156,8 @@ def test_env( # CUDA MLA backend logic: # - CUTLASS_MLA: only supported with block_size == 128 # and Blackwell GPUs (SM 10.0), V1 only + # - FLASHINFER_MLA: only supported on Blackwell GPUs + # (SM 10.0+), V1 only # - FLASHMLA: only supported with block_size == 64 # - FLASH_ATTN_MLA: V1 only # - TRITON_MLA: fallback for other cases @@ -169,12 +174,31 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) expected = "CUTLASS_MLA_VLLM_V1" assert backend.get_name() == expected + elif name == "FLASHINFER_MLA": + if not use_v1: + # FlashInfer MLA only supported on V1 engine + pytest.skip( + "FlashInfer MLA only supported on V1 engine") + elif block_size not in [32, 64]: + # FlashInfer MLA only supports block_size 32 or 64 + pytest.skip( + "FlashInfer MLA only supports block_size 32 " + "or 64") + else: + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_MLA" + assert backend.get_name() == expected elif name == "FLASHMLA": if block_size != 64: # FlashMLA only supports block_size == 64 @@ -189,7 +213,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -204,7 +228,7 @@ def test_env( else: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -214,7 +238,7 @@ def test_env( # TRITON_MLA or other fallback backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -224,7 +248,7 @@ def test_env( elif name == "FLASHINFER": backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -233,7 +257,7 @@ def test_env( else: backend = get_attn_backend(32, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -243,7 +267,7 @@ def test_env( if use_v1: backend = get_attn_backend(16, torch.float16, - torch.float16, + None, block_size, False, use_mla=use_mla) @@ -269,15 +293,13 @@ def test_fp32_fallback( with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) + backend = get_attn_backend(16, torch.float32, None, 16, False) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "cuda": with patch("vllm.attention.selector.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) + backend = get_attn_backend(16, torch.float32, None, 16, False) assert (backend.get_name() == "FLEX_ATTENTION" if use_v1 else "XFORMERS") @@ -331,7 +353,7 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + backend = 
get_attn_backend(16, torch.float16, None, 16, True) assert backend.get_name() != STR_FLASH_ATTN_VAL diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 5c49566240df4..f07c6eb0ea4da 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -141,6 +141,8 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", + _Backend.FLASHINFER_MLA: + "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend", _Backend.TRITON_MLA_VLLM_V1: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } From 8226dd56bf64f80e04f2aa90c97401d8783aa2bf Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Sat, 13 Sep 2025 06:31:32 +0800 Subject: [PATCH 240/584] [Qwen3Next] Fixes the cuda graph capture conditions under large batch sizes (#24660) (#24667) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> --- vllm/v1/attention/backends/gdn_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 12233af057b04..74eb9ae9d3254 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder( # prepare tensors for cudagraph if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0 - and num_spec_decodes <= self.decode_cudagraph_max_bs): + and num_spec_decodes <= self.decode_cudagraph_max_bs + and m.num_actual_tokens <= self.decode_cudagraph_max_bs): num_total_tokens = self.vllm_config.pad_for_cudagraph( m.num_actual_tokens) batch_size = num_total_tokens // (self.num_spec + 1) From 4fdd6f5cbf877de7c4de33086fe41bb0ac1d3cf3 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Fri, 12 Sep 2025 16:34:28 -0700 Subject: [PATCH 241/584] [Core] Support async scheduling with uniproc executor (#24219) Signed-off-by: Nick Hill <nhill@redhat.com> Signed-off-by: Ronald1995 <ronaldautomobile@163.com> Co-authored-by: Ronald1995 <ronaldautomobile@163.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- .../test_basic_correctness.py | 24 ++++-- tests/v1/engine/test_engine_core.py | 4 + vllm/engine/arg_utils.py | 7 +- vllm/executor/uniproc_executor.py | 74 ++++++++++++------- vllm/v1/engine/core.py | 9 ++- vllm/v1/engine/core_client.py | 4 +- vllm/v1/executor/abstract.py | 17 ++++- vllm/v1/executor/multiproc_executor.py | 12 +-- vllm/v1/executor/ray_distributed_executor.py | 7 +- 9 files changed, 103 insertions(+), 55 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index a3b09cc817917..fba18f197074b 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -62,6 +62,8 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("async_scheduling", [True, False]) +@pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models( monkeypatch: pytest.MonkeyPatch, @@ -70,6 +72,8 @@ def test_models( backend: str, max_tokens: int, enforce_eager: bool, + async_scheduling: bool, + model_executor: str, 
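    # New parametrization axes (a descriptive note, not part of the patch):
    # "uni" exercises the newly async-capable uniproc executor, "mp" the
    # multiprocess executor; async_scheduling is a v1-only feature.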
enable_prompt_embeds: bool, ) -> None: @@ -77,6 +81,12 @@ def test_models( "VLLM_USE_V1") and envs.VLLM_USE_V1: pytest.skip("enable_prompt_embeds is not supported in v1.") + if not envs.VLLM_USE_V1: + if async_scheduling: + pytest.skip("async_scheduling only supported in v1.") + if model_executor != "uni": + pytest.skip("only test uniproc executor for v0.") + if backend == "XFORMERS" and model == "google/gemma-2-2b-it": pytest.skip( f"{backend} does not support gemma2 with full context length.") @@ -98,11 +108,15 @@ def test_models( prompt_embeds = hf_model.get_prompt_embeddings( example_prompts) - with VllmRunner(model, - max_model_len=8192, - enforce_eager=enforce_eager, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7) as vllm_model: + with VllmRunner( + model, + max_model_len=8192, + enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, + async_scheduling=async_scheduling, + distributed_executor_backend=model_executor, + ) as vllm_model: if enable_prompt_embeds: vllm_outputs = vllm_model.generate_greedy( prompt_embeds, max_tokens) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 98265c6349578..17b136aa42731 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -257,9 +257,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): def execute_model( self, scheduler_output, + non_block=False, ) -> Future[ModelRunnerOutput]: """Make execute_model non-blocking.""" + # DummyExecutor used only for testing async case. + assert non_block + def _execute(): output = self.collective_rpc("execute_model", args=(scheduler_output, )) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bc7ec0f065db2..ab43c0edc98d7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1296,11 +1296,8 @@ class EngineArgs: # Async scheduling does not work with the uniprocess backend. 
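        # (Descriptive note: with this change the uniproc executor supports
        # async scheduling as well, so the explicit rejection of "uni" below
        # is removed; "mp" is now only the default when no distributed
        # executor backend was chosen explicitly.)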
if self.distributed_executor_backend is None: self.distributed_executor_backend = "mp" - logger.info("Using mp-based distributed executor backend " - "for async scheduling.") - if self.distributed_executor_backend == "uni": - raise ValueError("Async scheduling is not supported with " - "uni-process backend.") + logger.info("Defaulting to mp-based distributed executor " + "backend for async scheduling.") if self.pipeline_parallel_size > 1: raise ValueError("Async scheduling is not supported with " "pipeline-parallel-size > 1.") diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 2e456ecd6de08..3b566e88a9ec2 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import os +from concurrent.futures import Future, ThreadPoolExecutor +from functools import cached_property from multiprocessing import Lock from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -17,6 +18,7 @@ from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, run_method) from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.utils import get_and_update_mm_cache +from vllm.v1.outputs import AsyncModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -31,15 +33,7 @@ class UniProcExecutor(ExecutorBase): """ self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0) - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - local_rank = 0 - # set local rank as the device index if specified - device_info = self.vllm_config.device_config.device.__str__().split( - ":") - if len(device_info) > 1: - local_rank = int(device_info[1]) - rank = 0 + distributed_init_method, rank, local_rank = self._distributed_args() is_driver_worker = True kwargs = dict( vllm_config=self.vllm_config, @@ -50,21 +44,56 @@ class UniProcExecutor(ExecutorBase): ) self.mm_receiver_cache = worker_receiver_cache_from_config( self.vllm_config, MULTIMODAL_REGISTRY, Lock()) + + self.async_output_thread: Optional[ThreadPoolExecutor] = None + if self.max_concurrent_batches > 1: + self.async_output_thread = ThreadPoolExecutor( + max_workers=1, thread_name_prefix="WorkerAsyncOutput") + self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") self.collective_rpc("load_model") + def _distributed_args(self) -> tuple[str, int, int]: + """Return (distributed_init_method, rank, local_rank).""" + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + # set local rank as the device index if specified + device_info = self.vllm_config.device_config.device.__str__().split( + ":") + local_rank = int(device_info[1]) if len(device_info) > 1 else 0 + return distributed_init_method, 0, local_rank + + @cached_property + def max_concurrent_batches(self) -> int: + return 2 if self.scheduler_config.async_scheduling else 1 + def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict] = None, + non_block: bool = False) -> List[Any]: if kwargs is None: kwargs = {} if self.mm_receiver_cache is not None and method == "execute_model": get_and_update_mm_cache(self.mm_receiver_cache, args) - answer = run_method(self.driver_worker, method, args, 
kwargs) - return [answer] + + if not non_block: + return [run_method(self.driver_worker, method, args, kwargs)] + + try: + result = run_method(self.driver_worker, method, args, kwargs) + if isinstance(result, AsyncModelRunnerOutput): + if (async_thread := self.async_output_thread) is not None: + return [async_thread.submit(result.get_output)] + result = result.get_output() + future = Future[Any]() + future.set_result(result) + except Exception as e: + future = Future[Any]() + future.set_exception(e) + return [future] def check_health(self) -> None: # UniProcExecutor will always be healthy as long as @@ -116,8 +145,9 @@ class ExecutorWithExternalLauncher(UniProcExecutor): assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \ ("To get deterministic execution in V1, " "please set VLLM_ENABLE_V1_MULTIPROCESSING=0") - self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, - rpc_rank=0) + super()._init_executor() + + def _distributed_args(self) -> tuple[str, int, int]: # engines are launched in torchrun-compatible launchers # so we can use the env:// method. # required env vars: @@ -128,19 +158,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor): distributed_init_method = "env://" rank = int(os.environ["RANK"]) local_rank = int(os.environ["LOCAL_RANK"]) - is_driver_worker = True - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - ) - self.mm_receiver_cache = worker_receiver_cache_from_config( - self.vllm_config, MULTIMODAL_REGISTRY, Lock()) - self.collective_rpc("init_worker", args=([kwargs], )) - self.collective_rpc("init_device") - self.collective_rpc("load_model") + return distributed_init_method, rank, local_rank def determine_num_available_blocks(self) -> Tuple[int, int]: """ diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 27ee146c4f6ff..995e70385be89 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -159,6 +159,9 @@ class EngineCore: self.request_block_hasher = get_request_block_hasher( block_size, caching_hash_fn) + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) + def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: start = time.time() @@ -331,7 +334,8 @@ class EngineCore: model_executed = False if self.scheduler.has_requests(): scheduler_output = self.scheduler.schedule() - future = self.model_executor.execute_model(scheduler_output) + future = self.model_executor.execute_model(scheduler_output, + non_block=True) batch_queue.appendleft( (future, scheduler_output)) # type: ignore[arg-type] @@ -534,9 +538,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. 
gc.collect() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 605bedaf10e66..bb0f37c6e0264 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -245,8 +245,8 @@ class InprocClient(EngineCoreClient): self.engine_core = EngineCore(*args, **kwargs) def get_output(self) -> EngineCoreOutputs: - outputs, _ = self.engine_core.step() - return outputs.get(0) or EngineCoreOutputs() + outputs, _ = self.engine_core.step_fn() + return outputs and outputs.get(0) or EngineCoreOutputs() def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.engine_core.get_supported_tasks() diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 68408a0b8a3d5..625017d52fff0 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future -from typing import Callable, Optional, Union +from typing import Any, Callable, Optional, Union import torch import torch.distributed as dist @@ -14,6 +14,7 @@ from vllm.executor.uniproc_executor import ( # noqa from vllm.executor.uniproc_executor import ( # noqa UniProcExecutor as UniProcExecutorV0) from vllm.utils import resolve_obj_by_qualname +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput @@ -86,12 +87,22 @@ class Executor(ExecutorBase): def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]: return self.collective_rpc("get_kv_cache_spec") + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None, + non_block: bool = False) -> list[Any]: + raise NotImplementedError + def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: output = self.collective_rpc("execute_model", - args=(scheduler_output, )) + args=(scheduler_output, ), + non_block=non_block) return output[0] def execute_dummy_batch(self) -> None: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 2f0e5aa383b13..f566c9aee0c54 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -11,7 +11,7 @@ import weakref from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass from enum import Enum, auto -from functools import partial +from functools import cached_property, partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from multiprocessing.synchronize import Lock as LockType @@ -37,6 +37,7 @@ from vllm.multimodal.cache import worker_receiver_cache_from_config from vllm.utils import (decorate_logs, get_distributed_init_method, get_loopback_ip, get_mp_context, get_open_port, set_process_title) +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.executor.utils import get_and_update_mm_cache from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds, @@ -174,9 +175,9 @@ class MultiprocExecutor(Executor): def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: - non_block = self.max_concurrent_batches 
> 1 if not self.has_connector: # get output only from a single worker (output_rank) @@ -328,7 +329,7 @@ class MultiprocExecutor(Executor): self.collective_rpc("check_health", timeout=10) return - @property + @cached_property def max_concurrent_batches(self) -> int: if self.scheduler_config.async_scheduling: return 2 @@ -632,7 +633,8 @@ class WorkerProc: result = (WorkerProc.ResponseStatus.FAILURE, str(output)) else: result = (WorkerProc.ResponseStatus.SUCCESS, output) - self.worker_response_mq.enqueue(result) + if (response_mq := self.worker_response_mq) is not None: + response_mq.enqueue(result) def handle_output(self, output: Any): """Handles output from the worker. If async scheduling is enabled, diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 8394ae788ab01..59c9b56625a95 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -66,11 +66,13 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): def execute_model( self, scheduler_output: SchedulerOutput, + non_block: bool = False, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: """Execute the model on the Ray workers. Args: scheduler_output: The scheduler output to execute. + non_block: If True, the method will return a Future. Returns: The model runner output. @@ -84,7 +86,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): if not self.has_connector: # Get output only from a single worker (output_rank) # When PP is not used, we block here until the result is available. - if self.max_concurrent_batches == 1: + if not non_block: return refs[0].get() # When PP is used, we return a FutureWrapper immediately so that @@ -92,7 +94,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): return FutureWrapper(refs) # Get output from all workers when connector is present - if self.max_concurrent_batches == 1: + if not non_block: # Block and get results from all workers outputs = [ref.get() for ref in refs] return self.kv_output_aggregator.aggregate(outputs) @@ -106,4 +108,3 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): if reconfig_request.new_data_parallel_rank == \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK: self.shutdown() - return \ No newline at end of file From 7f2ea7074e4c2ee75278623a02007fc1b6bb9639 Mon Sep 17 00:00:00 2001 From: Chenheli Hua <huachenheli@outlook.com> Date: Fri, 12 Sep 2025 19:16:06 -0700 Subject: [PATCH 242/584] [Frontend][Multimodal] Allow skipping media data when UUIDs are provided. 
(#23950) Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> --- docs/features/multimodal_inputs.md | 59 ++ examples/offline_inference/vision_language.py | 81 ++- tests/entrypoints/openai/test_vision.py | 65 +++ tests/entrypoints/test_chat_utils.py | 548 +++++++++++++++++- vllm/entrypoints/chat_utils.py | 228 +++++--- vllm/entrypoints/llm.py | 40 ++ vllm/multimodal/inputs.py | 5 +- vllm/multimodal/parse.py | 23 +- vllm/multimodal/processing.py | 17 +- 9 files changed, 970 insertions(+), 96 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 77baa27c7a958..7fb0337235005 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t print(o.outputs[0].text) ``` +Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache. + +??? code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:" + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [None, img_b]}, + # Since img_a is expected to be cached, we can skip sending the actual + # image entirely. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + !!! warning If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. @@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ) ``` +For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this: + + ```python + # Image/video/audio URL: + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + + # image_embeds + { + "type": "image_embeds", + "image_embeds": None, + "uuid": image_uuid + }, + + # input_audio: + { + "type": "input_audio", + "input_audio": None, + "uuid": audio_uuid + }, + + # PIL Image: + { + "type": "image_pil", + "image_pil": None + "uuid": image_uuid + } + + ``` + !!! note Only one message can contain `{"type": "image_embeds"}`. If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. 
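The same skip-by-UUID flow works end to end over the OpenAI-compatible API. Below is a minimal sketch; the server URL, model name, image URL, and UUID (`sku-1234-a`) are placeholder assumptions, and the first request must have populated the cache for the second one to succeed:

```python
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# First request: send the real image bytes, tagged with a user-chosen UUID.
image_part = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/a.jpg"},
    "uuid": "sku-1234-a",
}
client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            image_part,
        ],
    }],
)

# Follow-up request: omit the image entirely and identify it by UUID alone.
# If "sku-1234-a" misses the processor cache, the server rejects this
# request with a 400 error.
skipped_part = {"type": "image_url", "image_url": None, "uuid": "sku-1234-a"}
chat = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            skipped_part,
        ],
    }],
)
print(chat.choices[0].message.content)
```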
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index b104113b88213..4b75eb19fcf94 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1764,6 +1764,7 @@ def apply_image_repeat( probs = [1.0 - image_repeat_prob, image_repeat_prob] inputs = [] + inputs_with_empty_media = [] cur_image = data for i in range(num_prompts): if image_repeat_prob is not None: @@ -1774,14 +1775,25 @@ def apply_image_repeat( new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) + uuid = "uuid_{}".format(i) + inputs.append( { "prompt": prompts[i % len(prompts)], "multi_modal_data": {modality: cur_image}, + "multi_modal_uuids": {modality: uuid}, } ) - return inputs + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + return inputs, inputs_with_empty_media @contextmanager @@ -1860,6 +1872,13 @@ def parse_args(): help="If True, then use different prompt (with the same multi-modal " "data) for each request.", ) + + parser.add_argument( + "--verify-mm-cache-hit-with-uuids", + action="store_true", + help="If True, will send all requests in a second batch with empty mm " + "data to verify cache hits with UUIDs.", + ) return parser.parse_args() @@ -1903,26 +1922,48 @@ def main(args): assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference + uuid = "uuid_0" inputs = { "prompt": prompts[0], "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + inputs_with_empty_media = { + "prompt": prompts[0], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat( - args.image_repeat_prob, args.num_prompts, data, prompts, modality + inputs, inputs_with_empty_media = apply_image_repeat( + args.image_repeat_prob, + args.num_prompts, + data, + prompts, + modality, ) else: # Use the same image for all prompts - inputs = [ - { - "prompt": prompts[i % len(prompts)], - "multi_modal_data": {modality: data}, - } - for i in range(args.num_prompts) - ] + inputs = [] + inputs_with_empty_media = [] + for i in range(args.num_prompts): + uuid = "uuid_{}".format(i) + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + ) + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) # Add LoRA request if applicable lora_request = ( @@ -1942,6 +1983,26 @@ def main(args): print(generated_text) print("-" * 50) + if args.verify_mm_cache_hit_with_uuids: + try: + # Verify cache hits with UUIDs + print( + "Sending a second batch of requests with empty media" + " and matching UUIDs." + ) + outputs = llm.generate( + inputs_with_empty_media, + sampling_params=sampling_params, + lora_request=lora_request, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + except Exception as e: + print(f"Failed to verify cache hits with UUIDs. 
Error: {e}") + if __name__ == "__main__": args = parse_args() diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 72819f31de206..a324e86666055 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid( assert isinstance(chat_completion.choices[0].message.content, str) assert len(chat_completion.choices[0].message.content) > 0 + # Second request, with empty image but the same uuid. + chat_completion_with_empty_image = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion_with_empty_image.choices[ + 0].message.content is not None + assert isinstance( + chat_completion_with_empty_image.choices[0].message.content, str) + assert len( + chat_completion_with_empty_image.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_completions_with_empty_image_with_uuid_without_cache_hit( + client: openai.AsyncOpenAI, + model_name: str, +): + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": {}, + "uuid": "uuid_not_previously_seen" + }, + ], + }, + ], + model=model_name, + ) + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5149ca346050e..dd33f5c8c1d8e 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -79,6 +79,28 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="function") +def qwen2_audio_model_config(): + return ModelConfig( + QWEN2AUDIO_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "audio": 1, + }, + ) + + +@pytest.fixture(scope="module") +def qwen2_audio_tokenizer(): + return TokenizerGroup( + tokenizer_id=QWEN2AUDIO_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( @@ -169,6 +191,7 @@ def audio_url(): def _assert_mm_data_is_image_input( mm_data: Optional[MultiModalDataDict], image_count: int, + skipped_image_indices: Optional[list] = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input( assert image_data is not None assert isinstance(image_data, list) and len(image_data) == image_count + if skipped_image_indices is not None: + for i in skipped_image_indices: + assert image_data[i] is None def _assert_mm_uuids( @@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], - data_count: MultiModalDataCounts, + mm_data: Optional[MultiModalDataDict], + data_count: MultiModalDataCounts, + skipped_media_indices: Optional[dict[ + str, list]] = None, # modality -> list[int] ) -> None: assert mm_data is not None assert 
set(data_count.keys()) == (set(mm_data.keys())) @@ -216,6 +244,13 @@ def _assert_mm_data_inputs( assert modality_data is not None assert isinstance(modality_data, list) and len(modality_data) == n + if skipped_media_indices is not None: + skipped_media_indices_for_modality = skipped_media_indices.get( + modality) + assert skipped_media_indices_for_modality is not None + for i in skipped_media_indices_for_modality: + assert modality_data[i] is None + def test_parse_chat_messages_single_image( phi3v_model_config, @@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +def test_parse_chat_messages_single_empty_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, phi3v_tokenizer, @@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +def test_parse_chat_messages_multiple_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +def test_parse_chat_messages_mixed_empty_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, @@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async( _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, + 1, + skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, @@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": None, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": None, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, + 2, + skipped_image_indices=[0, 1]) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, @@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images( _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) +def test_parse_chat_messages_empty_pil_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_pil", + "image_pil": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +def test_parse_chat_messages_empty_image_embeds_with_uuid( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, +): + uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + }] + mm_data = await mm_future + assert mm_data is not None + assert "image" in mm_data + assert mm_data["image"] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, @@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl expected_uuids=["audio_123"]) +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": None, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": None, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": None, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, { + "image": 2, + "video": 1, + "audio": 1 + }, + skipped_media_indices={ + "image": [0, 1], + "video": [0], + "audio": [0] + }) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, @@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk(): r"[INST]Thanks, what is 3+3?[/INST]") assert string_tokens == expected_tokens + + +def test_parse_chat_messages_single_empty_audio_with_uuid( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" + }] + _assert_mm_data_inputs(mm_data, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_empty_audio_with_uuid_async( + qwen2_audio_model_config, + qwen2_audio_tokenizer, +): + audio_uuid = "abcd" + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": {}, + "uuid": audio_uuid, + }, + { + "type": "text", + "text": "What does the audio say?" + }, + ], + }], + qwen2_audio_model_config, + qwen2_audio_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?" + }] + _assert_mm_data_inputs(await mm_future, {"audio": 1}) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=[audio_uuid]) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b53dbfb3a26a2..aa231de93c0c3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. User must guarantee that it is properly - generated and unique for different medias. - """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): - image_embeds: Required[Union[str, dict[str, str]]] + image_embeds: Optional[Union[str, dict[str, str]]] """ The image embeddings. It can be either: - A single base64 string. @@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" - uuid: Optional[str] - """ - User-provided UUID of a media. 
User must guarantee that it is properly - generated and unique for different medias. - """ class PILImage(BaseModel): @@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): } """ - image_pil: Required[PILImage] + image_pil: Optional[PILImage] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): } """ - image_url: Required[str] + image_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): } """ - audio_url: Required[str] + audio_url: Optional[str] class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): @@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): } """ - video_url: Required[str] + video_url: Optional[str] uuid: Optional[str] """ User-provided UUID of a media. User must guarantee that it is properly @@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._model_config = model_config self._tokenizer = tokenizer - self._items_by_modality = defaultdict[str, list[_T]](list) + self._items_by_modality = defaultdict[str, list[Optional[_T]]](list) self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property @@ -624,14 +614,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self.mm_registry.create_processor(self.model_config) def add( - self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + self, + modality: ModalityStr, + item: Optional[_T], + uuid: Optional[str] = None, ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the - media. + media. 
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): if not self._items_by_modality: return None mm_inputs = {} - items_by_modality = { - modality: await asyncio.gather(*items) - for modality, items in self._items_by_modality.items() - } + items_by_modality = {} + for modality, items in self._items_by_modality.items(): + coros = [] + for item in items: + if item is not None: + coros.append(item) + else: + coros.append(asyncio.sleep(0)) + items_by_modality[modality] = await asyncio.gather(*coros) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( @@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC): return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: raise NotImplementedError @@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image = self._connector.fetch_image(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image = self._connector.fetch_image(image_url) if image_url else None placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): @@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) placeholder = self._tracker.add("image_embeds", embedding, uuid) + if image_embeds is None: + placeholder = self._tracker.add("image_embeds", None, uuid) + self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio = 
self._connector.fetch_audio(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio = self._connector.fetch_audio(audio_url) if audio_url else None placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: - image_coro = self._connector.fetch_image_async(image_url) + def parse_image( + self, image_url: Optional[str], uuid: Optional[str] = None + ) -> None: + image_coro = ( + self._connector.fetch_image_async(image_url) if image_url else None + ) placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( self, - image_embeds: Union[str, dict[str, str]], + image_embeds: Union[str, dict[str, str], None], uuid: Optional[str] = None, ) -> None: - future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() + future: asyncio.Future[Union[str, dict[str, str], None]] = ( + asyncio.Future() + ) if isinstance(image_embeds, dict): embeds = { @@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) + if image_embeds is None: + future.set_result(None) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) def parse_image_pil( - self, image_pil: Image.Image, uuid: Optional[str] = None + self, image_pil: Optional[Image.Image], uuid: Optional[str] = None ) -> None: - future: asyncio.Future[Image.Image] = asyncio.Future() - future.set_result(image_pil) + future: asyncio.Future[Optional[Image.Image]] = asyncio.Future() + if image_pil: + future.set_result(image_pil) + else: + future.set_result(None) placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: - audio_coro = self._connector.fetch_audio_async(audio_url) + def parse_audio( + self, audio_url: Optional[str], uuid: Optional[str] = None + ) -> None: + audio_coro = ( + self._connector.fetch_audio_async(audio_url) if audio_url else None + ) placeholder = self._tracker.add("audio", audio_coro, uuid) 
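        # A UUID-only part yields audio_coro=None; all_mm_data() later swaps
        # in asyncio.sleep(0) so that gather() resolves the slot to None.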
self._add_placeholder("audio", placeholder) def parse_input_audio( - self, input_audio: InputAudio, uuid: Optional[str] = None + self, input_audio: Optional[InputAudio], uuid: Optional[str] = None ) -> None: - audio_data = input_audio.get("data", "") - audio_format = input_audio.get("format", "") - audio_url = f"data:audio/{audio_format};base64,{audio_data}" + if input_audio: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + if audio_data: + audio_url = f"data:audio/{audio_format};base64,{audio_data}" + else: + # If a UUID is provided, audio data may be empty. + audio_url = None + else: + audio_url = None return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: - video = self._connector.fetch_video_async(video_url=video_url) + def parse_video( + self, video_url: Optional[str], uuid: Optional[str] = None + ) -> None: + video = ( + self._connector.fetch_video_async(video_url=video_url) + if video_url + else None + ) placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part( part, dict ) # This is needed to avoid mypy errors: part.get() from str part_type = part.get("type", None) + uuid = part.get("uuid", None) - if isinstance(part_type, str) and part_type in MM_PARSER_MAP: + if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501 content = MM_PARSER_MAP[part_type](part) # Special case for 'image_url.detail' @@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part( # Handle missing 'type' but provided direct URL fields. # 'type' is required field by pydantic - if part_type is None: - if part.get("image_url") is not None: + if part_type is None or uuid is not None: + if "image_url" in part: image_params = cast( CustomChatCompletionContentSimpleImageParam, part ) - return "image_url", image_params.get("image_url", "") - if part.get("audio_url") is not None: + image_url = image_params.get("image_url", None) + if isinstance(image_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + image_url = image_url.get("url", None) + return "image_url", image_url + if "image_pil" in part: + # "image_pil" could be None if UUID is provided. + image_params = cast( # type: ignore + CustomChatCompletionContentPILImageParam, part + ) + image_pil = image_params.get("image_pil", None) + return "image_pil", image_pil + if "image_embeds" in part: + # "image_embeds" could be None if UUID is provided. 
+ image_params = cast( # type: ignore + ChatCompletionContentPartImageEmbedsParam, part + ) + image_embeds = image_params.get("image_embeds", None) + return "image_embeds", image_embeds + if "audio_url" in part: audio_params = cast( CustomChatCompletionContentSimpleAudioParam, part ) - return "audio_url", audio_params.get("audio_url", "") + audio_url = audio_params.get("audio_url", None) + if isinstance(audio_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + audio_url = audio_url.get("url", None) + return "audio_url", audio_url if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params - if part.get("video_url") is not None: + if "video_url" in part: video_params = cast( CustomChatCompletionContentSimpleVideoParam, part ) - return "video_url", video_params.get("video_url", "") + video_url = video_params.get("video_url", None) + if isinstance(video_url, dict): + # Can potentially happen if user provides a uuid + # with url as a dict of {"url": url} + video_url = video_url.get("url", None) + return "video_url", video_url # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" -VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( +PART_TYPES_TO_SKIP_NONE_CONTENT = ( "text", "refusal", - "image_url", - "image_embeds", - "image_pil", - "audio_url", - "input_audio", - "video_url", ) @@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part( part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # content is None, log a warning and skip - if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: + if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None: logger.warning( "Skipping multimodal part '%s' (type: '%s') " "with empty / unparsable content.", @@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part( modality = None if part_type == "image_pil": - image_content = cast(Image.Image, content) + if content is not None: + image_content = cast(Image.Image, content) + else: + image_content = None mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): @@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part( mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": - content = cast(Union[str, dict[str, str]], content) + if content is not None: + content = cast(Union[str, dict[str, str]], content) + else: + content = None mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94749eb884512..4b51dbcd8acb9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1491,6 +1491,11 @@ class LLM: for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), + prompt.get("multi_modal_uuids")) + param = params[i] if isinstance(params, Sequence) else params tokenization_kwargs: dict[str, Any] = {} @@ -1507,6 +1512,41 @@ class LLM: priority=priority[i] if priority else 0, ) + def _validate_mm_data_and_uuids( + self, + multi_modal_data: Optional[Any], # MultiModalDataDict + multi_modal_uuids: Optional[Any], # MultiModalUUIDDict + ): + """ 
+ Validate that if any multi-modal data is skipped (i.e. None), + then its corresponding UUID must be set. + """ + if multi_modal_data is None: + return + + for modality, data in multi_modal_data.items(): + if isinstance(data, list): + for i, d in enumerate(data): + if d is None: + if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[ # noqa: E501 + modality] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if len( + multi_modal_uuids[modality] + ) <= i or multi_modal_uuids[modality][i] is None: + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided") + else: + if data is None and (multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[modality] is None): + raise ValueError(f"Multi-modal data for {modality} is None" + f" but UUID is not provided") + def _add_request( self, prompt: PromptType, diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f8ea3835f049d..240e34e139cfe 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -85,9 +85,10 @@ which are treated as audio embeddings; these are directly passed to the model without HF processing. """ -ModalityData: TypeAlias = Union[_T, list[_T]] +ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None] """ -Either a single data item, or a list of data items. +Either a single data item, or a list of data items. Can only be None if UUID +is provided. The number of data items allowed per modality is restricted by `--limit-mm-per-prompt`. diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 88bb99529f200..493dd3560a516 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]): def __init__(self, data: _T, modality: str) -> None: super().__init__() - self.data = data + self.data: _T = data self.modality = modality def __repr__(self) -> str: @@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor], class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): - def __init__(self, data: Sequence[HfAudioItem]) -> None: + def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "audio") def get_audio_length(self, item_idx: int) -> int: @@ -198,7 +200,9 @@ class ImageSize(NamedTuple): class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): - def __init__(self, data: Sequence[HfImageItem]) -> None: + def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None: + if data is None: + data = [None] super().__init__(data, "image") def get_image_size(self, item_idx: int) -> ImageSize: @@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__( self, - data: Sequence[HfVideoItem], + data: Optional[Sequence[HfVideoItem]], metadata: Optional[Union[dict[str, Any], list[Optional[dict[str, Any]]]]] = None, ) -> None: + if data is None: + data = [None] super().__init__(data, "video") self.metadata = metadata @@ -385,6 +391,9 @@ class MultiModalDataParser: self, data: ModalityData[AudioItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return AudioProcessorItems(None) + # also check single audio item with sampling rate if self._is_empty(data) or (isinstance(data, tuple) and self._is_empty(data[0])): @@ -420,6 +429,9 @@ class MultiModalDataParser: self, data: ModalityData[ImageItem], ) -> 
Optional[ModalityDataItems[Any, Any]]: + if data is None: + return ImageProcessorItems(None) + if self._is_empty(data): return None @@ -441,6 +453,9 @@ class MultiModalDataParser: self, data: ModalityData[VideoItem], ) -> Optional[ModalityDataItems[Any, Any]]: + if data is None: + return VideoProcessorItems(None) + if self._is_empty(data): return None diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index e5db356b635f3..7471bfcb4d508 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ mm_items = self.data_parser.parse_mm_data(mm_data) - for modality, items in mm_items.items(): self.validate_num_items(modality, len(items)) @@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ] for modality, items_is_cached in mm_is_cached.items() } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } + mm_missing_data = {} + for modality, idxs in mm_missing_idxs.items(): + missing_modality_data = [] + for idx in idxs: + data = mm_data_items[modality][idx] + if data is None: + raise ValueError( + f"Cache miss for {modality} at index {idx} " + f"but data is not provided.") + else: + missing_modality_data.append(data) + mm_missing_data[modality] = missing_modality_data return self._to_mm_items(mm_missing_data) From 89e08d6d180c76019daa5aa1bbf7759dfaedde2e Mon Sep 17 00:00:00 2001 From: Shane A <shanea@allenai.org> Date: Fri, 12 Sep 2025 20:26:21 -0700 Subject: [PATCH 243/584] [Model] Add Olmo3 model implementation (#24534) Signed-off-by: Shane A <shanea@allenai.org> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 1 + tests/models/registry.py | 1 + vllm/model_executor/models/olmo2.py | 42 +++++++---- vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/olmo3.py | 80 +++++++++++++++++++++ 7 files changed, 114 insertions(+), 14 deletions(-) create mode 100644 vllm/transformers_utils/configs/olmo3.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index db3dd2c252c76..6295a2aa8dc2f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -389,6 +389,7 @@ th { | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. 
| | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 0c77ec5ef13c3..b268bf12a3f30 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -301,6 +301,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"), + "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m", {"1b": "facebook/opt-iml-max-1.3b"}), diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index bccd1b87043a5..3e4c580a11211 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -52,10 +52,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( - AutoWeightsLoader, is_pp_missing_parameter, + AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import Olmo3Config class Olmo2Attention(nn.Module): @@ -68,7 +69,7 @@ class Olmo2Attention(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config - assert isinstance(self.config, Olmo2Config) + assert isinstance(self.config, (Olmo2Config, Olmo3Config)) hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -111,14 +112,14 @@ class Olmo2Attention(nn.Module): self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) - # Rotary embeddings. - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - ) self.scaling = self.head_dim**-0.5 + + layer_idx = extract_layer_index(prefix) + sliding_window = None + if ((layer_types := getattr(self.config, "layer_types", None)) + is not None and layer_types[layer_idx] == "sliding_attention"): + sliding_window = self.config.sliding_window + self.attn = Attention( self.num_heads, self.head_dim, @@ -126,7 +127,20 @@ class Olmo2Attention(nn.Module): num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, - prefix=prefix, + per_layer_sliding_window=sliding_window, + prefix=f"{prefix}.attn", + ) + + # Rotary embeddings. Rope scaling is only applied on full attention + # layers. + self.rope_scaling = (self.config.rope_scaling + if sliding_window is None else None) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, # type: ignore + rope_scaling=self.rope_scaling, ) # Attention output projection. 
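For orientation, here is a minimal standalone sketch (not part of the patch) of the per-layer pattern this attention hunk wires up: the `layer_types` default mirrors the `Olmo3Config` defined later in this commit, sliding-window attention applies wherever the layer type says so, and rope scaling is applied only on full-attention layers. The `rope_scaling` value below is a made-up placeholder.

```python
# Sketch of Olmo3's per-layer attention pattern; every 4th layer is
# full attention, the rest use a sliding window.
num_hidden_layers = 8
sliding_window = 4096
rope_scaling = {"rope_type": "yarn", "factor": 8.0}  # hypothetical value

layer_types = [
    "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
    for i in range(num_hidden_layers)
]

for layer_idx, layer_type in enumerate(layer_types):
    window = sliding_window if layer_type == "sliding_attention" else None
    # Rope scaling is only applied on full-attention layers.
    scaling = rope_scaling if window is None else None
    print(layer_idx, layer_type, window, scaling)
```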
@@ -176,7 +190,7 @@ class Olmo2MLP(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) hidden_size = config.hidden_size intermediate_size = config.intermediate_size @@ -221,7 +235,7 @@ class Olmo2DecoderLayer(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) # Attention block. self.self_attn = Olmo2Attention(vllm_config=vllm_config, prefix=f"{prefix}.self_attn") @@ -261,7 +275,7 @@ class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config - assert isinstance(self.config, Olmo2Config) + assert isinstance(self.config, (Olmo2Config, Olmo3Config)) self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, @@ -376,7 +390,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - assert isinstance(config, Olmo2Config) + assert isinstance(config, (Olmo2Config, Olmo3Config)) self.config = config self.model = Olmo2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7d7654e846e1c..85759df369850 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -120,6 +120,7 @@ _TEXT_GENERATION_MODELS = { "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), + "Olmo3ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2852d16ec53f4..a76f9504eccef 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -75,6 +75,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( eagle="EAGLEConfig", speculators="SpeculatorsConfig", nemotron="NemotronConfig", + olmo3="Olmo3Config", ovis="OvisConfig", ultravox="UltravoxConfig", step3_vl="Step3VLConfig", diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index cdae59ccc24e0..ca0d5def760a8 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config +from vllm.transformers_utils.configs.olmo3 import Olmo3Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig @@ -45,6 +46,7 @@ __all__ = [ "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", + "Olmo3Config", 
"OvisConfig", "SpeculatorsConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py new file mode 100644 index 0000000000000..874507db43a7f --- /dev/null +++ b/vllm/transformers_utils/configs/olmo3.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + + +class Olmo3Config(PretrainedConfig): + + model_type = "olmo3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + sliding_window=4096, + layer_types=None, + **kwargs, + ): + # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM + # in vLLM. + if "architectures" not in kwargs: + kwargs["architectures"] = ["Olmo2ForCausalLM"] + elif "Olmo3ForCausalLM" in kwargs["architectures"]: + kwargs["architectures"].remove("Olmo3ForCausalLM") + kwargs["architectures"].append("Olmo2ForCausalLM") + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.rms_norm_eps = rms_norm_eps + + self.sliding_window = sliding_window + self.layer_types = layer_types + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" + for i in range(self.num_hidden_layers) + ] From 99bfef841fc705c17a46b364b324da86a2aee3f9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Sat, 13 Sep 2025 14:55:14 +0800 Subject: [PATCH 244/584] [Bugfix] Fix GPUModelRunner has no attribute lora_manager (#24762) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/v1/worker/lora_model_runner_mixin.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index f2ebd5e10210b..01d5f0525c4e2 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -63,8 +63,7 @@ class LoRAModelRunnerMixin: def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...], token_lora_mapping: tuple[int, ...], lora_requests: set[LoRARequest]) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() # Set is_prefill to True, so we always use the 
SGMV kernels on # non-cuda platforms. @@ -75,6 +74,11 @@ class LoRAModelRunnerMixin: is_prefill=True) self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + def _ensure_lora_enabled(self) -> None: + if not hasattr(self, "lora_manager"): + raise RuntimeError( + "LoRA is not enabled. Use --enable-lora to enable LoRA.") + def set_active_loras(self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray) -> None: @@ -172,21 +176,17 @@ class LoRAModelRunnerMixin: self.lora_manager.remove_all_adapters() def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.add_adapter(lora_request) def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.remove_adapter(lora_id) def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.pin_adapter(lora_id) def list_loras(self) -> set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") + self._ensure_lora_enabled() return self.lora_manager.list_adapters() From 5febdc8750ae3d670abbc0c8b83585653627032e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 13 Sep 2025 00:08:20 -0700 Subject: [PATCH 245/584] [Chore] Remove unused batched RoPE op & kernel (#24789) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- csrc/ops.h | 6 - csrc/pos_encoding_kernels.cu | 122 --------------- csrc/torch_bindings.cpp | 10 -- tests/kernels/core/test_pos_encoding.py | 147 +----------------- tests/kernels/core/test_rotary_embedding.py | 22 +-- vllm/_custom_ops.py | 10 -- vllm/_ipex_ops.py | 11 -- .../layers/rotary_embedding/base.py | 36 ++--- 8 files changed, 16 insertions(+), 348 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 2ee5df4cac54c..3ecfd2cd9bf3f 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -122,12 +122,6 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, std::optional<torch::Tensor> key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); -void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - std::optional<torch::Tensor> key, - int64_t head_size, torch::Tensor& cos_sin_cache, - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets); - void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 266f2a0667a24..b5645b33b9073 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel( token_idx, query_stride, key_stride, head_stride); } -template <typename scalar_t, bool IS_NEOX> -__global__ void batched_rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or - // [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, - // head_size] or [num_tokens, num_heads, - // head_size] - scalar_t* __restrict__ key, // nullptr or - // [batch_size, seq_len, num_kv_heads, - // head_size] or [num_tokens, num_kv_heads, - // head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // - // 2] - const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] - const 
int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int64_t head_stride, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; - const scalar_t* cache_ptr = - cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; - - apply_rotary_embedding<scalar_t, IS_NEOX>( - query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride, head_stride); -} - } // namespace vllm void rotary_embedding( @@ -211,96 +182,3 @@ void rotary_embedding( } }); } - -/* -Batched version of rotary embedding, pack multiple LoRAs together -and process in batched manner. -*/ -void batched_rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - std::optional<torch::Tensor> - key, // null or - // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - int64_t head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] -) { - // num_tokens = batch_size * seq_len - int64_t num_tokens = cos_sin_cache_offsets.size(0); - TORCH_CHECK( - positions.size(0) == num_tokens || positions.numel() == num_tokens, - "positions must have the same num_tokens or batch_size as " - "cos_sin_cache_offsets"); - - int positions_ndim = positions.dim(); - // Make sure num_tokens dim is consistent across positions, query, and key - TORCH_CHECK( - positions_ndim == 1 || positions_ndim == 2, - "positions must have shape [num_tokens] or [batch_size, seq_len]"); - if (positions_ndim == 1) { - TORCH_CHECK(query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)), - "query, key and positions must have the same number of tokens"); - } - if (positions_ndim == 2) { - TORCH_CHECK( - query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)) && - query.size(1) == positions.size(1) && - (!key.has_value() || key->size(1) == positions.size(1)), - "query, key and positions must have the same batch_size and seq_len"); - } - - // Make sure head_size is valid for query and key - int query_hidden_size = query.numel() / num_tokens; - int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0; - TORCH_CHECK(query_hidden_size % head_size == 0); - TORCH_CHECK(key_hidden_size % head_size == 0); - - // Make sure query and key have concistent number of heads - int num_heads = query_hidden_size / head_size; - int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads; - TORCH_CHECK(num_heads % num_kv_heads == 0); - - int seq_dim_idx = positions_ndim - 1; - int64_t query_stride = query.stride(seq_dim_idx); - int64_t key_stride = key.has_value() ? 
key->stride(seq_dim_idx) : 0; - // Determine head stride: for [*, heads, head_size] use stride of last dim; - // for flat [*, heads*head_size], heads blocks are contiguous of size - // head_size - int query_ndim = query.dim(); - int64_t head_stride = - (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; - - dim3 grid(num_tokens); - dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { - if (is_neox) { - vllm::batched_rotary_embedding_kernel<scalar_t, true> - <<<grid, block, 0, stream>>>( - positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(), - key.has_value() ? key->data_ptr<scalar_t>() : nullptr, - cos_sin_cache.data_ptr<scalar_t>(), - cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } else { - vllm::batched_rotary_embedding_kernel<scalar_t, false> - <<<grid, block, 0, stream>>>( - positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(), - key.has_value() ? key->data_ptr<scalar_t>() : nullptr, - cos_sin_cache.data_ptr<scalar_t>(), - cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } - }); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 9e4586e8f0450..c63fa7cffc78d 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -214,16 +214,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor cos_sin_cache, bool is_neox) -> ()"); ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding); - // Apply GPT-NeoX or GPT-J style rotary embedding to query and key - // (supports multiple loras). - ops.def( - "batched_rotary_embedding(Tensor positions, Tensor! query," - " Tensor!? key, int head_size," - " Tensor cos_sin_cache, bool is_neox," - " int rot_dim," - " Tensor cos_sin_cache_offsets) -> ()"); - ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding); - // Quantization ops #ifndef USE_ROCM // Quantized GEMM for AWQ. 
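With the batched kernel gone, per-token cache offsets can simply be folded into `positions` before the ordinary cos/sin lookup — which is exactly what the native path did with its `offsets` argument. A rough, self-contained sketch of that equivalence follows (toy shapes; `neox_rope` is an illustrative name, not a vLLM API, and the cache rows are assumed to store cos followed by sin):

```python
import torch

def neox_rope(positions, query, cos_sin_cache, offsets=None):
    # Folding offsets into positions was all the removed batched op added.
    if offsets is not None:
        positions = positions + offsets
    cos_sin = cos_sin_cache.index_select(0, positions.flatten())
    cos, sin = cos_sin.chunk(2, dim=-1)          # rows assumed [cos | sin]
    cos, sin = cos[:, None, :], sin[:, None, :]  # broadcast over heads
    q1, q2 = query.chunk(2, dim=-1)              # NeoX style: split halves
    return torch.cat((q1 * cos - q2 * sin, q2 * cos + q1 * sin), dim=-1)

cache = torch.randn(100, 8)   # toy [max_position, rot_dim] cache
q = torch.randn(4, 2, 8)      # [num_tokens, num_heads, rot_dim]
pos = torch.tensor([1, 2, 3, 4])
ofs = torch.ones(4, dtype=torch.long)
# Pre-adding offsets to positions reproduces the offset-aware lookup.
assert torch.equal(neox_rope(pos + ofs, q, cache),
                   neox_rope(pos, q, cache, offsets=ofs))
```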
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index ab6f1ccf881fd..bf9b1d9b4401a 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from itertools import accumulate, product +from itertools import product from typing import Callable, Optional import pytest @@ -111,151 +111,6 @@ def test_rotary_embedding( "expected returned key to be None" -@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) -@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("seq_len", SEQ_LENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_key", USE_KEY) -@torch.inference_mode() -def test_batched_rotary_embedding( - is_neox_style: bool, - tensor_shape_fn: Callable[[int, int, int, int], tuple[int]], - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - use_key: bool, - max_position: int = 8192, - base: float = 10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": (1, ) - }) - rope = rope.to(dtype=dtype, device=torch.get_default_device()) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size) - query = torch.randn(query_shape, dtype=dtype) - key = torch.randn_like(query) if use_key else None - - # slice tensor if required, noop otherwise - query = query[..., :head_size] - key = key[..., :head_size] if use_key else None - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_query, ref_key = rope.forward_native(positions, query, key) - out_query, out_key = rope.forward(positions, - query, - key, - offsets=torch.zeros(batch_size * seq_len, - dtype=torch.long, - device=device)) - # Compare the results. 
- torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) - else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" - - -@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("seq_len", SEQ_LENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_key", USE_KEY) -@torch.inference_mode() -def test_batched_rotary_embedding_multi_lora( - is_neox_style: bool, - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - use_key: bool, - max_position: int = 8192, - base: float = 10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - scaling_factors: list[int] = [1, 2, 4] - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": tuple(scaling_factors) - }) - rope = rope.to(dtype=dtype, device=torch.get_default_device()) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) - key = torch.randn_like(query) if use_key else None - - offset_map = torch.tensor( - list( - accumulate([0] + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ]))) - query_types = torch.randint(0, - len(scaling_factors), (batch_size, seq_len), - device=device) - query_offsets = offset_map[query_types] - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_query, ref_key = rope.forward_native(positions, query, key, - query_offsets) - out_query, out_key = rope.forward(positions, query, key, - query_offsets.flatten()) - # Compare the results. - torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) - if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) - else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" - - @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index d1fd960bf115c..5857dd5ba3fad 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -16,20 +16,14 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding def rotary_embedding_opcheck(rot, positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None): + key: Optional[torch.Tensor] = None): cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. 
- if offsets is not None: - opcheck(torch.ops._C.batched_rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style, rot.rotary_dim, offsets)) - else: - opcheck(torch.ops._C.rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style)) + # ops.rotary_embedding() is a in-place operation + # that updates the query and key tensors. + opcheck(torch.ops._C.rotary_embedding, + (positions, query, key, rot.head_size, cos_sin_cache, + rot.is_neox_style)) @pytest.mark.parametrize("device", ["cuda"]) @@ -65,10 +59,6 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, key = key[..., :head_size] if use_key else None rotary_embedding_opcheck(rot, positions, query, key) - offsets = torch.zeros(batch_size * seq_len, - device=device, - dtype=torch.long) - rotary_embedding_opcheck(rot, positions, query, key, offsets) # if we have a contiguous head stride, test the alternate # [..., num_heads * head_dim] shape/layout diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3b2e859b76af7..93b4f87ed260c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -257,16 +257,6 @@ def rotary_embedding( cos_sin_cache, is_neox) -def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor], head_size: int, - cos_sin_cache: torch.Tensor, is_neox: bool, - rot_dim: int, - cos_sin_cache_offsets: torch.Tensor) -> None: - torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, - cos_sin_cache, is_neox, rot_dim, - cos_sin_cache_offsets) - - # layer norm ops def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index c2868c040aa16..59b0aed321502 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -148,17 +148,6 @@ class ipex_ops: head_size, cos_sin_cache, is_neox, rot_dim) - @staticmethod - def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor, head_size: int, - cos_sin_cache: torch.Tensor, is_neox: bool, - rot_dim: int, - cos_sin_cache_offsets: torch.Tensor) -> None: - ipex.llm.functional.rotary_embedding_batched(positions, query, key, - head_size, cos_sin_cache, - is_neox, rot_dim, - cos_sin_cache_offsets) - @staticmethod def rms_norm(input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor: diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index be25e90abf821..db50eb08db3ff 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -62,11 +62,8 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """A PyTorch-native implementation of forward().""" - if offsets is not None: - positions = positions + offsets positions = positions.flatten() num_tokens = positions.shape[0] cos_sin = self.cos_sin_cache.index_select(0, positions) @@ -96,7 +93,6 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm import _custom_ops as ops @@ -107,16 +103,10 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) - # 
ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, - self.is_neox_style, self.rotary_dim, - offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key def forward_xpu( @@ -124,29 +114,21 @@ class RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm._ipex_ops import ipex_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. if key is None: # XPU kernel doesn't support key=None so fall back to native impl # TODO(sarckk): add support for optional key in # ipex.llm.functional.rotary_embedding_batched - return self.forward_native(positions, query, key, offsets) + return self.forward_native(positions, query, key) else: - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, - self.head_size, - self.cos_sin_cache, - self.is_neox_style, - self.rotary_dim, offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key def extra_repr(self) -> str: From 9a8966bcc207036e9b65bd7951e808254ffba542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Sat, 13 Sep 2025 16:13:44 +0900 Subject: [PATCH 246/584] [Docs] Fix warnings in mkdocs build (continued) (#24791) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- vllm/distributed/eplb/eplb_state.py | 8 +++--- vllm/distributed/eplb/rebalance_algo.py | 6 ++--- .../kv_transfer/kv_connector/v1/base.py | 5 ++-- .../kv_connector/v1/lmcache_connector.py | 5 ++-- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 7 ++--- .../kv_connector/v1/p2p/tensor_memory_pool.py | 5 ++-- .../v1/shared_storage_connector.py | 7 ++--- .../kv_transfer/kv_pipe/pynccl_pipe.py | 4 +-- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/keye.py | 12 +++------ vllm/model_executor/models/keye_vl1_5.py | 11 ++++---- vllm/model_executor/models/llava.py | 4 ++- vllm/model_executor/models/llava_next.py | 5 ++-- vllm/model_executor/models/mistral3.py | 4 ++- vllm/model_executor/models/mllama4.py | 9 +++---- vllm/model_executor/models/moonvit.py | 18 ++++++++++--- vllm/model_executor/models/phi4_multimodal.py | 4 +-- vllm/model_executor/models/qwen2_vl.py | 12 +++------ vllm/model_executor/models/siglip2navit.py | 27 ++++++------------- vllm/model_executor/models/ultravox.py | 9 ++++--- vllm/model_executor/models/zamba2.py | 4 +-- vllm/transformers_utils/config.py | 13 ++++----- vllm/transformers_utils/configs/jais.py | 8 +++--- vllm/transformers_utils/configs/ultravox.py | 4 --- .../processors/deepseek_vl2.py | 9 +++---- 
vllm/transformers_utils/runai_utils.py | 4 +-- vllm/transformers_utils/s3_utils.py | 6 ++--- 27 files changed, 102 insertions(+), 110 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index d5ab61473ab01..8f8baa7d59db7 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -337,11 +337,11 @@ class EplbState: Args: model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load - metrics recorded in this forward pass will not count. Defaults - to `False`. + metrics recorded in this forward pass will not count. Defaults + to `False`. is_profile (bool): If `True`, perform a dummy rearrangement - with maximum communication cost. This is used in `profile_run` - to reserve enough memory for the communication buffer. + with maximum communication cost. This is used in `profile_run` + to reserve enough memory for the communication buffer. log_stats (bool): If `True`, log the expert load metrics. # Stats diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 879b5b9f18240..3564a10dfc684 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -102,14 +102,14 @@ def rebalance_experts_hierarchical( num_groups: int, num_nodes: int, num_gpus: int, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: weight: [num_moe_layers, num_logical_experts] num_physical_experts: number of physical experts after replication num_groups: number of expert groups - num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index cd4561154b78b..7e0b927c5b78f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC): @abstractmethod def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. This is called from the forward context before the @@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC): @abstractmethod def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index c99f538ee4185..2b0abe983fbb3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): # Worker-side methods # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """ Start loading the KV cache from the connector to vLLM's paged KV buffer. 
This is called from the forward context before the @@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1): self._lmcache_engine.wait_for_layer_load(layer_name) def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """ Start saving the a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 2485c57d86ecc..ec72905a0d3ec 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1): # ============================== def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. @@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. @@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1): def get_finished( self, finished_req_ids: set[str], - **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: + **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index b775276d4a846..26070488bad89 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -218,8 +218,9 @@ class TensorMemoryPool: return addr - def load_tensor(self, addr: int, dtype: torch.dtype, - shape: tuple[int, ...], device) -> torch.Tensor: + def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int, + ...], + device: torch.device) -> torch.Tensor: """Loads a tensor from pinned host memory to the specified device. Args: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 25460ed295d30..48fa1a82c6775 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -3,7 +3,7 @@ import hashlib import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional import safetensors import torch @@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1): logger.info("Shared storage path is %s", self._storage_path) def start_load_kv(self, forward_context: "ForwardContext", - **kwargs) -> None: + **kwargs: Any) -> None: """Start loading the KV cache from the connector buffer to vLLM's paged KV buffer. 
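For context, a minimal no-op connector (hypothetical, not shipped in vLLM) showing the worker-side hooks as these hunks retype them, with `**kwargs: Any` spelled out on both:

```python
from typing import Any

import torch

class NoopConnector:
    """Stand-in sketch for a KVConnectorBase_V1 subclass."""

    def start_load_kv(self, forward_context, **kwargs: Any) -> None:
        # A real connector would copy cached KV into vLLM's paged buffer here.
        pass

    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
                      attn_metadata, **kwargs: Any) -> None:
        # A real connector would stream this layer's KV out of the paged
        # buffer, typically asynchronously.
        pass
```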
@@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1): return def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, - attn_metadata: "AttentionMetadata", **kwargs) -> None: + attn_metadata: "AttentionMetadata", + **kwargs: Any) -> None: """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 66120e9a0a1a0..7a79a8cc0c932 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase): """ Receives a tensor and its metadata from the source rank. Blocking call. - Args: - tensor: The received tensor, or `None` if no tensor is received. + Returns: + The received tensor, or `None` if no tensor is received. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index d5b71b057831b..8f8e300c84d71 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -823,7 +823,7 @@ class SupportsEagle3(Protocol): Args: layers: Tuple of layer indices that should output auxiliary - hidden states. + hidden states. """ ... diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index cb4cd60a89177..afe33b4d4ad26 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module): batch. **NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index b3c44d132435c..93a3bf5f98f7b 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor: return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0) -def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int], - torch.Tensor]): +def get_num_patches(grid_thw: torch.Tensor, + num_frames: Union[list[int], torch.Tensor]) -> list[int]: """ Return num_patches per video. Args: - t: tensor with shape [N, ...] 
where each item is a list/tensor - cu_seqlens: list indicating the boundaries of groups + grid_thw: Tensor with shape [N, 3] containing temporal, height, width + dimensions + num_frames: List or tensor indicating the number of frames per video Returns: - list of ints representing the sum of products for each group + List of ints representing the number of patches for each video Examples: >>> # Suppose there are 2 videos with a total of 3 grids diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d692b2783048f..9591deea06ce9 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [LlavaImageInputs][] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a63c18493df5e..5e82f9799e0fe 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each grid patch for each input image. - image_sizes: The original `(height, width)` for each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [LlavaNextImageInputs][] diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 08948960b275c..09479012a03ad 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. - pixel_values: The pixels in each input image. + positions: Position indices for the input tokens. + intermediate_tensors: Intermediate tensors from prior forward pass. + inputs_embeds: Optional tensor of input embeddings. Info: [Mistral3ImagePixelInputs][] diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 6dc77666e1582..2f0e8a2a5e575 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module): ) -> torch.Tensor: r""" Args: - inputs_embeds (`torch.FloatTensor` of shape - `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you - want more control over how to convert `input_ids` indices into + hidden_states: Input tensor of shape + (batch_size, sequence_length, hidden_size). + Hidden states from the model embeddings, representing + the input tokens. associated vectors than the model's internal embedding lookup matrix. 
""" diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 41a2c836b09f3..caa00763fc3d4 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -70,11 +70,15 @@ def multihead_attention( v: torch.Tensor, q_cu_seqlens: Optional[torch.Tensor] = None, k_cu_seqlens: Optional[torch.Tensor] = None, -): +) -> torch.Tensor: """Multi-head attention using flash attention 2. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing. q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q. The first element should be 0 and the last element should be q.shape[0]. @@ -123,8 +127,14 @@ def sdpa_attention( """SDPA attention. Args: - q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim), + q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), or (tot_seqlens, num_heads, head_dim) if packing. + k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), + or (tot_seqlens, num_heads, head_dim) if packing. + q_cu_seqlens: Optional cumulative sequence lengths of q. + k_cu_seqlens: Optional cumulative sequence lengths of k. """ seq_length = q.shape[0] attention_mask = torch.zeros([1, seq_length, seq_length], @@ -387,7 +397,7 @@ class MLP2(nn.Module): def __init__(self, dims: list[int], activation, - bias=True, + bias: bool = True, prefix: str = "", use_data_parallel: bool = False): super().__init__() diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index ab63649b43561..25df9e9261d91 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module): Typically used as a very first layer in a model. Args: - input_size: int - layer input size. + config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) + object containing model parameters. """ def __init__(self, config: Phi4MultimodalAudioConfig): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cf15dfa67743f..d08181c5fd53b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, batch. **NOTE**: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. + otherwise it will be `(seq_len,)`. + intermediate_tensors: Intermediate tensors from prior forward pass. 
+            inputs_embeds: Optional tensor of input embeddings.
         """

         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index a86700fe68dd3..7d90d3a7ef128 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
                 position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
         """
         Args:
-            hidden_states (`torch.FloatTensor`):
-                Input to the layer of shape `(batch, seq_len, embed_dim)`.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all
-                attention layers. See `attentions` under
-                returned tensors for more detail.
+            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
+            cu_seqlens: Cumulative sequence lengths tensor.
+            position_embeddings: Position embeddings tensor.
         """

         residual = hidden_states
@@ -534,19 +531,9 @@ class Siglip2Encoder(nn.Module):
     ) -> torch.Tensor:
         r"""
         Args:
-            inputs_embeds (`torch.FloatTensor` of shape
-                `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to
-                directly pass an embedded representation. This is useful if
-                you want more control over how to convert `input_ids` indices
-                into associated vectors than the model's internal embedding
-                lookup matrix.
-            grid_thws (`torch.LongTensor`):
-                grid shape (num_patches, 3)
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See
-                `hidden_states` under returned tensors for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead
-                of a plain tuple.
+            inputs_embeds: Input tensor of shape
+                (batch_size, sequence_length, hidden_size).
+                Embedded representation of the input tokens.
+            grid_thws: Grid tensor of shape (num_patches, 3)
+                containing grid dimensions.
         """
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 9e28b0c443df4..ad911ebedf895 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         with the `input_ids`.

         Args:
-            audio_features: A batch of audio input chunks [B, N, 80, M].
-            audio_lens: Length of audio frames for each audio chunk [B].
-            audio_token_len: Length of audio tokens for each audio chunk [B'].
-                Note: batch dim is different from batch dim in audio chunks.
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
""" diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 34b9c1ad07d76..86335d48c1454 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -909,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): prefix: Optional prefix for parameter names Raises: - AssertionError: If prefix caching is enabled - (not supported by Mamba) + AssertionError: If prefix caching is enabled + (not supported by Mamba) """ config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a76f9504eccef..fd19d33ca0c89 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -679,20 +679,21 @@ def get_hf_file_to_dict(file_name: str, @cache -def get_pooling_config(model: str, revision: Optional[str] = 'main'): +def get_pooling_config(model: str, + revision: Optional[str] = 'main') -> Optional[dict]: """ This function gets the pooling and normalize config from the model - only applies to sentence-transformers models. Args: - model (str): The name of the Hugging Face model. - revision (str, optional): The specific version - of the model to use. Defaults to 'main'. + model: The name of the Hugging Face model. + revision: The specific version of the model to use. + Defaults to 'main'. Returns: - dict: A dictionary containing the pooling - type and whether normalization is used. + A dictionary containing the pooling type and whether + normalization is used, or None if no pooling configuration is found. """ modules_file_name = "modules.json" diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 767c4ddae870d..d5ca2c7b4751a 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -74,10 +74,10 @@ class JAISConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx (`bool`, *optional*, - defaults to `False`): - Whether to additionally scale attention weights by - `1 / layer_idx + 1`. + scale_attn_by_inverse_layer_idx + (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights + by `1 / layer_idx + 1`. reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): Whether to scale keys (K) prior to computing attention (dot-product) diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index e67479516560f..aaf31d84d0c1a 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -37,10 +37,6 @@ class UltravoxConfig(transformers.PretrainedConfig): The initialization value for the layer normalization. projector_act (`str`, *optional*, defaults to `"swiglu"`): The activation function used by the multimodal projector. - text_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the text model. - audio_model_lora_config (`LoraConfigSimplified`, *optional*): - The LoRA configuration for finetuning the audio model. projector_ln_mid (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization at the middle of the projector or at the end. 
Versions v0.4.1 and below diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 5896bde312657..d1d117b4e2cf4 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -25,6 +25,7 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import math +from typing import Any import torch import torchvision.transforms as T @@ -178,17 +179,15 @@ class DeepseekVLV2Processor(ProcessorMixin): prompt: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ Args: prompt (str): the formatted prompt; - conversations (list[dict]): conversations with a list of messages; images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; - system_prompt (str): the system prompt; - **kwargs: + **kwargs: Additional keyword arguments. Returns: outputs (BaseProcessorOutput): the output of the processor, @@ -259,7 +258,7 @@ class DeepseekVLV2Processor(ProcessorMixin): text: str, images: list[Image.Image], inference_mode: bool = True, - **kwargs, + **kwargs: Any, ): """ diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py index 357c180ed35f2..b7bee1974de5b 100644 --- a/vllm/transformers_utils/runai_utils.py +++ b/vllm/transformers_utils/runai_utils.py @@ -33,7 +33,6 @@ def list_safetensors(path: str = "") -> list[str]: Args: path: The object storage path to list from. - allow_pattern: A list of patterns of which files to pull. Returns: list[str]: List of full object storage paths allowed by the pattern @@ -54,8 +53,7 @@ class ObjectStorageModel: dir: The temporary created directory. Methods: - pull_files(): Pull model from object storage to the temporary - directory. + pull_files(): Pull model from object storage to the temporary directory. 
""" def __init__(self) -> None: diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 62c87c167e682..d17c1afe9b504 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Optional +from typing import Any, Optional from vllm.utils import PlaceholderModule @@ -26,7 +26,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: ] -def glob(s3=None, +def glob(s3: Optional[Any] = None, path: str = "", allow_pattern: Optional[list[str]] = None) -> list[str]: """ @@ -51,7 +51,7 @@ def glob(s3=None, def list_files( - s3, + s3: Any, path: str, allow_pattern: Optional[list[str]] = None, ignore_pattern: Optional[list[str]] = None From abc7989adccf9c4d5ef7b5f909ed46fb32fa8a49 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 13 Sep 2025 08:15:03 +0100 Subject: [PATCH 247/584] [Docs] Remove Neuron install doc as backend no longer exists (#24396) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- README.md | 2 +- docs/README.md | 2 +- docs/features/README.md | 3 - docs/features/quantization/README.md | 26 ++-- docs/getting_started/installation/.nav.yml | 2 - docs/getting_started/installation/README.md | 1 - .../installation/aws_neuron.md | 147 ------------------ 7 files changed, 15 insertions(+), 168 deletions(-) delete mode 100644 docs/getting_started/installation/aws_neuron.md diff --git a/README.md b/README.md index b4a3583c214cf..0c6e5aa6b31d2 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Prefix caching support - Multi-LoRA support diff --git a/docs/README.md b/docs/README.md index 683e1d37563f5..ae95717def4cd 100644 --- a/docs/README.md +++ b/docs/README.md @@ -56,7 +56,7 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Prefix caching support - Multi-LoRA support diff --git a/docs/features/README.md b/docs/features/README.md index de23cd0a90eb5..d8e26ec02aecc 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -76,6 +76,3 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | - -!!! 
note - Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 4605ba7781ed4..4c8377871e141 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -43,19 +43,19 @@ th:not(:first-child) { } </style> -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | -| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. 
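To make the matrix above concrete, here is a minimal sketch of selecting one of the supported methods at load time (the checkpoint name is a placeholder for any model pre-quantized with a method marked ✅︎ for your hardware):

```python
from vllm import LLM

# Minimal sketch: pick a quantization method the matrix above marks as
# supported for your hardware; the AWQ checkpoint name is a placeholder.
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")

outputs = llm.generate(["The capital of France is"])
print(outputs[0].outputs[0].text)
```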
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml index d4a727c926406..ba1f8099a6456 100644 --- a/docs/getting_started/installation/.nav.yml +++ b/docs/getting_started/installation/.nav.yml @@ -3,5 +3,3 @@ nav: - gpu.md - cpu.md - google_tpu.md - - intel_gaudi.md - - aws_neuron.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 8a658b7a9103f..5e57d23f4a1df 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -12,7 +12,6 @@ vLLM supports the following hardware platforms: - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) - [Google TPU](google_tpu.md) -- [AWS Neuron](aws_neuron.md) ## Hardware Plugins diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md deleted file mode 100644 index ff2500f035270..0000000000000 --- a/docs/getting_started/installation/aws_neuron.md +++ /dev/null @@ -1,147 +0,0 @@ -# AWS Neuron - -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and -generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, -and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. -This describes how to set up your environment to run vLLM on Neuron. - -!!! warning - There are no pre-built wheels or images for this device, so you must build vLLM from source. - -## Requirements - -- OS: Linux -- Python: 3.9 or newer -- Pytorch 2.5/2.6 -- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips) -- AWS Neuron SDK 2.23 - -## Configure a new environment - -### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies - -The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this -[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image). - -- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance -- Once inside your instance, activate the pre-installed virtual environment for inference by running - -```bash -source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate -``` - -Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) -for alternative setup instructions including using Docker and manually installing dependencies. - -!!! note - NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) - library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html). - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built Neuron wheels. 
- -### Build wheel from source - -To build and install vLLM from source, run: - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -U -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install -e . -``` - -AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at -<https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2>, which contains several features in addition to what's -available on vLLM V0. Please utilize the AWS Fork for the following features: - -- Llama-3.2 multi-modal support -- Multi-node distributed inference - -Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) - for more details and usage examples. - -To install the AWS Neuron fork, run the following: - -```bash -git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git -cd upstreaming-to-vllm -pip install -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install -e . -``` - -Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested. - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built Neuron images. - -### Build image from source - -See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. - -Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile. - -## Extra information - -[](){ #feature-support-through-nxd-inference-backend } - -### Feature support through NxD Inference backend - -The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend -to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most -[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. - -To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override -as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include - -```python -override_neuron_config={ - "enable_bucketing":False, -} -``` - -or when launching vLLM from the CLI, pass - -```bash ---override-neuron-config "{\"enable_bucketing\":false}" -``` - -Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts -(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. - -### Known limitations - -- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this - [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) - for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI. -- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. 
It is recommended to follow this - [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) - to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. -- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at - runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py) -- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed - to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. -- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer - to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) - to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. -- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches - max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt - to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support - for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is - implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. - -### Environment variables - -- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid - compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the - artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set, - but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts - under this specified path. -- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). -- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend). 
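For readers migrating off the removed backend, the workflow the deleted guide described reduces to the following sketch (historical usage against a Neuron-enabled vLLM build; the artifacts path and model name are placeholders):

```python
import os

# Historical sketch of the removed guide's workflow; the artifacts path
# and model name are placeholders, and a Neuron-enabled vLLM build
# (VLLM_TARGET_DEVICE="neuron") is assumed.
os.environ["NEURON_COMPILED_ARTIFACTS"] = "/opt/neuron-artifacts/llama3"

from vllm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    override_neuron_config={"enable_bucketing": False},
)
```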
From 30498f2a6592263a0e7e42080f8790ea72d9b122 Mon Sep 17 00:00:00 2001 From: Rakesh Asapanna <45640029+rozeappletree@users.noreply.github.com> Date: Sat, 13 Sep 2025 12:45:41 +0530 Subject: [PATCH 248/584] [Doc]: Remove 404 hyperlinks (#24785) Signed-off-by: Rakesh Asapanna <45640029+rozeappletree@users.noreply.github.com> --- docs/examples/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/examples/README.md b/docs/examples/README.md index 3cf93027f4209..94f5efc92f386 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -2,6 +2,6 @@ vLLM's examples are split into three categories: -- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference) -- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving) -- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others) +- If you are using vLLM from within Python code, see the *Offline Inference* section. +- If you are using vLLM from an HTTP application or client, see the *Online Serving* section. +- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section. From dbeee3844cf93278e18e736ea7f177648de2d6aa Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Sat, 13 Sep 2025 15:16:24 +0800 Subject: [PATCH 249/584] [Perf] Use NVIDIA hardware-accelerated instruction for float to fp8_e4m3 quantization (#24757) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- csrc/quantization/fp8/common.cuh | 8 ++++++-- csrc/quantization/fp8/nvidia/quant_utils.cuh | 19 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index 1aad6330c44b8..7838f211c59db 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -5,7 +5,9 @@ #include <cmath> -#ifdef USE_ROCM +#ifndef USE_ROCM + #include "nvidia/quant_utils.cuh" +#else #include "amd/quant_utils.cuh" #endif @@ -48,7 +50,9 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, float r = fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>)); #ifndef USE_ROCM - return static_cast<fp8_type>(r); + // Use hardware cvt instruction for fp8 on nvidia + // Currently only support fp8_type = c10::Float8_e4m3fn + return fp8::vec_conversion<fp8_type, float>(r); #else // Use hardware cvt instruction for fp8 on rocm return fp8::cvt_c10<fp8_type>(r); diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh index f8cd1dcba4ab3..5b9c2df8468cb 100644 --- a/csrc/quantization/fp8/nvidia/quant_utils.cuh +++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh @@ -12,13 +12,26 @@ namespace vllm { namespace fp8 { #ifdef ENABLE_FP8 - #if 0 // Disable the following code to reduce the binary size. 
template <typename Tout, typename Tin> -__inline__ __device__ Tout -vec_conversion(const Tin &x, const __nv_fp8_interpretation_t fp8_type) { +__inline__ __device__ Tout vec_conversion( + const Tin& x, const __nv_fp8_interpretation_t fp8_type = __NV_E4M3) { return x; } +// float -> c10::Float8_e4m3fn +template <> +__inline__ __device__ c10::Float8_e4m3fn +vec_conversion<c10::Float8_e4m3fn, float>( + const float& a, const __nv_fp8_interpretation_t fp8_type) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return static_cast<c10::Float8_e4m3fn>(a); + #else + return c10::Float8_e4m3fn(__nv_cvt_float_to_fp8(a, __NV_SATFINITE, fp8_type), + c10::Float8_e4m3fn::from_bits()); + #endif +} + + #if 0 // Disable the following code to reduce the binary size. // fp8 -> half template <> __inline__ __device__ uint16_t vec_conversion<uint16_t, uint8_t>( From 98229db2444b016d572bfd36e685960b3352d900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com> Date: Sat, 13 Sep 2025 09:17:27 +0200 Subject: [PATCH 250/584] [Kernels][DP/EP] Optimize Silu Kernel for R1 (#24054) Signed-off-by: elvircrn <elvircrn@gmail.com> --- .../kernels/benchmark_silu_mul_fp8_quant.py | 674 +++++++++++++++++- csrc/ops.h | 6 + csrc/quantization/activation_kernels.cu | 465 ++++++++++++ csrc/torch_bindings.cpp | 7 + .../moe/test_silu_mul_fp8_quant_deep_gemm.py | 104 ++- .../layers/fused_moe/batched_deep_gemm_moe.py | 141 ++-- 6 files changed, 1269 insertions(+), 128 deletions(-) diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index 0650cbf3cc18e..c7a4066b39d70 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -1,77 +1,675 @@ -#!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time +from collections.abc import Callable +import matplotlib.pyplot as plt +import numpy as np import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm, + silu_mul_fp8_quant_deep_gemm_cuda, ) from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used -def benchmark(E, T, H, G=128, runs=50): - current_platform.seed_everything(42) - y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") - tokens_per_expert = torch.randint( - T // 2, T, size=(E,), dtype=torch.int32, device="cuda" +@triton.jit +def _silu_mul_fp8_quant_deep_gemm( + # Pointers ------------------------------------------------------------ + input_ptr, # 16-bit activations (E, T, 2*H) + y_q_ptr, # fp8 quantized activations (E, T, H) + y_s_ptr, # 16-bit scales (E, T, G) + counts_ptr, # int32 num tokens per expert (E) + # Sizes --------------------------------------------------------------- + H: tl.constexpr, # hidden dimension (per output) + GROUP_SIZE: tl.constexpr, # elements per group (usually 128) + # Strides for input (elements) --------------------------------------- + stride_i_e, + stride_i_t, + stride_i_h, + # Strides for y_q (elements) ----------------------------------------- + stride_yq_e, + stride_yq_t, + stride_yq_h, + # Strides for y_s (elements) ----------------------------------------- + stride_ys_e, + stride_ys_t, + stride_ys_g, + # Stride for counts (elements) + stride_counts_e, + # Numeric params ------------------------------------------------------ + eps: 
tl.constexpr, + fp8_min: tl.constexpr, + fp8_max: tl.constexpr, + use_ue8m0: tl.constexpr, + # Meta --------------------------------------------------------------- + BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + G = H // GROUP_SIZE + + # map program id -> (e, g) + pid = tl.program_id(0) + e = pid // G + g = pid % G + + e = e.to(tl.int64) + g = g.to(tl.int64) + + # number of valid tokens for this expert + n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) + + cols = tl.arange(0, BLOCK).to(tl.int64) + mask = cols < BLOCK + + base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h + base_gate_offset = base_input_offset + cols * stride_i_h + base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h + base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h + base_ys_offset = e * stride_ys_e + g * stride_ys_g + + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): + gate = tl.load( + input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0 + ).to(tl.float32) + up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0) + + gate = gate * (1.0 / (1.0 + tl.exp(-gate))) + y = gate * up + + y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max + if use_ue8m0: + y_s = tl.exp2(tl.ceil(tl.log2(y_s))) + + y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) + + tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask) + tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) + + +def silu_mul_fp8_quant_deep_gemm_triton( + y: torch.Tensor, # (E, T, 2*H) + tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert + num_parallel_tokens, + group_size: int = 128, + eps: float = 1e-10, +) -> tuple[torch.Tensor, torch.Tensor]: + """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales + + y has shape (E, T, 2*H). The first half of the last dimension is + silu-activated, multiplied by the second half, then quantized into FP8. + + Returns `(y_q, y_s)` where + * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] + * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + """ + assert y.ndim == 3, "y must be (E, T, 2*H)" + E, T, H2 = y.shape + assert H2 % 2 == 0, "last dim of y must be even (2*H)" + H = H2 // 2 + G = (H + group_size - 1) // group_size + assert H % group_size == 0, "H must be divisible by group_size" + assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, ( + "tokens_per_expert must be shape (E,)" + ) + tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) + + # allocate outputs + fp8_dtype = torch.float8_e4m3fn + y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) + + # strides (elements) + stride_i_e, stride_i_t, stride_i_h = y.stride() + stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() + + # desired scale strides (elements): (T*G, 1, T) + stride_ys_e = T * G + stride_ys_t = 1 + stride_ys_g = T + y_s = torch.empty_strided( + (E, T, G), + (stride_ys_e, stride_ys_t, stride_ys_g), + dtype=torch.float32, + device=y.device, ) + stride_cnt_e = tokens_per_expert.stride()[0] + + # Static grid over experts and H-groups. 
+ # A loop inside the kernel handles the token dim + grid = (E * G,) + + f_info = torch.finfo(fp8_dtype) + fp8_max = f_info.max + fp8_min = f_info.min + + _silu_mul_fp8_quant_deep_gemm[grid]( + y, + y_q, + y_s, + tokens_per_expert, + H, + group_size, + stride_i_e, + stride_i_t, + stride_i_h, + stride_yq_e, + stride_yq_t, + stride_yq_h, + stride_ys_e, + stride_ys_t, + stride_ys_g, + stride_cnt_e, + eps, + fp8_min, + fp8_max, + is_deep_gemm_e8m0_used(), + BLOCK=group_size, + NUM_STAGES=4, + num_warps=1, + ) + + return y_q, y_s + + +# Parse generation strategies +strategies = ["uniform", "max_t", "first_t"] + + +def benchmark( + kernel: Callable, + E: int, + T: int, + H: int, + total_tokens: int, + num_parallel_tokens: int = 64, + G: int = 128, + runs: int = 200, + num_warmups: int = 20, + gen_strategy: str = "default", + iterations_per_run: int = 20, +): + def generate_data(seed_offset=0): + """Generate input data with given seed offset""" + current_platform.seed_everything(42 + seed_offset) + y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() + + if gen_strategy == "uniform": + r = torch.rand(size=(E,), device="cuda") + r /= r.sum() + r *= total_tokens + tokens_per_expert = r.int() + tokens_per_expert = torch.minimum( + tokens_per_expert, + torch.ones((E,), device=r.device, dtype=torch.int) * T, + ) + elif gen_strategy == "max_t": + tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert.fill_(total_tokens / E) + elif gen_strategy == "first_t": + tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert[0] = min(T, total_tokens) + else: + raise ValueError(f"Unknown generation strategy: {gen_strategy}") + return y, tokens_per_expert + + dataset_count = 4 + # Pre-generate different input matrices for each iteration to avoid cache effects + data_sets = [generate_data(i) for i in range(dataset_count)] + # Warmup - for _ in range(10): - silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G) - torch.cuda.synchronize() + y, tokens_per_expert = data_sets[0] + for _ in range(num_warmups): + kernel( + y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G + ) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) # Benchmark - torch.cuda.synchronize() - start = time.perf_counter() + latencies: list[float] = [] for _ in range(runs): - silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G) - torch.cuda.synchronize() + torch.cuda.synchronize() - avg_time = (time.perf_counter() - start) / runs * 1000 + start_event.record() + for i in range(iterations_per_run): + y, tokens_per_expert = data_sets[i % dataset_count] + kernel( + y, + tokens_per_expert, + num_parallel_tokens=num_parallel_tokens, + group_size=G, + ) + end_event.record() + end_event.synchronize() - # Calculate actual work done (only count valid tokens) + total_time_ms = start_event.elapsed_time(end_event) + per_iter_time_ms = total_time_ms / iterations_per_run + latencies.append(per_iter_time_ms) + + # Use median instead of average for better outlier handling + median_time_ms = np.median(latencies) + median_time_s = median_time_ms / 1000 + + # Calculate actual work done (using first dataset for consistency) + _, tokens_per_expert = data_sets[0] actual_tokens = tokens_per_expert.sum().item() actual_elements = actual_tokens * H # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops 
ops_per_element = 8 total_ops = actual_elements * ops_per_element - gflops = total_ops / (avg_time / 1000) / 1e9 + gflops = total_ops / median_time_s / 1e9 # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes) input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs output_bytes = actual_tokens * H * 1 # H fp8 outputs scale_bytes = actual_tokens * (H // G) * 4 # scales in float32 total_bytes = input_bytes + output_bytes + scale_bytes - memory_bw = total_bytes / (avg_time / 1000) / 1e9 + memory_bw = total_bytes / median_time_s / 1e9 - return avg_time, gflops, memory_bw + HOPPER_BANDWIDTH_TBPS = 3.35 + return ( + median_time_ms, + gflops, + memory_bw, + (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100, + ) +def create_comparison_plot( + ratio, cuda_times, baseline_times, config_labels, strategy_name, id +): + """Create a comparison plot for a specific generation strategy""" + fig, ax = plt.subplots(1, 1, figsize=(16, 6)) + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return fig, ax + + +def create_combined_plot(all_results): + """Create a combined plot with all strategies in one PNG""" + num_strategies = len(all_results) + fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) + + if num_strategies == 1: + axes = [axes] + + for idx, ( + strategy_name, + ratio, + cuda_times, + baseline_times, + config_labels, + ) in enumerate(all_results): + ax = axes[idx] + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.35 + + # Execution Time plot (lower is better) + ax.bar( + x - width / 2, + cuda_times, + width, + label="CUDA Kernel", + alpha=0.8, + color="blue", + ) + ax.bar( + x + width / 2, + baseline_times, + width, + label="Baseline", + alpha=0.8, + color="orange", + ) + + # Add speedup labels over each bar pair + for i in range(len(x)): + speedup = ratio[i] + max_height = max(cuda_times[i], baseline_times[i]) + ax.text( + x[i], + max_height + max_height * 0.02, + f"{speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=9, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + filename = "../../silu_bench/silu_benchmark_combined.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +outer_dim = 7168 configs = [ - (8, 32, 1024), - (16, 64, 2048), - (32, 128, 4096), # DeepSeekV3 Configs - (256, 16, 7168), - (256, 32, 7168), - 
(256, 64, 7168), - (256, 128, 7168), - (256, 256, 7168), - (256, 512, 7168), + (8, 1024, 7168), + # DeepSeekV3 Configs + (32, 1024, 7168), + # DeepSeekV3 Configs (256, 1024, 7168), ] -print(f"GPU: {torch.cuda.get_device_name()}") -print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}") -print("-" * 50) +runs = 100 +num_warmups = 20 -for E, T, H in configs: - try: - time_ms, gflops, gbps = benchmark(E, T, H) - print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}") - except Exception: - print(f"E={E:3d},T={T:4d},H={H:4d} FAILED") +strategy_descriptions = { + "uniform": "Uniform Random", + "max_t": "Even Assignment", + "first_t": "experts[0] = T, experts[1:] = 0", +} + +print(f"GPU: {torch.cuda.get_device_name()}") +print(f"Testing strategies: {', '.join(strategies)}") +print(f"Configurations: {len(configs)} configs") + +all_results = [] + +# Run benchmarks for each strategy +for id, strategy in enumerate(strategies): + print(f"\n{'=' * 60}") + print(f"Testing strategy: {strategy_descriptions[strategy]}") + print(f"{'=' * 60}") + + # Collect benchmark data for both algorithms + config_labels = [] + config_x_axis = [] + all_cuda_results = [] + all_baseline_results = [] + all_ratios = [] + + for E, T, H in configs: + total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] + config_x_axis.append(total_tokens_config) + + cuda_results = [] + baseline_results = [] + ratios = [] + + for total_tokens in total_tokens_config: + config_label = f"E={E},T={T},H={H},TT={total_tokens}" + config_labels.append(config_label) + + # CUDA kernel results + time_ms_cuda, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_cuda, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + cuda_results.append((time_ms_cuda, gflops, gbps, perc)) + + # Baseline results + time_ms_triton, gflops, gbps, perc = benchmark( + silu_mul_fp8_quant_deep_gemm_triton, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + baseline_results.append((time_ms_triton, gflops, gbps, perc)) + ratios.append(time_ms_triton / time_ms_cuda) + + print(f"Completed: {config_label}") + all_cuda_results.append(cuda_results) + all_baseline_results.append(baseline_results) + all_ratios.append(ratios) + + # Store results for combined plotting + all_results.append( + ( + strategy_descriptions[strategy], + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) + ) + + # Print summary table for this strategy + print(f"\nSummary Table - {strategy_descriptions[strategy]}:") + print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") + print("-" * 60) + + for i, (E, T, H) in enumerate(configs): + speedup = baseline_results[i][0] / cuda_results[i][0] + config_label = f"E={E:3d},T={T:4d},H={H:4d}" + print( + f"{config_label:<20} {cuda_results[i][0]:8.5f} " + f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" + ) + + +def create_total_tokens_plot(all_results): + num_strategies = len(all_results) + num_configs = len(configs) + + # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage + fig, axs = plt.subplots( + num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) + ) + + # Add main title to the entire figure + fig.suptitle( + "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", + fontsize=16, + fontweight="bold", + y=0.98, + ) + + # Handle single strategy case + if num_strategies == 1: + axs 
= axs.reshape(1, -1) + + # Handle single config case + if num_configs == 1: + axs = axs.reshape(-1, 2) + + for strategy_idx, result in enumerate(all_results): + ( + strategy_name, + all_ratios, + all_cuda_results, + all_baseline_results, + config_labels, + config_x_axis, + ) = result + + for config_idx in range(num_configs): + # Speedup plot (left column) + ax_speedup = axs[strategy_idx, config_idx * 2] + # Bandwidth plot (right column) + ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1] + + E, T, H = configs[config_idx] + ratios = all_ratios[config_idx] + total_tokens_values = config_x_axis[config_idx] + + # Extract CUDA and Triton bandwidth percentages + cuda_bandwidth_percentages = [ + result[3] for result in all_cuda_results[config_idx] + ] + triton_bandwidth_percentages = [ + result[3] for result in all_baseline_results[config_idx] + ] + + # Plot speedup ratios vs total tokens (left plot) + ax_speedup.plot( + total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 + ) + ax_speedup.set_title( + f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) + ax_speedup.grid(True, alpha=0.3) + + ax_bandwidth.plot( + total_tokens_values, + cuda_bandwidth_percentages, + "ro-", + linewidth=3, + markersize=8, + label="CUDA", + ) + ax_bandwidth.plot( + total_tokens_values, + triton_bandwidth_percentages, + "go-", + linewidth=3, + markersize=8, + label="Triton", + ) + ax_bandwidth.set_title( + f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_bandwidth.set_ylabel( + "% of Peak Bandwidth", fontweight="bold", fontsize=11 + ) + ax_bandwidth.legend(prop={"weight": "bold"}) + ax_bandwidth.grid(True, alpha=0.3) + + # Format x-axis labels for both plots + for ax in [ax_speedup, ax_bandwidth]: + ax.set_xticks(total_tokens_values) + ax.set_xticklabels( + [ + f"{tt // 1000}K" if tt >= 1000 else str(tt) + for tt in total_tokens_values + ], + fontweight="bold", + ) + # Make tick labels bold + for label in ax.get_xticklabels() + ax.get_yticklabels(): + label.set_fontweight("bold") + + # Add value labels on speedup points + for x, y in zip(total_tokens_values, ratios): + ax_speedup.annotate( + f"{y:.2f}x", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=10, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), + ) + + # Add value labels on CUDA bandwidth points + for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, 12), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), + ) + + # Add value labels on Triton bandwidth points + for x, y in zip(total_tokens_values, triton_bandwidth_percentages): + ax_bandwidth.annotate( + f"{y:.1f}%", + (x, y), + textcoords="offset points", + xytext=(0, -15), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3), + ) + + plt.tight_layout() + plt.subplots_adjust(top=0.93) # Make room for main title + filename = "silu_benchmark_total_tokens.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +# Create 
combined plot with all strategies +combined_plot_filename = create_total_tokens_plot(all_results) + +print(f"\n{'=' * 60}") +print("Benchmark Complete!") +print(f"Generated combined plot: {combined_plot_filename}") +print(f"{'=' * 60}") diff --git a/csrc/ops.h b/csrc/ops.h index 3ecfd2cd9bf3f..c65bf431640d5 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -133,6 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& input_global_scale); #endif +void silu_mul_fp8_quant_deep_gemm_cuda( + const at::Tensor& input, // (E, T, 2*H) + const at::Tensor& counts, // (E) + at::Tensor& y_q, // (E, T, H) [OUT] + at::Tensor& y_s, // (E, T, H//group_size) [OUT] + int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens); void mul_and_silu(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 8bc2b9bff3d5a..9ddb5af3052fa 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -9,6 +9,26 @@ #include "quantization/fp8/common.cuh" +#include <c10/util/Float8_e4m3fn.h> + +#ifndef USE_ROCM + #include <cuda_bf16.h> + #include <cuda_fp16.h> + #include <cuda_fp8.h> +#else + #include <hip/hip_bf16.h> + #include <hip/hip_fp16.h> + #include <hip/hip_fp8.h> + +typedef __hip_bfloat162 __nv_bfloat162; +typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat16_raw __nv_bfloat16_raw; + +typedef __hip_fp8_e4m3 __nv_fp8_e4m3; +typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3; +#endif + +#include "core/registration.h" namespace vllm { template <typename T> @@ -87,6 +107,337 @@ __global__ void act_and_mul_quant_kernel( } } } + +__device__ __forceinline__ float silu(float x) { + return (__fdividef(x, (1.f + expf(-x)))); +} + +__device__ __forceinline__ float2 silu2(float2 x) { + return make_float2(silu(x.x), silu(x.y)); +} + +#ifndef USE_ROCM +__device__ __forceinline__ float warp_max(float v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} + +__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} +#endif + +template <typename T, typename U> +__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + auto smem_ptr = reinterpret_cast<void*>(_smem_ptr); + auto glob_ptr = reinterpret_cast<const void*>(_glob_ptr); + const int BYTES = 16; + uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +#else + _smem_ptr[0] = _glob_ptr[0]; +#endif +} + +__device__ __forceinline__ void cp_async_fence() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.commit_group;\n" ::); +#else +#endif +} + +template <int N> +__device__ __forceinline__ void cp_async_wait() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else +#endif +} + +template <> +__device__ __forceinline__ void cp_async_wait<0>() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else +#endif +} + +__device__ 
__forceinline__ float clip(float v, float mmin, float mmax) {
+  return fminf(mmax, fmaxf(v, mmin));
+}
+
+__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v,
+                                              __nv_bfloat16 mmin,
+                                              __nv_bfloat16 mmax) {
+  return __hmin(mmax, __hmax(v, mmin));
+}
+
+__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v,
+                                               __nv_bfloat162 mmin,
+                                               __nv_bfloat162 mmax) {
+  return __hmin2(mmax, __hmax2(v, mmin));
+}
+
+// We use the following values for fp8 min/max:
+//   __nv_fp8_e4m3 = (-448, +448)
+//   __nv_fp8_e4m3uz = (-240.0, +240.0)
+// It is currently assumed that only these two fp8 types are used.
+template <class T>
+constexpr __nv_bfloat16 get_fp8_max() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17376});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264});
+  }
+}
+
+template <class T>
+constexpr __nv_bfloat16 get_fp8_min() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
+  }
+}
+#ifndef USE_ROCM
+template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
+          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
+          int NUM_STAGES = 3>
+__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
+    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
+    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
+
+    // sizes
+    int H, int G,
+
+    // strides (in elements)
+    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
+    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
+    Idx_t stride_ys_g, Idx_t stride_counts_e) {
+  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
+  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
+  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
+  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+
+  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
+  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
+
+  // We split the shared memory in half, corresponding to gate and up matrices:
+  // [...gate_i, ...up_i] where 0 <= i < stages.
+  static constexpr int32_t S_NUM_128 =
+      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
+  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
+  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
+  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
+  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
+
+  const int32_t tid = threadIdx.x;
+  const int32_t warp_id = tid / WARP_SIZE;
+  const int32_t lane_id = tid % WARP_SIZE;
+
+  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
+
+  // block handles one (expert e, group g)
+  int32_t pid = blockIdx.x;
+  int32_t e = pid / G;
+  int32_t g = pid % G;
+
+  const int32_t n_tokens = counts[e * stride_counts_e];
+
+  if (!n_tokens) {
+    return;  // Exit ASAP.
+  }
+
+  const Idx_t stride_i_t_128 = stride_i_t / 8u;
+
+  int32_t n_tokens_lower, n_tokens_upper;
+
+  // Each block i iterates over tokens of a slice of n_tokens =
+  // expert_counts[i], with the size of each chunk being
+  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
+  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
+  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
+    // Specialized path for small expert counts; it could likely be fused
+    // with the general case below.
+    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
+      return;
+    }
+    n_tokens_lower = blockIdx.y;
+    n_tokens_upper = blockIdx.y + 1;
+  } else {
+    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
+    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
+    auto calc_id = [&](int32_t id) {
+      if (id < residual) {
+        return min(n_tokens, id * (chunk_size + 1));
+      } else {
+        return min(n_tokens, id * chunk_size + residual);
+      }
+    };
+    n_tokens_lower = calc_id(blockIdx.y);
+    n_tokens_upper = calc_id(blockIdx.y + 1);
+  }
+
+  if (n_tokens_lower >= n_tokens_upper) {
+    return;
+  }
+
+  // We do calculations here, using constexpr wherever possible.
+  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
+  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
+  const Idx_t base_yq =
+      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
+  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
+  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
+  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
+                      stride_i_t_128 * n_tokens_lower;
+  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
+  auto y_s_ptr =
+      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
+  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
+                 stride_yq_t * n_tokens_lower + 4 * lane_id;
+  int32_t t_load = n_tokens_lower, load_stage_id = 0;
+  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
+  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
+  int32_t stage_offset{};
+
+  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
+  static constexpr int32_t LOAD_STAGE_MOD =
+      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
+
+  // Two halves of all threads in a block conduct global loads for gate and up,
+  // respectively.
+  auto load_and_advance_y_pred = [&] {
+    if (t_load < n_tokens_upper) {
+      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
+      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      stage_offset += LOAD_STAGE_SIZE;
+      stage_offset %= LOAD_STAGE_MOD;
+
+      if (tid < HALF_THREAD_COUNT) {
+        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
+        gate_128_ptr += stride_i_t_128;
+      } else {
+        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
+        up_128_ptr += stride_i_t_128;
+      }
+      ++t_load;
+      ++load_stage_id;
+    }
+    // We fence even if there is nothing to load to simplify pipelining.
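+    // A minimal sketch of the resulting schedule, assuming the default
+    // NUM_STAGES = 3: the prologue issues NUM_STAGES - 1 = 2 loads; each
+    // main-loop iteration then waits until at most one load group is still
+    // in flight (cp_async_wait<NUM_STAGES - 2>), computes on the oldest
+    // stage, and issues one new load, so stage_offset cycles
+    // 0 -> LOAD_STAGE_SIZE -> 2 * LOAD_STAGE_SIZE -> 0.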
+    cp_async_fence();
+  };
+
+  #pragma unroll
+  for (int i = 0; i < NUM_STAGES - 1; i++) {
+    load_and_advance_y_pred();
+  }
+
+  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
+                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
+                          lane_id;
+  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
+
+  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
+  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
+
+  int32_t compute_pipeline_offset_64 = 0;
+
+  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
+    __nv_bfloat16 y_max_bf16 = EPS;
+    __nv_bfloat162 results_bf162[2];
+
+    cp_async_wait<NUM_STAGES - 2>();
+    __syncthreads();
+
+    // We double-buffer pipelined loads so that the next load will
+    // concurrently run with compute without overwrites.
+    load_and_advance_y_pred();
+
+    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
+    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
+
+    // STAGE_SIZE must also be constexpr!
+    compute_pipeline_offset_64 += STAGE_SIZE;
+    compute_pipeline_offset_64 %= STAGE_MOD;
+
+    // Each thread loads one 64-bit word (4 bfloat16 values) for gate and
+    // one for up into registers.
+    __int64_t gate64 = *s_gate_compute_64;
+    __nv_bfloat162* s_gate_compute_32 =
+        reinterpret_cast<__nv_bfloat162*>(&gate64);
+
+    __int64_t up64 = *s_up_compute_64;
+    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      // For silu, we make sure that div is emitted.
+      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
+      results_bf162[i] = __float22bfloat162_rn(gate);
+    }
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
+    }
+
+    auto _y_max2 =
+        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
+
+    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+
+    // An entire group is assigned to a single warp, so a simple warp reduce
+    // is used.
+    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
+
+    if constexpr (USE_UE8M0) {
+      y_s = hexp2(hceil(hlog2(y_s)));
+    }
+
+    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
+
+    auto y_s2 = make_bfloat162(inv_y, inv_y);
+
+  #pragma unroll
+    for (int32_t i = 0; i < 2; ++i) {
+      results_bf162[i] =
+          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
+               __bfloat162bfloat162(fp8_max));
+    }
+
+    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
+    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
+    y_q_ptr += stride_yq_t;
+
+    if (lane_id == 0) {
+      *y_s_ptr = y_s;
+      y_s_ptr += stride_ys_t;
+    }
+  }
+}
+#endif
+
 }  // namespace vllm

 // Launch activation, gating, and quantize kernel.
@@ -119,3 +470,117 @@ void silu_and_mul_quant(torch::Tensor& out,  // [..., d]
   TORCH_CHECK(input.size(-1) % 2 == 0);
   LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
+
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
+#ifndef USE_ROCM
+  // This kernel relies heavily on cp.async and fp8 support.
+  // This kernel currently only supports H % 128 == 0 and assumes a
+  // fixed GROUP_SIZE of 128.
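+  // (input is (E, T, 2*H), so the input.size(-1) % 256 == 0 check below is
+  //  equivalent to requiring H % 128 == 0.)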
+  TORCH_CHECK(input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
+              y_q.dtype() == torch::kFloat8_e4m3fnuz);
+  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
+  TORCH_CHECK(input.size(-1) % 256 == 0);
+
+  // Check that num_parallel_tokens is a power of 2 between 1 and 64.
+  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
+  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
+
+  using Idx_t = int64_t;
+
+  Idx_t E = input.size(0);
+  Idx_t T = input.size(1);
+  Idx_t H = input.size(2) / 2;
+  Idx_t stride_i_e = input.stride(0);
+  Idx_t stride_i_t = input.stride(1);
+  Idx_t stride_i_h = input.stride(2);
+  Idx_t stride_yq_e = y_q.stride(0);
+  Idx_t stride_yq_t = y_q.stride(1);
+  Idx_t stride_yq_h = y_q.stride(2);
+  Idx_t stride_ys_e = y_s.stride(0);
+  Idx_t stride_ys_t = y_s.stride(1);
+  Idx_t stride_ys_g = y_s.stride(2);
+
+  Idx_t stride_counts_e = counts.stride(0);
+
+  static constexpr int GROUP_SIZE = 128;
+
+  #define KERNEL_FN                                                          \
+    if (use_ue8m0) {                                                         \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,     \
+                                                NUM_PARALLEL_TOKENS, true>   \
+          <<<grid, block, 0, stream>>>(                                      \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),            \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                 \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,      \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t,  \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,            \
+              stride_counts_e);                                              \
+    } else {                                                                 \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,     \
+                                                NUM_PARALLEL_TOKENS, false>  \
+          <<<grid, block, 0, stream>>>(                                      \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),            \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                 \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,      \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t,  \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,            \
+              stride_counts_e);                                              \
+    }
+
+  #define KERNEL_CALL_H                                       \
+    if (H % (4 * GROUP_SIZE) == 0) {                          \
+      static constexpr int NUM_WARPS = 4;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    } else {                                                  \
+      static constexpr int NUM_WARPS = 1;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    }
+
+  #define KERNEL_CALL_TOP_LEVEL                      \
+    if (num_parallel_tokens == 1) {                  \
+      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 2) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 4) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 8) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 16) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 16; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 32) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 32; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 64) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 64; \
+      KERNEL_CALL_H                                  \
+    }
+
+  Idx_t G;
+  dim3 block, grid;
+  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
+    G = H / Idx_t(group_size * num_warps);
+    grid = dim3(E * G, _num_parallel_tokens);
+    block = dim3(num_warps * WARP_SIZE);
+  };
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
+                          "silu_mul_fp8_quant_deep_gemm_kernel",
+                          [&] {
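+                            // KERNEL_CALL_TOP_LEVEL (defined above) expands
+                            // to one kernel instantiation per (use_ue8m0,
+                            // NUM_PARALLEL_TOKENS, NUM_WARPS) combination:
+                            // num_parallel_tokens is a power of two in
+                            // [1, 64] (checked earlier) and NUM_WARPS is 4
+                            // when H % 512 == 0, else 1, so the kernel's
+                            // stage arithmetic stays constexpr.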
KERNEL_CALL_TOP_LEVEL }); + +#endif +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c63fa7cffc78d..81aca7b8860d5 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -32,6 +32,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #define stride_tag #endif + ops.def( + "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! " + "y_q, Tensor! y_s, int group_size, " + "bool use_ue8m0, int num_parallel_tokens) -> ()"); + ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA, + &silu_mul_fp8_quant_deep_gemm_cuda); + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 5a0379dfb4475..383b5ebfba9b7 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -5,28 +5,52 @@ import pytest import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm) + silu_mul_fp8_quant_deep_gemm_cuda) from vllm.platforms import current_platform +from vllm.utils import cdiv + +fp8_dtype = torch.float8_e4m3fn -# (E, T, H, group_size, seed) CASES = [ - (1, 1, 128, 64, 0), - (1, 4, 128, 128, 0), - (2, 4, 256, 128, 0), - (32, 64, 256, 128, 0), - (17, 31, 768, 128, 0), + (1, 1, 128, fp8_dtype), + (1, 4, 128, fp8_dtype), + (2, 4, 256, fp8_dtype), + (32, 64, 256, fp8_dtype), + (17, 31, 768, fp8_dtype), + (1, 1, 128 * 1, fp8_dtype), + (1, 1, 128 * 2, fp8_dtype), + (1, 1, 128 * 3, fp8_dtype), + (1, 1, 128 * 4, fp8_dtype), + (8, 16, 128 * 1, fp8_dtype), + (8, 16, 128 * 2, fp8_dtype), + (8, 16, 128 * 3, fp8_dtype), + (8, 16, 128 * 4, fp8_dtype), + (8, 64, 7168, fp8_dtype), + (8, 128, 7168, fp8_dtype), + (8, 256, 7168, fp8_dtype), + (8, 512, 7168, fp8_dtype), + (8, 1024, 7168, fp8_dtype), + (256, 8, 7168, fp8_dtype), + (256, 16, 7168, fp8_dtype), + (256, 32, 7168, fp8_dtype), + (256, 64, 7168, fp8_dtype), + + # Only add a few fnuz tests to help with long CI times. 
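+    # (float8_e4m3fnuz is the fnuz variant used on ROCm; its max magnitude
+    # is 240 rather than 448, so these cases exercise the other
+    # fp8_min/fp8_max specialization of the kernel.)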
+ (8, 512, 7168, torch.float8_e4m3fnuz), + (8, 1024, 7168, torch.float8_e4m3fnuz), ] -@pytest.mark.parametrize("E,T,H,group_size,seed", CASES) +@pytest.mark.parametrize("E,T,H,fp8_type", CASES) @torch.inference_mode() -def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): - current_platform.seed_everything(seed) +def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): + group_size = 128 + current_platform.seed_everything(42) # Input tensor of shape (E, T, 2*H) y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") tokens_per_expert = torch.randint( - low=0, + low=T // 2, high=T, size=(E, ), dtype=torch.int32, @@ -34,45 +58,59 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): ) # Run the Triton kernel - y_q, y_s = silu_mul_fp8_quant_deep_gemm(y, - tokens_per_expert, - group_size=group_size, - eps=1e-10) + y_q, y_s = silu_mul_fp8_quant_deep_gemm_cuda(y, + tokens_per_expert, + group_size=group_size) - # Reference implementation - fp8_info = torch.finfo(torch.float8_e4m3fn) + torch.cuda.synchronize() + fp8_info = torch.finfo(fp8_dtype) fp8_max = fp8_info.max fp8_min = fp8_info.min eps = 1e-10 - # Compute silu activation and elementwise multiplication - y1 = y[..., :H] + y1 = y[..., :H].float() y2 = y[..., H:] silu_x = y1 * torch.sigmoid(y1) merged = silu_x * y2 - # Compute reference scales and quantized output, skipping padded tokens for e in range(E): nt = tokens_per_expert[e].item() - ref_s = torch.empty((T, H // group_size), + ref_s = torch.empty((T, cdiv(H, group_size)), dtype=torch.float32, device="cuda") - ref_q = torch.empty((T, H), dtype=torch.float8_e4m3fn, device="cuda") + ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda") + for t in range(nt): - data = merged[e, t] - data_grp = data.view(H // group_size, group_size) - amax = data_grp.abs().amax(dim=1).clamp(min=eps) - scale = amax / fp8_max + data = merged[e, t].float() + ref_q_row = torch.empty_like(data) - scaled = data / scale.repeat_interleave(group_size) - clamped = scaled.clamp(fp8_min, fp8_max) - q = clamped.to(torch.float8_e4m3fn) + # process full groups + n_full_groups = H // group_size + if n_full_groups > 0: + data_grp = data[:n_full_groups * group_size].view( + n_full_groups, group_size) + amax = data_grp.abs().amax(dim=1).clamp(min=eps) + scale = amax / fp8_max + scaled = data[:n_full_groups * + group_size] / scale.repeat_interleave(group_size) + ref_q_row[:n_full_groups * group_size] = scaled.clamp( + fp8_min, fp8_max).to(fp8_dtype) + ref_s[t, :n_full_groups] = scale - ref_s[t] = scale - ref_q[t] = q + # process remainder group + rem = H % group_size + if rem > 0: + data_rem = data[-rem:] + amax = data_rem.abs().amax().clamp(min=eps) + scale = amax / fp8_max + scaled = data_rem / scale + ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype) + ref_s[t, -1] = scale - y_se = y_s[e] - y_qe = y_q[e] + ref_q[t] = ref_q_row + + y_se = y_s[e].float() + y_qe = y_q[e].float() torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2) torch.testing.assert_close( diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index a5326dfe84f6d..0ab6355f41565 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from math import log2 from typing import Optional import 
torch

@@ -10,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate)
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked,
                                   is_deep_gemm_e8m0_used)
@@ -24,35 +26,28 @@ def _silu_mul_fp8_quant_deep_gemm(
     y_q_ptr,  # fp8 quantized activations (E, T, H)
     y_s_ptr,  # 16-bit scales (E, T, G)
     counts_ptr,  # int32 num tokens per expert (E)
-    # Sizes ---------------------------------------------------------------
     H: tl.constexpr,  # hidden dimension (per output)
     GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
-    # Strides for input (elements) ---------------------------------------
     stride_i_e,
     stride_i_t,
     stride_i_h,
-    # Strides for y_q (elements) -----------------------------------------
     stride_yq_e,
     stride_yq_t,
     stride_yq_h,
-    # Strides for y_s (elements) -----------------------------------------
     stride_ys_e,
     stride_ys_t,
     stride_ys_g,
-    # Stride for counts (elements)
     stride_counts_e,
-    # Numeric params ------------------------------------------------------
     eps: tl.constexpr,
     fp8_min: tl.constexpr,
     fp8_max: tl.constexpr,
     use_ue8m0: tl.constexpr,
-    # Meta ---------------------------------------------------------------
     BLOCK: tl.constexpr,
     NUM_STAGES: tl.constexpr,
@@ -101,17 +96,15 @@ def _silu_mul_fp8_quant_deep_gemm(
     tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
 
 
-def silu_mul_fp8_quant_deep_gemm(
+def silu_mul_fp8_quant_deep_gemm_cuda(
     y: torch.Tensor,  # (E, T, 2*H)
     tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens=16,
     group_size: int = 128,
-    eps: float = 1e-10,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
-
-    y has shape (E, T, 2*H). The first half of the last dimension is
+    y has shape (E, T, 2*H). The first half of the last dimension is
     silu-activated, multiplied by the second half, then quantized into FP8.
-
     Returns `(y_q, y_s)` where
     * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
     * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
@@ -120,22 +113,17 @@
     E, T, H2 = y.shape
     assert H2 % 2 == 0, "last dim of y must be even (2*H)"
     H = H2 // 2
-    G = H // group_size
-    assert H % group_size == 0, "H must be divisible by group_size"
-    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, \
-        "tokens_per_expert must be shape (E,)"
+    G = (H + group_size - 1) // group_size
+    assert H % 8 == 0, "H must be divisible by 8"
+    assert group_size == 128, "group_size must be 128"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E
+
    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
 
-    # allocate outputs
     fp8_dtype = torch.float8_e4m3fn
     y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
 
-    # strides (elements)
-    stride_i_e, stride_i_t, stride_i_h = y.stride()
-    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
-
-    # desired scale strides (elements): (T*G, 1, T)
     stride_ys_e = T * G
     stride_ys_t = 1
     stride_ys_g = T
@@ -144,47 +132,86 @@
                       dtype=torch.float32,
                       device=y.device)
 
-    stride_cnt_e = tokens_per_expert.stride()[0]
+    use_ue8m0 = is_deep_gemm_e8m0_used()
 
-    # Static grid over experts and H-groups.
-    # A loop inside the kernel handles the token dim
-    grid = (E * G, )
+    if E <= 16:
+        max_empirical_parallelism = 64
+    elif E <= 32:
+        max_empirical_parallelism = 16
+    else:
+        max_empirical_parallelism = 4
 
-    f_info = torch.finfo(fp8_dtype)
-    fp8_max = f_info.max
-    fp8_min = f_info.min
+    # Never launch more parallel token chunks than there are tokens (T);
+    # clip num_parallel_tokens to a power of two no larger than the
+    # empirical parallelism limit above.
+    num_parallel_tokens = max(
+        1,
+        min(max_empirical_parallelism, 2**int(log2(min(num_parallel_tokens,
+                                                       T)))))
+    cuda_arch = current_platform.get_device_capability(
+        device_id=y.device.index).to_int()
 
-    _silu_mul_fp8_quant_deep_gemm[grid](
-        y,
-        y_q,
-        y_s,
-        tokens_per_expert,
-        H,
-        group_size,
-        stride_i_e,
-        stride_i_t,
-        stride_i_h,
-        stride_yq_e,
-        stride_yq_t,
-        stride_yq_h,
-        stride_ys_e,
-        stride_ys_t,
-        stride_ys_g,
-        stride_cnt_e,
-        eps,
-        fp8_min,
-        fp8_max,
-        is_deep_gemm_e8m0_used(),
-        BLOCK=group_size,
-        NUM_STAGES=4,
-        num_warps=1,
-    )
+    if cuda_arch >= 80:
+        torch.ops._C.silu_mul_fp8_quant_deep_gemm_cuda(y, tokens_per_expert,
+                                                       y_q, y_s, group_size,
+                                                       use_ue8m0,
+                                                       num_parallel_tokens)
+    else:
+        # Default to the Triton kernel if not on CUDA or if the arch is
+        # too old
+        y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+        stride_cnt_e = tokens_per_expert.stride()[0]
+
+        # Static grid over experts and H-groups.
+        # A loop inside the kernel handles the token dim
+        grid = (E * G, )
+        # strides (elements)
+        stride_i_e, stride_i_t, stride_i_h = y.stride()
+        stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+        # desired scale strides (elements): (T*G, 1, T)
+        stride_ys_e = T * G
+        stride_ys_t = 1
+        stride_ys_g = T
+        y_s = torch.empty_strided(
+            (E, T, G),
+            (stride_ys_e, stride_ys_t, stride_ys_g),
+            dtype=torch.float32,
+            device=y.device,
+        )
+        f_info = torch.finfo(fp8_dtype)
+        fp8_max = f_info.max
+        fp8_min = f_info.min
+        eps: float = 1e-10
+        _silu_mul_fp8_quant_deep_gemm[grid](
+            y,
+            y_q,
+            y_s,
+            tokens_per_expert,
+            H,
+            group_size,
+            stride_i_e,
+            stride_i_t,
+            stride_i_h,
+            stride_yq_e,
+            stride_yq_t,
+            stride_yq_h,
+            stride_ys_e,
+            stride_ys_t,
+            stride_ys_g,
+            stride_cnt_e,
+            eps,
+            fp8_min,
+            fp8_max,
+            is_deep_gemm_e8m0_used(),
+            BLOCK=group_size,
+            NUM_STAGES=4,
+            num_warps=1,
+        )
     return y_q, y_s
 
 
 class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-    # The Deep Gemm kernels only support block size of 128
     DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128]
 
@@ -297,8 +324,8 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, w1_scale),
                                      workspace1, expert_num_tokens,
                                      expected_m)
 
-        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1,
-                                                      expert_num_tokens)
+        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm_cuda(
+            workspace1, expert_num_tokens)
 
         fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, w2_scale), output,
                                      expert_num_tokens, expected_m)

From 1da0f1441d934c4fa9257fdc6807da30b52e38a3 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Sat, 13 Sep 2025 08:27:04 +0100
Subject: [PATCH 251/584] [Core][Multimodal] Cache `supports_kw` (#24773)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/utils/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 7984f1958ece2..f13381ecd9ff3 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2082,6 +2082,7 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
     return await task(*args, **kwargs)
 
 
+@lru_cache
 def supports_kw(
     callable: Callable[...,
object], kw_name: str, From 59d7ffc17f4c948b4d25d014e4f90d0cd4c20990 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Sat, 13 Sep 2025 03:29:19 -0400 Subject: [PATCH 252/584] [CI Failure] Fix test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe (#24750) Signed-off-by: mgoin <mgoin64@gmail.com> --- csrc/attention/mla/sm100_cutlass_mla_kernel.cu | 1 + tests/kernels/moe/test_mxfp4_moe.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index c60f1823b8a1d..d1874515cc8fd 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode( torch::Tensor const& seq_lens, torch::Tensor const& page_table, torch::Tensor const& workspace, + double sm_scale, int64_t num_kv_splits) { TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode"); } diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index 9fd72ee152b55..a3b8f07638d9a 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -771,11 +771,11 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe( w13_ref = dequant_mxfp4_batches( w13_q.view(torch.uint8), w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( - num_experts, 2 * intermediate_size, hidden_size) + num_experts, 2 * intermediate_size, hidden_size).to(device) w2_ref = dequant_mxfp4_batches( w2_q.view(torch.uint8), w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape( - num_experts, hidden_size, intermediate_size) + num_experts, hidden_size, intermediate_size).to(device) # Quantize activations for SM100 path and dequantize for reference hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32) From 4dad72f0d903933429e96227894f98a2a6e30fa7 Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Sat, 13 Sep 2025 03:34:53 -0400 Subject: [PATCH 253/584] [Misc] Correct an outdated comment. (#24765) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- vllm/v1/engine/processor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index a54a98adf56b5..f3fad15b750ad 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -260,10 +260,10 @@ class Processor: else: # NOTE: engine_level_backend must be "auto" here, because we have # checked supported_backends above. - # "auto" is an opt-in to opinionated behavior where we try to - # choose a backend based on request contents. This is not the - # default as it is less predictable and subject to change - # between releases as feature support changes. + # In this mode, we set opinionated defaults based on what we think + # will satisfy the most use cases without having to worry about + # this setting. We include fallback behavior here, but not with any + # other setting where a specific backend was specified. 
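+            # (Concretely, the "auto" path below first tries xgrammar and
+            # only falls back to a different backend if the request's
+            # grammar is unsupported.)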
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"

From 41ae4a1eab22b332cd7d8f233f09611a5da53b1c Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Sat, 13 Sep 2025 09:43:33 +0200
Subject: [PATCH 254/584] [Doc]: fix typos in various files (#24798)

Signed-off-by: Didier Durand <durand.didier@gmail.com>
---
 examples/offline_inference/tpu.py                      | 2 +-
 vllm/core/block/naive_block.py                         | 2 +-
 vllm/core/evictor.py                                   | 2 +-
 vllm/engine/metrics.py                                 | 4 ++--
 vllm/model_executor/layers/linear.py                   | 4 ++--
 vllm/model_executor/layers/vocab_parallel_embedding.py | 2 +-
 vllm/model_executor/models/ernie45_vl.py               | 2 +-
 vllm/model_executor/models/gemma3n_mm.py               | 4 ++--
 vllm/model_executor/models/nemotron_vl.py              | 2 +-
 vllm/model_executor/models/phi4mm.py                   | 2 +-
 10 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index 9776f4fe322b9..0093b63b0b1f3 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -42,7 +42,7 @@ def main():
     llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
 
     # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-    # In real workloads, `enforace_eager` should be `False`.
+    # In real workloads, `enforce_eager` should be `False`.
     llm = LLM(**llm_args)
     outputs = llm.generate(prompts, sampling_params)
     print("-" * 50)
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index 7d9b32cd4b674..ae876d131eb66 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -182,7 +182,7 @@ class NaiveBlockAllocator(BlockAllocator):
             # Increment refcount for each block.
             assert block.block_id is not None
             refcount = self._refcounter.incr(block.block_id)
-            assert refcount != 1, "can't fork free'd block"
+            assert refcount != 1, "can't fork freed block"
 
             forked_block = self._block_pool.init_block(
                 prev_block=prev_block,
diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py
index 7a4a836ee348e..85ff6bc9ca610 100644
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -58,7 +58,7 @@ class Evictor(ABC):
 
 class BlockMetaData:
     """Data structure for storing key data describe cached block, so that
-    evitor could use to make its decision which one to choose for eviction
+    evictor could use to make its decision which one to choose for eviction
 
     Here we use physical block id as the dict key, as there maybe several
     blocks with the same content hash, but their physical id is unique.
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 0a8709db40880..2762175c430fb 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -379,7 +379,7 @@ class LoggingStatLogger(StatLoggerBase):
         if local_interval_elapsed(stats.now, self.last_local_log,
                                   self.local_interval):
             # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
+            # to prometheus if applicable).
prompt_throughput = get_throughput(self.num_prompt_tokens, now=stats.now, last_log=self.last_local_log) @@ -432,7 +432,7 @@ class LoggingStatLogger(StatLoggerBase): class PrometheusStatLogger(StatLoggerBase): - """PrometheusStatLogger is used LLMEngine to log to Promethus.""" + """PrometheusStatLogger is used LLMEngine to log to Prometheus.""" _metrics_cls = Metrics _gauge_cls = prometheus_client.Gauge diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index fd88eac55cb51..773dfeae25d93 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -740,7 +740,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): """ Handle special case for models where MLP layers are already fused on disk. In this case, we have no shard id. This function - determmines the shard id by splitting these layers and then calls + determines the shard id by splitting these layers and then calls the weight loader using the shard id. An example of a model with these fused layers: @@ -914,7 +914,7 @@ class QKVParallelLinear(ColumnParallelLinear): """ Handle special case for models where QKV layers are already fused on disk. In this case, we have no shard id. This function - determmines the shard id by splitting these layers and then calls + determines the shard id by splitting these layers and then calls the weight loader using the shard id. An example of a model with these fused layers: diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index c915ebac91c59..aa64d4e09ae18 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -258,7 +258,7 @@ class VocabParallelEmbedding(CustomOp): if params_dtype is None: params_dtype = torch.get_default_dtype() - # Divide the weight matrix along the vocaburaly dimension. + # Divide the weight matrix along the vocabulary dimension. self.num_added_embeddings = self.num_embeddings - self.org_vocab_size self.num_embeddings_per_partition = divide(self.num_embeddings_padded, self.tp_size) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index f49dd36b0ab94..3396c67f42b7b 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1446,7 +1446,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 3074451e40a4d..663d4da7cec23 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -586,10 +586,10 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, # ruff: noqa # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the - # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # text to account for this. 
However, the audio preprocessing and encoder do not guarantee they will # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad - # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. + # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab. # TODO precompute and cache padding audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index a9c7d8044e10c..acda2027401d9 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -560,7 +560,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, return [] # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image). + # tensor corresponding to a multimodal data item (image). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 352ae4064cc61..46963828186cc 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1154,7 +1154,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary From cfa3234a5b8a6ec4f65ceb080bdc232eaee9b080 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Sat, 13 Sep 2025 00:45:11 -0700 Subject: [PATCH 255/584] [CI][Spec Decode] Adjust threshold for flaky ngram spec decoding test again (#24771) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> --- tests/v1/e2e/test_spec_decode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 0b240b7d434e5..bf90f50b10828 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -117,9 +117,9 @@ def test_ngram_correctness( print(f"ref_output: {ref_output.outputs[0].text}") print(f"spec_output: {spec_output.outputs[0].text}") - # Heuristic: expect at least 68% of the prompts to match exactly + # Heuristic: expect at least 66% of the prompts to match exactly # Upon failure, inspect the outputs to check for inaccuracy. 
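+    # (The match rate is inherently noisy across runs, which is why this
+    # heuristic threshold gets nudged rather than pinned to a fixed value.)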
- assert matches >= int(0.68 * len(ref_outputs)) + assert matches >= int(0.66 * len(ref_outputs)) del spec_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() From 15b8fef453b373b84406207a947005a4d9d68acc Mon Sep 17 00:00:00 2001 From: TaoYu Chen <ctynb@qq.com> Date: Sat, 13 Sep 2025 16:11:59 +0800 Subject: [PATCH 256/584] Remove redundant assignment in xfer_buffers, This is a little fix (#24732) Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com> --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 17f5be76ce400..c306eeb5aa7ab 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -708,8 +708,6 @@ class NixlConnectorWorker: caches_data = [] # With hybrid allocator, layers can share a kv cache tensor seen_base_addresses = [] - xfer_buffers = (self.host_xfer_buffers - if self.use_host_buffer else kv_caches) # Note(tms): I modified this from the original region setup code. # K and V are now in different regions. Advantage is that we can From 973c9d01da863cac9c51e8a5c0d390fc84b84fbc Mon Sep 17 00:00:00 2001 From: Victor Ziliang Peng <ziliangdotme@gmail.com> Date: Sat, 13 Sep 2025 11:28:38 -0700 Subject: [PATCH 257/584] [Minor] Simplify duplicative device check for cuda (#24793) Signed-off-by: Ziliang Peng <ziliangdotme@gmail.com> --- vllm/platforms/cuda.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 42fff2f07a0a4..8e3436a9e73c5 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -64,8 +64,7 @@ class CudaPlatformBase(Platform): if self.has_device_capability(80): # Ampere and Hopper or later NVIDIA GPUs. 
            return [torch.bfloat16, torch.float16, torch.float32]
-        elif (not self.has_device_capability(80)
-              ) and self.has_device_capability(60):
+        if self.has_device_capability(60):
             # Pascal, Volta and Turing NVIDIA GPUs, BF16 is not supported
             return [torch.float16, torch.float32]
         # Kepler and Maxwell NVIDIA GPUs, only FP32 is supported,

From 3e903b6cb4292ca1425a37cb809c1e3cddfdadcb Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 13 Sep 2025 17:41:36 -0700
Subject: [PATCH 258/584] [Chore] Minor simplification for non-PP path (#24810)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
---
 vllm/v1/worker/gpu_model_runner.py | 64 ++++++++++++++++++------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6572e421b65b9..d4afaf51e6e86 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -86,7 +86,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.kv_connector_model_runner_mixin import (
-    KVConnectorModelRunnerMixin, KVConnectorOutput)
+    KVConnectorModelRunnerMixin)
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
 from .utils import (AttentionGroup, MultiModalBudget,
@@ -196,6 +196,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
 
+        # Broadcast PP output for external_launcher (torchrun)
+        # to make sure we are synced across pp ranks
+        # TODO: Support overlapping micro-batches
+        # https://github.com/vllm-project/vllm/issues/18019
+        self.broadcast_pp_output = (
+            self.parallel_config.distributed_executor_backend
+            == "external_launcher" and len(get_pp_group().ranks) > 0)
+
         # Model-related.
         self.num_query_heads = model_config.get_num_attention_heads(
             parallel_config)
@@ -1701,7 +1709,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         hidden_states: torch.Tensor,
         num_scheduled_tokens: int,
         num_scheduled_tokens_np: np.ndarray,
-        kv_connector_output: Optional[KVConnectorOutput],
     ) -> ModelRunnerOutput:
         assert self.input_batch.num_reqs ==\
            len(self.input_batch.pooling_params), \
@@ -1732,7 +1739,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=pooler_output,
-            kv_connector_output=kv_connector_output,
         )
 
     def _preprocess(
@@ -2073,39 +2079,47 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
+                # True when EAGLE 3 is used.
                 hidden_states, aux_hidden_states = model_output
             else:
+                # Common case.
                 hidden_states = model_output
                 aux_hidden_states = None
 
-        # Broadcast PP output for external_launcher (torchrun)
-        # to make sure we are synced across pp ranks
-        # TODO: Support overlapping mirco-batches
-        # https://github.com/vllm-project/vllm/issues/18019
-        broadcast_pp_output = \
-            self.parallel_config.distributed_executor_backend \
-            == "external_launcher" and len(get_pp_group().ranks) > 0
-        if not get_pp_group().is_last_rank:
-            # For mid-pipeline stages, return the hidden states.
- assert isinstance(hidden_states, IntermediateTensors) - if not broadcast_pp_output: + if not self.broadcast_pp_output: + # Common case. + if not get_pp_group().is_last_rank: + # Return the intermediate tensors. + assert isinstance(hidden_states, IntermediateTensors) hidden_states.kv_connector_output = kv_connector_output return hidden_states - get_pp_group().send_tensor_dict( - hidden_states.tensors, all_gather_group=get_tp_group()) - logits = None - else: + if self.is_pooling_model: - return self._pool(hidden_states, num_scheduled_tokens, - num_scheduled_tokens_np, - kv_connector_output) + # Return the pooling output. + output = self._pool(hidden_states, num_scheduled_tokens, + num_scheduled_tokens_np) + output.kv_connector_output = kv_connector_output + return output sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) - if broadcast_pp_output: - model_output_broadcast_data = { - "logits": logits.contiguous(), - } if logits is not None else {} + else: + # Rare case. + assert not self.is_pooling_model + + if not get_pp_group().is_last_rank: + get_pp_group().send_tensor_dict( + hidden_states.tensors, all_gather_group=get_tp_group()) + logits = None + else: + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states, + None) + + model_output_broadcast_data = {} + if logits is not None: + model_output_broadcast_data["logits"] = logits.contiguous() + model_output_broadcast_data = get_pp_group( ).broadcast_tensor_dict(model_output_broadcast_data, src=len(get_pp_group().ranks) - 1) From cc3173ae982ce4e589b24b70263cc25c3035d172 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Sun, 14 Sep 2025 01:10:21 -0700 Subject: [PATCH 259/584] [Multi Modal][Performance] Fused Q,K's apply_rope into one (#24511) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/qwen2_5_vl.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index fc028aa2287a2..dbf486374bcf3 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -363,8 +363,10 @@ class Qwen2_5_VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: if self.attn_backend == _Backend.ROCM_AITER_FA: @@ -388,8 +390,8 @@ class Qwen2_5_VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. 
outputs = [] @@ -408,6 +410,8 @@ class Qwen2_5_VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -418,8 +422,8 @@ class Qwen2_5_VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output From fec347dee130a79d8a56390ffcb2dde2e480f6ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Sun, 14 Sep 2025 21:11:14 +0900 Subject: [PATCH 260/584] [Misc] Improve `s3_utils` type hints with `BaseClient` (#24825) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- vllm/transformers_utils/s3_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index d17c1afe9b504..b848898ff6dad 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,10 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Any, Optional +from typing import TYPE_CHECKING, Optional from vllm.utils import PlaceholderModule +if TYPE_CHECKING: + from botocore.client import BaseClient + try: import boto3 except ImportError: @@ -26,7 +29,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: ] -def glob(s3: Optional[Any] = None, +def glob(s3: Optional["BaseClient"] = None, path: str = "", allow_pattern: Optional[list[str]] = None) -> list[str]: """ @@ -51,7 +54,7 @@ def glob(s3: Optional[Any] = None, def list_files( - s3: Any, + s3: "BaseClient", path: str, allow_pattern: Optional[list[str]] = None, ignore_pattern: Optional[list[str]] = None From fc2dbcda8b717e6aac0794cb9b4cc86b78c36504 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sun, 14 Sep 2025 11:20:17 -0400 Subject: [PATCH 261/584] [Perf] Fix DeepGEMM Contiguous Layout Issue, 5.5% Throughput Improvement (#24783) Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- vllm/model_executor/layers/quantization/fp8.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3d94626e5d8c6..49ff87df93c31 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -772,10 +772,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ - get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = \ - get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv) # If checkpoint 
is fp16, quantize in place. elif not self.quant_config.is_checkpoint_fp8_serialized: @@ -923,10 +923,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): # Ensure column-major TMA alignment expected by DeepGEMM. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w13_weight_scale_inv).contiguous() + layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w2_weight_scale_inv).contiguous() + layer.w2_weight_scale_inv) def select_gemm_impl( self, From 1177dd53e95f2e65f253b1ac613ecf4768327e49 Mon Sep 17 00:00:00 2001 From: co63oc <co63oc@users.noreply.github.com> Date: Mon, 15 Sep 2025 00:17:16 +0800 Subject: [PATCH 262/584] fix type of sampling rate for encode_base64 (#24826) Signed-off-by: co63oc <co63oc@users.noreply.github.com> --- vllm/multimodal/audio.py | 2 +- vllm/multimodal/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index f3b273eb41e8f..d7e9d402a1f97 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -106,7 +106,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: return librosa.load(filepath, sr=None) - def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: + def encode_base64(self, media: tuple[npt.NDArray, int]) -> str: audio, sr = media with BytesIO() as buffer: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index e09c97de576ef..b308366fca282 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -310,7 +310,7 @@ class MediaConnector: def encode_audio_base64( audio: np.ndarray, - sampling_rate: float, + sampling_rate: int, ) -> str: """Encode audio as base64.""" audio_io = AudioMediaIO() From ff68035932431c1892563f823735ee350336036f Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" <yeq@meta.com> Date: Sun, 14 Sep 2025 10:50:01 -0700 Subject: [PATCH 263/584] [Benchmarks] Throw usage error when using dataset-name random and dataset-path together (#24819) Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> --- vllm/benchmarks/datasets.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index bf9e87198bcf1..0a297479bcc00 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -11,6 +11,7 @@ generation. Supported dataset types include: - HuggingFace - VisionArena """ +import argparse import ast import base64 import io @@ -1019,6 +1020,25 @@ class ShareGPTDataset(BenchmarkDataset): return samples +class _ValidateDatasetArgs(argparse.Action): + """Argparse action to validate dataset name and path compatibility.""" + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + # Get current values of both dataset_name and dataset_path + dataset_name = getattr(namespace, 'dataset_name', 'random') + dataset_path = getattr(namespace, 'dataset_path', None) + + # Validate the combination + if dataset_name == "random" and dataset_path is not None: + parser.error( + "Cannot use 'random' dataset with --dataset-path. 
" + "Please specify the appropriate --dataset-name (e.g., " + "'sharegpt', 'custom', 'sonnet') for your dataset file: " + f"{dataset_path}" + ) + + def add_dataset_parser(parser: FlexibleArgumentParser): parser.add_argument("--seed", type=int, default=0) parser.add_argument( @@ -1031,6 +1051,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-name", type=str, default="random", + action=_ValidateDatasetArgs, choices=[ "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", "custom", "prefix_repetition", "spec_bench" @@ -1046,6 +1067,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-path", type=str, default=None, + action=_ValidateDatasetArgs, help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.", ) From 79cbcab8717d737945fc458f21fb1f98711ed50e Mon Sep 17 00:00:00 2001 From: FengjinChen <19762931+chenfengjin@users.noreply.github.com> Date: Mon, 15 Sep 2025 03:30:10 +0800 Subject: [PATCH 264/584] Force use C++17 globally to avoid compilation error (#24823) Signed-off-by: chenfengjin <1871653365@qq.com> --- CMakeLists.txt | 4 ++++ cmake/utils.cmake | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f1f9a781a07a..8df349ce14fda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26) # cmake --install . --component _C project(vllm_extensions LANGUAGES CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + + # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9c0ed1d09572e..8558976e2c392 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") endif() - set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${GPU_MOD_NAME} PRIVATE $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>) From 6dc8da5dc1e772a7318d0725dcd5bc99b3e9803f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 14 Sep 2025 15:41:53 -0400 Subject: [PATCH 265/584] [Chore] Remove ipex_ops warning (#24835) Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- vllm/_ipex_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 59b0aed321502..9d2eda482fcf8 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -13,7 +13,7 @@ logger = init_logger(__name__) try: import intel_extension_for_pytorch as ipex except ImportError as e: - logger.warning("Import error msg: %s", e.msg) + logger.debug("Import error msg: %s", e.msg) class ipex_ops: From 90f3f7d73e09aff12dc96767223fa54f48f117a8 Mon Sep 17 00:00:00 2001 From: wuhang <wuhang6@huawei.com> Date: Mon, 15 Sep 2025 05:11:09 +0800 Subject: [PATCH 266/584] [Spec Decoding]Support Spec Decoding Metrics in DP Mode (#24049) Signed-off-by: wuhang <wuhang6@huawei.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- vllm/v1/metrics/loggers.py | 16 +++----- vllm/v1/spec_decode/metrics.py | 74 +++++++++++++++++++++------------- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/vllm/v1/metrics/loggers.py 
b/vllm/v1/metrics/loggers.py index 347185d8341ee..3f4699281ccea 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -169,15 +169,11 @@ class PrometheusStatLogger(StatLoggerBase): model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - if (len(self.engine_indexes) > 1 - and vllm_config.speculative_config is not None): - raise NotImplementedError("Prometheus metrics with Spec Decoding " - "with >1 EngineCore per AsyncLLM is not " - "supported yet.") - spec_decode_labelvalues = [ - vllm_config.model_config.served_model_name, - str(self.engine_indexes[0]) - ] + spec_decode_labelvalues: dict[int, list[str]] = { + idx: [model_name, str(idx)] + for idx in engine_indexes + } + self.spec_decoding_prom = self._spec_decoding_cls( vllm_config.speculative_config, labelnames, spec_decode_labelvalues) @@ -530,7 +526,7 @@ class PrometheusStatLogger(StatLoggerBase): if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_prom.observe( - scheduler_stats.spec_decoding_stats) + scheduler_stats.spec_decoding_stats, engine_idx) if iteration_stats is None: return diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 2aa8962f5739c..282e6f65e7abe 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -140,27 +140,32 @@ class SpecDecodingProm: self, speculative_config: Optional[SpeculativeConfig], labelnames: list[str], - labelvalues: list[str], + per_engine_labelvalues: dict[int, list[str]], ): self.spec_decoding_enabled = speculative_config is not None if not self.spec_decoding_enabled: return - self.counter_spec_decode_num_drafts = \ - self._counter_cls( - name="vllm:spec_decode_num_drafts", - documentation="Number of spec decoding drafts.", - labelnames=labelnames).labels(*labelvalues) - self.counter_spec_decode_num_draft_tokens = \ - self._counter_cls( - name="vllm:spec_decode_num_draft_tokens", - documentation="Number of draft tokens.", - labelnames=labelnames,).labels(*labelvalues) - self.counter_spec_decode_num_accepted_tokens = \ - self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens", - documentation="Number of accepted tokens.", - labelnames=labelnames).labels(*labelvalues) + counter_drafts = self._counter_cls( + name="vllm:spec_decode_num_drafts", + documentation="Number of spec decoding drafts.", + labelnames=labelnames) + self.counter_spec_decode_num_drafts = make_per_engine( + counter_drafts, per_engine_labelvalues) + + counter_draft_tokens = self._counter_cls( + name="vllm:spec_decode_num_draft_tokens", + documentation="Number of draft tokens.", + labelnames=labelnames) + self.counter_spec_decode_num_draft_tokens = make_per_engine( + counter_draft_tokens, per_engine_labelvalues) + + counter_accepted_tokens = self._counter_cls( + name="vllm:spec_decode_num_accepted_tokens", + documentation="Number of accepted tokens.", + labelnames=labelnames) + self.counter_spec_decode_num_accepted_tokens = make_per_engine( + counter_accepted_tokens, per_engine_labelvalues) assert speculative_config is not None num_spec_tokens = (speculative_config.num_speculative_tokens @@ -171,21 +176,36 @@ class SpecDecodingProm: documentation="Accepted tokens per draft position.", labelnames=pos_labelnames, ) - self.counter_spec_decode_num_accepted_tokens_per_pos: list[ - prometheus_client.Counter] = [] - for pos in range(num_spec_tokens): - pos_labelvalues = labelvalues + [str(pos)] - self.counter_spec_decode_num_accepted_tokens_per_pos.append( - 
base_counter.labels(*pos_labelvalues)) + self.counter_spec_decode_num_accepted_tokens_per_pos: dict[ + int, list[prometheus_client.Counter]] = { + idx: [ + base_counter.labels(*lv, str(pos)) + for pos in range(num_spec_tokens) + ] + for idx, lv in per_engine_labelvalues.items() + } - def observe(self, spec_decoding_stats: SpecDecodingStats): + def observe(self, + spec_decoding_stats: SpecDecodingStats, + engine_idx: int = 0): if not self.spec_decoding_enabled: return - self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts) - self.counter_spec_decode_num_draft_tokens.inc( + self.counter_spec_decode_num_drafts[engine_idx].inc( + spec_decoding_stats.num_drafts) + self.counter_spec_decode_num_draft_tokens[engine_idx].inc( spec_decoding_stats.num_draft_tokens) - self.counter_spec_decode_num_accepted_tokens.inc( + self.counter_spec_decode_num_accepted_tokens[engine_idx].inc( spec_decoding_stats.num_accepted_tokens) for pos, counter in enumerate( - self.counter_spec_decode_num_accepted_tokens_per_pos): + self. + counter_spec_decode_num_accepted_tokens_per_pos[engine_idx]): counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos]) + + +def make_per_engine(counter: prometheus_client.Counter, + per_engine_labelvalues: dict[int, list[str]]): + """Create a counter for each label value.""" + return { + idx: counter.labels(*labelvalues) + for idx, labelvalues in per_engine_labelvalues.items() + } From 8e5cdcda4e5a55ff49d92d37139042dda44b6b3c Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Sun, 14 Sep 2025 15:55:17 -0700 Subject: [PATCH 267/584] [Hybrid Allocator] Support Pipeline Parallel (#23974) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- tests/distributed/test_pipeline_parallel.py | 4 +- tests/models/test_initialization.py | 10 +- tests/v1/core/test_kv_cache_utils.py | 404 ++++++++++++++----- tests/v1/tpu/worker/test_tpu_model_runner.py | 10 +- tests/v1/worker/test_gpu_model_runner.py | 14 +- vllm/v1/core/kv_cache_utils.py | 247 +++++++----- vllm/v1/engine/core.py | 18 +- 7 files changed, 472 insertions(+), 235 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index fffab1a984c26..08702e8c061fa 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -215,9 +215,7 @@ TEXT_GENERATION_MODELS = { EMBEDDING_MODELS = { # type: ignore[var-annotated] # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"), - # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883 - # is fixed - #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), + "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast( load_format="dummy", runner="pooling" ), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index c22d94948d249..0e18c45a21eeb 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -10,7 +10,7 @@ from vllm import LLM from vllm.config import ModelImpl from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes -from vllm.v1.core.kv_cache_utils import get_kv_cache_config +from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.engine.core import EngineCore as V1EngineCore from ..utils import create_new_process_for_each_test @@ -68,11 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: 
pytest.MonkeyPatch, def _initialize_kv_caches_v1(self, vllm_config): kv_cache_specs = self.model_executor.get_kv_cache_specs() - scheduler_kv_cache_config = get_kv_cache_config( + scheduler_kv_cache_config = get_kv_cache_configs( vllm_config, - kv_cache_specs[0], - 10 * GiB_bytes, - ) + kv_cache_specs, + [10 * GiB_bytes], + )[0] # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 44e479098ad5d..2b44b16fd63b8 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,13 +18,12 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_config, get_max_concurrency_for_kv_cache_config, + get_kv_cache_configs, get_max_concurrency_for_kv_cache_config, get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id, - unify_kv_cache_configs) + is_kv_cache_type_uniform, make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor, - SlidingWindowSpec) + KVCacheGroupSpec, KVCacheSpec, + KVCacheTensor, SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -531,102 +530,288 @@ def test_metrics(): assert not metrics.query_queue -def test_unify_kv_cache_configs(): - same_kv_cache_config = [ - KVCacheConfig( - num_blocks=10, - kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), - ], - kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - ], - ), - KVCacheConfig( - num_blocks=20, - kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), - ], - kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - ], - ), - ] - unify_kv_cache_configs(same_kv_cache_config) - assert same_kv_cache_config[0].num_blocks == 10 - assert same_kv_cache_config[1].num_blocks == 10 +def test_get_kv_cache_configs_multiple_workers(): + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config) - need_sort_kv_cache_config = [ + ref_kv_cache_spec = new_kv_cache_spec() + same_kv_cache_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }] + + # Basic case. All things are the same. 
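+    # Both workers report the same two layers and the same memory budget
+    # (room for 10 blocks across 2 layers each), so both should receive
+    # identical configs with a single KV cache group spanning both layers.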
+ kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10 + ]) + assert kv_cache_configs == [ KVCacheConfig( num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), KVCacheConfig( - num_blocks=20, + num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), ] - unify_kv_cache_configs(need_sort_kv_cache_config) - sorted_kv_cache_groups = [ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)), - ] - assert ( - need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups) - assert ( - need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups) - - diff_kv_cache_config = [ + # Different available memory. This is the case for TP. + # Use the smallest memory available. + kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 20 + ]) + assert kv_cache_configs == [ KVCacheConfig( num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), KVCacheConfig( - num_blocks=20, + num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=8)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), ] + + # Different KV cache specs. This is the case for PP. + different_layer_specs = [{ + "layer1": new_kv_cache_spec(), + }, { + "layer2": new_kv_cache_spec(), + "layer3": new_kv_cache_spec(), + }] + + # Different workers have different layers. 
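+    # Worker 0 hosts only layer1 while worker 1 hosts layer2/layer3, so
+    # worker 0 packs its whole budget into a single tensor; num_blocks is
+    # still clamped to the smallest value across workers.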
+ kv_cache_configs = get_kv_cache_configs( + vllm_config, different_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10 + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer1"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer2", "layer3"], new_kv_cache_spec()), + ], + ), + ] + + # Some layers are the same, some are different. This is the case for TP+PP + tp_pp_kv_cache_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer3": new_kv_cache_spec(), + }, { + "layer3": new_kv_cache_spec(), + }] + + kv_cache_configs = get_kv_cache_configs( + vllm_config, tp_pp_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer3"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer3"], ref_kv_cache_spec), + ], + ), + ] + + # Different workers have different types of layers. This is the case for + # hybrid models + PP. 
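+    # Each worker must expose every global group so the scheduler sees a
+    # consistent layout; the group whose attention type a worker does not
+    # host is kept with an empty layer list.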
+ different_type_layer_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer3": new_sliding_window_spec(), + "layer4": new_sliding_window_spec(), + }] + kv_cache_configs = get_kv_cache_configs( + vllm_config, different_type_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + KVCacheGroupSpec([], new_sliding_window_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer3"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer4"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec([], ref_kv_cache_spec), + KVCacheGroupSpec(["layer3", "layer4"], + new_sliding_window_spec()), + ], + ), + ] + + # When divided into multiple KVCacheGroups, need to ensure the number of + # layers per group is similar. + different_type_layer_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_sliding_window_spec(), + "layer3": new_sliding_window_spec(), + }, { + "layer4": new_kv_cache_spec(), + "layer5": new_sliding_window_spec(), + "layer6": new_sliding_window_spec(), + }] + kv_cache_configs = get_kv_cache_configs( + vllm_config, different_type_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 10, + ref_kv_cache_spec.page_size_bytes * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1", "layer2", "layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], ref_kv_cache_spec), + KVCacheGroupSpec(["layer2"], new_sliding_window_spec()), + KVCacheGroupSpec(["layer3"], new_sliding_window_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer4", "layer5", "layer6"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer4"], ref_kv_cache_spec), + KVCacheGroupSpec(["layer5"], new_sliding_window_spec()), + KVCacheGroupSpec(["layer6"], new_sliding_window_spec()), + ], + ), + ] + + # Have conflicting layers. Need to raise an error. 
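+    # A layer name must resolve to the same spec on every worker; a
+    # full-attention layer1 on one worker and a sliding-window layer1 on
+    # another cannot be merged and must fail fast.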
+ conflicting_layer_specs = [{ + "layer1": new_kv_cache_spec(), + }, { + "layer1": new_sliding_window_spec(), + }] with pytest.raises(AssertionError): - unify_kv_cache_configs(diff_kv_cache_config) + get_kv_cache_configs(vllm_config, conflicting_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) def test_merge_kv_cache_spec(): @@ -890,7 +1075,7 @@ def test_allocate_with_lookahead(): assert len(blocks.get_block_ids()[0]) == 2 -def test_get_kv_cache_config(): +def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) vllm_config = VllmConfig(model_config=model_config) @@ -901,8 +1086,10 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_kv_cache_spec(), } - kv_cache_config_full = get_kv_cache_config( - vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32) + kv_cache_config_full = get_kv_cache_configs( + vllm_config, [kv_cache_specs_full], + [mem_per_block_per_layer * 2 * 32])[0] + print(kv_cache_config_full) assert kv_cache_config_full == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -920,8 +1107,9 @@ def test_get_kv_cache_config(): 'layer_1': new_sliding_window_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_sliding = get_kv_cache_config( - vllm_config, kv_cache_specs_sliding, mem_per_block_per_layer * 2 * 32) + kv_cache_config_sliding = get_kv_cache_configs( + vllm_config, [kv_cache_specs_sliding], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_sliding == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -940,8 +1128,9 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -962,8 +1151,9 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=64, kv_cache_tensors=[ @@ -985,21 +1175,22 @@ def test_get_kv_cache_config(): 'layer_5': new_sliding_window_spec(), 'layer_6': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_1", "layer_3", "layer_5"]), + shared_by=["layer_1", "layer_3", "layer_4"]), KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_2", "layer_4", "layer_6"]), + shared_by=["layer_2", "layer_5", "layer_6"]), ], kv_cache_groups=[ KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer_3", "layer_4"], + KVCacheGroupSpec(["layer_3", "layer_5"], new_sliding_window_spec()), - 
KVCacheGroupSpec(["layer_5", "layer_6"], + KVCacheGroupSpec(["layer_4", "layer_6"], new_sliding_window_spec()), ], ) @@ -1017,27 +1208,30 @@ def test_get_kv_cache_config(): 'layer_9': new_sliding_window_spec(), 'layer_10': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 3 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 3 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ KVCacheTensor( size=mem_per_block_per_layer * 32, - shared_by=["layer_1", "layer_4", "layer_7", "layer_10"]), + shared_by=["layer_1", "layer_4", "layer_5", "layer_6"]), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_2", "layer_7", "layer_8", "layer_9"]), KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_2", "layer_5", "layer_8"]), - KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_3", "layer_6", "layer_9"]), + shared_by=["layer_3", "layer_10"]), ], kv_cache_groups=[ KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer_4", "layer_5", "layer_6"], + KVCacheGroupSpec(["layer_4", "layer_7", "layer_10"], new_sliding_window_spec()), - KVCacheGroupSpec(["layer_7", "layer_8", "layer_9"], + KVCacheGroupSpec(["layer_5", "layer_8"], + new_sliding_window_spec()), + KVCacheGroupSpec(["layer_6", "layer_9"], new_sliding_window_spec()), - KVCacheGroupSpec(["layer_10"], new_sliding_window_spec()), ], ) @@ -1047,13 +1241,14 @@ def test_get_kv_cache_config(): 'layer_2': new_kv_cache_spec(), } with pytest.raises(NotImplementedError): - get_kv_cache_config(vllm_config, kv_cache_specs_hybrid, - mem_per_block_per_layer * 2 * 32) + get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] # Test num_gpu_blocks_override vllm_config.cache_config.num_gpu_blocks_override = 16 - kv_cache_config_override_blocks = get_kv_cache_config( - vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32) + kv_cache_config_override_blocks = get_kv_cache_configs( + vllm_config, [kv_cache_specs_full], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_override_blocks == KVCacheConfig( num_blocks=16, kv_cache_tensors=[ @@ -1065,3 +1260,16 @@ def test_get_kv_cache_config(): kv_cache_groups=[ KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()) ]) + + +def test_get_kv_cache_configs_attention_free(): + kv_cache_specs: dict[str, KVCacheSpec] = {} + vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) + kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=1, + kv_cache_tensors=[], + kv_cache_groups=[], + ) + ] diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index c719e44acc9c2..bd9b6131c2222 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -10,7 +10,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, - get_kv_cache_config) + get_kv_cache_configs) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.worker.tpu_model_runner import ( @@ -477,8 +477,8 @@ def 
test_init_kv_cache_without_kv_sharing(): # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim) # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers) - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 2 assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 @@ -550,8 +550,8 @@ def test_init_kv_cache_with_kv_sharing_valid(): # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing num_expected_blocks = 2 * 20480 # 20GB / 512KB - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 5ebc00d573030..4ad8df1ce3868 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, update_environment_variables from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, - get_kv_cache_config) + get_kv_cache_configs) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -585,8 +585,8 @@ def test_init_kv_cache_without_kv_sharing(): available_memory = 20 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 2 assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 @@ -657,8 +657,8 @@ def test_init_kv_cache_with_kv_sharing_valid(): # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing num_expected_blocks = 655360 # 20GB / 32KB - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache @@ -788,8 +788,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): kv_cache_spec = runner.get_kv_cache_spec() available_memory = 5 * GiB_bytes - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] runner.initialize_kv_cache(kv_cache_config) # random partition of blocks diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index f939da8c5b5c3..533c0236dad7f 100644 --- 
a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -5,7 +5,7 @@ import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence -from dataclasses import astuple, dataclass +from dataclasses import dataclass from typing import Any, Callable, NewType, Optional, Union from vllm import envs @@ -811,59 +811,21 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, - kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int) -> KVCacheConfig: +def _get_kv_cache_groups_uniform_type( + kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ Generates the KV cache configuration for a model with one type of KV cache. Divide the available memory equally among all layers. Args: - vllm_config: The global VllmConfig - kv_cache_spec: The kv cache spec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. + kv_cache_specs: The kv cache spec of each attention layer in the model Returns: - The generated KVCacheConfig + The generated KVCacheGroupSpecs """ - page_size = get_uniform_page_size(kv_cache_spec) - num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec), - available_memory, page_size) - - per_layer_size = page_size * num_blocks - # All layers have the same KV cache spec, so we create one kv cache group - # for all layers. - grouped_layer_names = [list(kv_cache_spec.keys())] - - # Each layer uses a separate Tensor to store its KV cache. - kv_cache_tensors = [ - KVCacheTensor(size=per_layer_size, shared_by=[layer_name]) - for layer_name in kv_cache_spec - ] - - kv_cache_config = KVCacheConfig( - num_blocks=num_blocks, - kv_cache_tensors=kv_cache_tensors, - kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec, - grouped_layer_names), - ) - - num_tokens = num_blocks * vllm_config.cache_config.block_size - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size - logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size) - - num_tokens_str = f"{num_tokens:,}" - logger.info("GPU KV cache size: %s tokens", num_tokens_str) - max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" - max_concurrency = get_max_concurrency_for_kv_cache_config( - vllm_config, kv_cache_config) - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - max_model_len_str, max_concurrency) - return kv_cache_config + return create_kv_cache_group_specs(kv_cache_specs, + [list(kv_cache_specs.keys())]) def is_kv_cache_page_size_uniform( @@ -888,11 +850,10 @@ def is_kv_cache_type_attention_free( return not kv_cache_spec -def _get_kv_cache_config_uniform_page_size( - vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int) -> KVCacheConfig: +def _get_kv_cache_groups_uniform_page_size( + kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for hybrid models with multiple + Generates the KV cache groups for hybrid models with multiple attention types but still with a uniform page size (physical memory per block per layer) for all layers. @@ -949,11 +910,9 @@ def _get_kv_cache_config_uniform_page_size( memory per block is the same for all groups. 
Args: - vllm_config: The global VllmConfig kv_cache_spec: The KVCacheSpec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. Returns: - The generated KVCacheConfig + The generated KVCacheGroupSpecs """ # Group all layers by kv_cache_spec. # E.g., 2 full attention layers and 3 sliding window attention layers, @@ -966,7 +925,7 @@ def _get_kv_cache_config_uniform_page_size( # group identical. Add padding to the last group of each type if necessary. # E.g., (full.0, full.1), (sw.0, sw.1, sw.2) # split to 3 groups with 2 layers each: - # (full.0, full.1), (sw.0, sw.1), (sw.2, padding). + # (full.0, full.1), (sw.0, sw.2), (sw.1, padding). # FIXME(Chen): At the moment of writing this code (2025-06-02), all # open-source hybrid model follows a n:1 pattern between different attention # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and @@ -984,19 +943,60 @@ def _get_kv_cache_config_uniform_page_size( num_padding_layers, num_padding_layers / len(layers) * 100, ) - for i in range(0, len(layers), group_size): - grouped_layers.append(layers[i:i + group_size]) - kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec, - grouped_layers) + num_groups = cdiv(len(layers), group_size) + # In PP case, say if we have + # - stage 0: full.0, sw.0, sw.1 + # - stage 1: full.1, sw.2, sw.3 + # We should have 3 groups: (full.0, full.1), (sw.0, sw.2), (sw.1, sw.3) + # It can't be (full.0, full.1), (sw.0, sw.1), (sw.2, sw.3) because + # the 3 groups in stage 0 will be (full.0), (sw.0, sw.1), (empty group) + # and it will be padded to (full.0, padding), (sw.0, sw.1), + # (padding, padding) to ensure the number of layers in each group is + # the same and will cause memory waste. + # To avoid this, we assign layers[i::num_groups] to the i-th group + # instead of layers[i * group_size: (i + 1) * group_size] + for i in range(num_groups): + grouped_layers.append(layers[i::num_groups]) + return create_kv_cache_group_specs(kv_cache_spec, grouped_layers) + + +def get_kv_cache_config_from_groups(vllm_config: VllmConfig, + kv_cache_groups: list[KVCacheGroupSpec], + kv_cache_specs: dict[str, KVCacheSpec], + available_memory: int) -> KVCacheConfig: + """ + Generate the KV cache configuration from the KV cache groups and spec + of each layer. + + Args: + vllm_config: The global VllmConfig + kv_cache_groups: The KV cache groups + kv_cache_specs: The KV cache spec of each attention layer in the model + available_memory: Memory available for KV cache in bytes + Returns: + The generated KVCacheConfig + """ + if len(kv_cache_groups) == 0: + # Attention free models do not have KV cache. + # Return num_blocks=1 as BlockPool always needs a null_block. + return KVCacheConfig( + num_blocks=1, + kv_cache_tensors=[], + kv_cache_groups=kv_cache_groups, + ) # Determine how model runners should initialize the KV cache tensors. # We will have group_size memory pools, each is shared by one layer from # each group. As layers of different groups have different block table, # they will use different parts of the shared Tensor. 
- # The memory layout in the example will be: - # full.0, sw.0, sw.2: share a Tensor with size=available_memory//2 - # full.1, sw.1: share another Tensor with size=available_memory//2 - page_size = get_uniform_page_size(kv_cache_spec) + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) + + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" num_blocks = get_num_blocks(vllm_config, group_size, available_memory, page_size) per_memory_pool_size = page_size * num_blocks @@ -1004,8 +1004,8 @@ def _get_kv_cache_config_uniform_page_size( for i in range(group_size): shared_by = [] for j in range(len(kv_cache_groups)): - if i < len(grouped_layers[j]): - shared_by.append(grouped_layers[j][i]) + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) kv_cache_tensors.append( KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) @@ -1019,7 +1019,12 @@ def _get_kv_cache_config_uniform_page_size( [group.kv_cache_spec.block_size for group in kv_cache_groups]) # Print the KV cache size and maximum concurrency. - num_tokens = num_blocks // len(grouped_layers) * min_block_size + num_tokens = num_blocks // len(kv_cache_groups) * min_block_size + if vllm_config.parallel_config.decode_context_parallel_size > 1: + num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + logger.info( + "Multiplying the GPU KV cache size by the dcp_world_size %d.", + vllm_config.parallel_config.decode_context_parallel_size) num_tokens_str = f"{num_tokens:,}" logger.info("GPU KV cache size: %s tokens", num_tokens_str) max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" @@ -1030,10 +1035,6 @@ def _get_kv_cache_config_uniform_page_size( return kv_cache_config -def _get_kv_cache_config_attention_free() -> KVCacheConfig: - return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[]) - - def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): """ This function tries to convert the KV cache specs to one type if the model @@ -1087,72 +1088,112 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): "convert the KV cache specs to one unified type.") -def get_kv_cache_config( - vllm_config: VllmConfig, - kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int, -) -> KVCacheConfig: +def get_kv_cache_groups( + vllm_config: VllmConfig, + kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model. + Split the layers in the model into groups with the same KV cache spec. Args: vllm_config: The global VllmConfig kv_cache_spec: The kv cache spec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. Returns: - The generated KVCacheConfigs + The generated KVCacheGroups """ - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager: unify_hybrid_kv_cache_specs(kv_cache_spec) if is_kv_cache_type_attention_free(kv_cache_spec): - # This returns a kv_cache config with 0 kv_cache groups and 1 block - # to allow for the KVCache manager to handle attention free models. 
- return _get_kv_cache_config_attention_free() + # This returns an empty list to allow for the KVCacheManager to handle + # attention free models. + return [] elif is_kv_cache_type_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, - available_memory) + return _get_kv_cache_groups_uniform_type(kv_cache_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers # into groups with the same number of layers, and thus same total page # size. - return _get_kv_cache_config_uniform_page_size(vllm_config, - kv_cache_spec, - available_memory) + return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) raise NotImplementedError -def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]): +def get_kv_cache_configs(vllm_config: VllmConfig, + kv_cache_specs: list[dict[str, KVCacheSpec]], + available_memory: list[int]) -> list[KVCacheConfig]: """ - Make the KV cache configurations for each worker consistent, so that all - workers can be controlled by the same KVCacheManager. - This function verifies that the layer group of each worker are the same, - and changes the num_blocks of each worker to the smallest among all workers. + Generates the KV cache configurations for a model. + Since we use a shared centralized controller for all workers, we need the + `kv_cache_config` to be consistent across all workers to make sure + the KV cache allocation can be applied to all workers. However, different + workers may have different memory available, and different type of layers + (when pipeline parallel is enabled). To handle the difference between + workers, the current implementation is: + 1. Merge the KV cache specs of all workers to get the KVCacheSpecs for + the whole model. + 2. Generate the KV cache groups based on the layer ratio of the whole model. + 3. Generate the KV cache configs for each worker based on the KV cache + grouping strategy. (This is reasonable because the layer ratio of + different PP stages are similar.) + 4. Change the num_blocks of each worker to the smallest among all workers. Args: - kv_cache_configs: The KV cache configurations for each worker. Will be - in-place modified to make them consistent. + vllm_config: The global VllmConfig + kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker. + available_memory: Memory available for KV cache in bytes for each + worker. + + Returns: + The generated KVCacheConfigs for each worker. """ - # Sort the kv cache groups by their KV cache spec. - # This can avoid the inconsistency caused by the order of groups. - for kv_cache_config in kv_cache_configs: - kv_cache_config.kv_cache_groups.sort(key=lambda x: (type( - x.kv_cache_spec).__name__, astuple(x.kv_cache_spec))) + # Check if the available memory is enough for each worker. + for kv_cache_spec_one_worker, available_memory_one_worker in zip( + kv_cache_specs, available_memory): + check_enough_kv_cache_memory(vllm_config, kv_cache_spec_one_worker, + available_memory_one_worker) - # Verify that the groups of each rank are the same. 
- for kv_cache_config in kv_cache_configs[1:]: - for group_rank_0, group_rank_i in zip( - kv_cache_configs[0].kv_cache_groups, - kv_cache_config.kv_cache_groups): - assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec + # Merge the KV cache specs of all workers. Different PP stages may have + # different layer names, and different TP ranks of the same PP stage should + # have the same KV cache spec. + merged_kv_cache_specs: dict[str, KVCacheSpec] = {} + for kv_cache_spec_one_worker in kv_cache_specs: + for layer_name, layer_spec in kv_cache_spec_one_worker.items(): + if layer_name not in merged_kv_cache_specs: + merged_kv_cache_specs[layer_name] = layer_spec + else: + assert merged_kv_cache_specs[layer_name] == layer_spec, ( + "The KV cache specs for the same layer are different " + "across workers. This is not supported yet.") + global_kv_cache_groups = get_kv_cache_groups(vllm_config, + merged_kv_cache_specs) + + kv_cache_configs: list[KVCacheConfig] = [] + for kv_cache_spec_one_worker, available_memory_one_worker in zip( + kv_cache_specs, available_memory): + kv_cache_groups_one_worker: list[KVCacheGroupSpec] = [] + for group in global_kv_cache_groups: + group_layer_names_one_worker = [ + layer_name for layer_name in group.layer_names + if layer_name in kv_cache_spec_one_worker + ] + kv_cache_groups_one_worker.append( + KVCacheGroupSpec(group_layer_names_one_worker, + group.kv_cache_spec)) + assert sum( + len(group.layer_names) for group in + kv_cache_groups_one_worker) == len(kv_cache_spec_one_worker), ( + "Some layers are not assigned to any group.") + kv_cache_configs.append( + get_kv_cache_config_from_groups(vllm_config, + kv_cache_groups_one_worker, + kv_cache_spec_one_worker, + available_memory_one_worker)) # Change the num_blocks of each rank to the smallest among all ranks. We # do not need to shrink the tensor size because it is valid to only use the diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 995e70385be89..64a67f3b438e1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,10 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config, +from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, get_request_block_hasher, - init_none_hash, - unify_kv_cache_configs) + init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler @@ -191,18 +190,9 @@ class EngineCore: available_gpu_memory = [0] * len(kv_cache_specs) assert len(kv_cache_specs) == len(available_gpu_memory) - # Get the kv cache tensor size - kv_cache_configs = [ - get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, - available_gpu_memory_one_worker) - for kv_cache_spec_one_worker, available_gpu_memory_one_worker in - zip(kv_cache_specs, available_gpu_memory) - ] - # Since we use a shared centralized controller, we need the - # `kv_cache_config` to be consistent across all workers to make sure - # all the memory operators can be applied to all workers. 
-        unify_kv_cache_configs(kv_cache_configs)
+        kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
+                                                available_gpu_memory)

         # All workers have the same kv_cache_config except layer names, so use
         # an arbitrary one to initialize the scheduler.


From 78818dd1b0b4b8ae5e90a8259030684f23f589f1 Mon Sep 17 00:00:00 2001
From: Michael Yao <haifeng.yao@daocloud.io>
Date: Mon, 15 Sep 2025 12:50:36 +0800
Subject: [PATCH 268/584] [Docs] Have a try to improve frameworks/streamlit.md
 (#24841)

Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
---
 docs/deployment/frameworks/streamlit.md | 42 ++++++++++++-------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md
index af0f0690c68e2..c119878f137a4 100644
--- a/docs/deployment/frameworks/streamlit.md
+++ b/docs/deployment/frameworks/streamlit.md
@@ -6,35 +6,33 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu

 ## Prerequisites

-- Setup vLLM environment
+Set up the vLLM environment by installing all required packages:
+
+```bash
+pip install vllm streamlit openai
+```

 ## Deploy

-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with a supported chat completion model, e.g.

-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-0.5B-Chat
+    ```

-- Install streamlit and openai:
+1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>

-```bash
-pip install streamlit openai
-```
+1. Start the streamlit web UI and start to chat:

-- Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
-
-- Start the streamlit web UI and start to chat:
-
-```bash
-streamlit run streamlit_openai_chatbot_webserver.py
-
-# or specify the VLLM_API_BASE or VLLM_API_KEY
-VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+    ```bash
     streamlit run streamlit_openai_chatbot_webserver.py

-# start with debug mode to view more details
-streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
-```
+    # or specify the VLLM_API_BASE or VLLM_API_KEY
+    VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+    streamlit run streamlit_openai_chatbot_webserver.py

-![](../../assets/deployment/streamlit-chat.png)
+    # start with debug mode to view more details
+    streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
+    ```
+
+    ![Chat with vLLM assistant in Streamlit](../../assets/deployment/streamlit-chat.png)

From 3f3313981c0bb3e2366c56737b82f8a36aa3f91e Mon Sep 17 00:00:00 2001
From: Ning Xie <andy.xning@gmail.com>
Date: Mon, 15 Sep 2025 13:15:12 +0800
Subject: [PATCH 269/584] [kv cache] update num_free_blocks in the end (#24228)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
---
 tests/v1/core/test_kv_cache_utils.py | 12 ++++++++++++
 vllm/v1/core/kv_cache_utils.py       |  3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 2b44b16fd63b8..5d2517ce8c025 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -243,6 +243,18 @@ def test_free_kv_cache_block_queue_append_n():
     assert blocks[3].next_free_block is queue.fake_free_list_tail
     assert queue.fake_free_list_tail.prev_free_block is blocks[3]

+    # Create an empty FreeKVCacheBlockQueue
+    invalid_queue = FreeKVCacheBlockQueue([])
+    # set prev_free_block to None and this will cause an assertion
in append_n + invalid_queue.fake_free_list_tail.prev_free_block = None + with pytest.raises(AssertionError): + # Append 1 block + # fake_head->fake_tail + invalid_queue.append_n(blocks[0:1]) + assert invalid_queue.num_free_blocks == 0 + assert (invalid_queue.fake_free_list_head.next_free_block == + invalid_queue.fake_free_list_tail) + def test_free_kv_cache_block_queue_popleft_n(): blocks = [KVCacheBlock(block_id=i) for i in range(6)] diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 533c0236dad7f..c3e1865bc55b0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -370,7 +370,6 @@ class FreeKVCacheBlockQueue: """ if len(blocks) == 0: return - self.num_free_blocks += len(blocks) last_block = self.fake_free_list_tail.prev_free_block assert last_block is not None, ( @@ -385,6 +384,8 @@ class FreeKVCacheBlockQueue: last_block.next_free_block = self.fake_free_list_tail self.fake_free_list_tail.prev_free_block = last_block + self.num_free_blocks += len(blocks) + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. From f4a948f33f8766c843e0d59d9ec1ab44f7b2bfcf Mon Sep 17 00:00:00 2001 From: Ce Gao <cegao@tensorchord.ai> Date: Mon, 15 Sep 2025 14:04:55 +0800 Subject: [PATCH 270/584] [Frontend] Skip `stop` in reasoning content (#14550) Signed-off-by: Ce Gao <cegao@tensorchord.ai> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> --- tests/engine/test_stop_checker.py | 228 +++++++++++++++++++ vllm/engine/llm_engine.py | 17 +- vllm/engine/output_processor/stop_checker.py | 15 +- 3 files changed, 256 insertions(+), 4 deletions(-) create mode 100644 tests/engine/test_stop_checker.py diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py new file mode 100644 index 0000000000000..3d1e1c8032a48 --- /dev/null +++ b/tests/engine/test_stop_checker.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.reasoning import ReasoningParser +from vllm.sampling_params import SamplingParams +from vllm.sequence import Sequence, SequenceStatus + +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + +class MockReasoningParser(ReasoningParser): + """Mock reasoning parser for testing purposes.""" + + def __init__(self, + tokenizer: AutoTokenizer, + reasoning_active: bool = False): + super().__init__(tokenizer) + self.reasoning_active = reasoning_active + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return not self.reasoning_active + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + return input_ids + + +class MockSequence(Sequence): + """Mock sequence for testing purposes.""" + + def __init__(self, token_ids, output_text="test_output", eos_token_id=0): + self.token_ids = token_ids + self.output_text = output_text + self.eos_token_id = eos_token_id + self.status = SequenceStatus.RUNNING + self.stop_reason = None + + def get_token_ids(self): + return self.token_ids + + def get_last_token_id(self): + return self.token_ids[-1] if self.token_ids else None + + def get_len(self): + return len(self.token_ids) + + def get_output_len(self): + return len(self.token_ids) - 1 # Simulating prompt + outputs + + +@pytest.fixture +def deepseek_r1_qwen_tokenizer(): + return 
AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.fixture +def stop_checker(): + return StopChecker(max_model_len=10, + get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer) + + +@pytest.fixture +def stop_checker_with_reasoner(): + reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) + return StopChecker(max_model_len=10, + get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer, + reasoner=reasoner) + + +def test_eos_token_stopping(stop_checker): + """Test sequence stopping when EOS token is encountered.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams() + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + + +def test_ignore_eos(stop_checker): + """Test sequence continuing when EOS token is ignored.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams(ignore_eos=True) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.RUNNING + + +def test_min_tokens(stop_checker): + """Test min_tokens prevents early stopping.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams(min_tokens=3) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.RUNNING + + +def test_stop_token_ids(stop_checker): + """Test sequence stopping with custom stop token IDs.""" + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.stop_reason == 3 + + +def test_stop_strings(stop_checker): + """Test sequence stopping with stop strings.""" + seq = MockSequence(token_ids=[1, 2, 3], + output_text="test output with STOP", + eos_token_id=0) + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.stop_reason == "STOP" + assert "STOP" not in seq.output_text # Default behavior removes stop string + + +def test_include_stop_str_in_output(stop_checker): + """Test keeping stop strings in output.""" + seq = MockSequence(token_ids=[1, 2, 3], + output_text="test output with STOP", + eos_token_id=0) + sampling_params = SamplingParams(stop=["STOP"], + include_stop_str_in_output=True) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=5, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert "STOP" in seq.output_text + + +def test_max_tokens(stop_checker): + """Test sequence stopping at max_tokens.""" + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(max_tokens=2) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED + + +def test_max_model_len(stop_checker): + """Test sequence stopping at max_model_len.""" + seq = MockSequence(token_ids=list(range(11)), + eos_token_id=0) # 11 tokens, max is 10 + sampling_params = SamplingParams() + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == 
SequenceStatus.FINISHED_LENGTH_CAPPED + + +def test_reasoning_skip_stops(stop_checker_with_reasoner): + """Test that stop tokens and strings are ignored during reasoning.""" + # Set reasoning_active to True to simulate being in reasoning mode + stop_checker_with_reasoner.reasoner.reasoning_active = True + + # Test with stop token + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.RUNNING + + # Test with stop string + seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=4, sampling_params=sampling_params) + assert seq.status == SequenceStatus.RUNNING + + # But EOS token still stops the sequence + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams() + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED + + +def test_reasoning_end_enables_stops(stop_checker_with_reasoner): + """Test that stop tokens work after reasoning ends.""" + # Set reasoning_active to False to simulate being out of reasoning mode + stop_checker_with_reasoner.reasoner.reasoning_active = False + + # Test with stop token + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED + + # Test with stop string + seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=4, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c303d093f6324..f25530fc9dac8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,6 +40,7 @@ from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) +from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, Sequence, SequenceGroup, SequenceGroupBase, @@ -372,6 +373,14 @@ class LLMEngine: "vllm.llm_engine", self.observability_config.otlp_traces_endpoint) + # Initialize reasoning parser if reasoning backend is set. + if self.decoding_config.reasoning_backend and \ + self.tokenizer: + reasoner_class = ReasoningParserManager.get_reasoning_parser( + self.decoding_config.reasoning_backend) + self.reasoner: ReasoningParser = reasoner_class( + self.tokenizer.get_lora_tokenizer()) + # Create sequence output processor, e.g. for beam search or # speculative decoding. 
self.output_processor = ( @@ -381,8 +390,12 @@ class LLMEngine: self.scheduler, self.seq_counter, get_tokenizer_for_seq, - stop_checker=StopChecker(self.scheduler_config.max_model_len, - get_tokenizer_for_seq), + stop_checker=StopChecker( + self.scheduler_config.max_model_len, + get_tokenizer_for_seq, + self.reasoner if self.decoding_config.reasoning_backend + and self.tokenizer else None, + ), )) self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 3fb2f71b5e999..68a63044df05e 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -4,6 +4,7 @@ from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest +from vllm.reasoning import ReasoningParser from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -16,11 +17,16 @@ class StopChecker: emitted, or if we have exceeded the max model len. """ - def __init__(self, max_model_len: int, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer]): + def __init__( + self, + max_model_len: int, + get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], + reasoner: Optional[ReasoningParser] = None, + ): # Do not use it directly, but use `self._get_max_model_len`. self._max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.reasoner = reasoner def _get_max_model_len(self, lora_req: Optional[LoRARequest]): if lora_req and lora_req.long_lora_max_len: @@ -57,6 +63,11 @@ class StopChecker: seq.status = SequenceStatus.FINISHED_STOPPED return + # Skip stop string/token checks if in reasoning content generation + if self.reasoner is not None and \ + not self.reasoner.is_reasoning_end(seq.get_token_ids()): + return + # Check if a stop token was encountered. # This assumes a single token produced per step. 
last_token_id = seq.get_last_token_id() From a8c0f599734d169f7fdac69dc9a09539d508a751 Mon Sep 17 00:00:00 2001 From: bingchen-mi <chenbing8@xiaomi.com> Date: Mon, 15 Sep 2025 14:38:12 +0800 Subject: [PATCH 271/584] [Bugfix] MiDashengLM model contact error under concurrent testing (#24738) Signed-off-by: chenbing8 <chenbing8@xiaomi.com> Signed-off-by: bingchen-mi <chenbing8@xiaomi.com> --- vllm/model_executor/models/midashenglm.py | 28 ++++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index e314ae357ecd4..140800dd41c76 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -497,8 +497,11 @@ class MiDashengLMDummyInputsBuilder( hf_processor = self.info.get_hf_processor() audio_token = hf_processor.audio_token + audio_bos_token = hf_processor.audio_bos_token + audio_eos_token = hf_processor.audio_eos_token - return audio_token * num_audios + single_audio_text = f"{audio_bos_token}{audio_token}{audio_eos_token}" + return single_audio_text * num_audios def get_dummy_mm_data( self, @@ -577,14 +580,7 @@ class MiDashengLMMultiModalProcessor( vocab = tokenizer.get_vocab() audio_token = getattr(processor, "audio_token", "<|AUDIO|>") - audio_bos_token = getattr(processor, "audio_bos_token", - "<|audio_bos|>") - audio_eos_token = getattr(processor, "audio_eos_token", - "<|audio_eos|>") - audio_token_id = vocab[audio_token] - audio_bos_id = vocab[audio_bos_token] - audio_eos_id = vocab[audio_eos_token] out_mm_data = out_mm_kwargs.get_data() audio_length = out_mm_data.get("audio_length") @@ -604,7 +600,7 @@ class MiDashengLMMultiModalProcessor( audio_tokens = [audio_token_id] * num_features return PromptUpdateDetails.select_token_id( - [audio_bos_id] + audio_tokens + [audio_eos_id], + audio_tokens, embed_token_id=audio_token_id, ) @@ -670,8 +666,18 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): return mm_input.reshape(-1, *mm_input.shape[2:]) - else: - return torch.concat(mm_input) + + if name == "input_values": + max_length = max(tensor.shape[1] for tensor in mm_input) + padded_mm_input = [ + torch.nn.functional.pad(tensor, + (0, max_length - tensor.shape[1])) + if tensor.shape[1] < max_length else tensor + for tensor in mm_input + ] + return torch.concat(padded_mm_input) + + return torch.concat(mm_input) def _parse_and_validate_audio_input( self, **kwargs: object) -> Optional[MiDashengLMAudioInputs]: From 4979eb79da80a6dd7d4e52103053bf00b80c65cb Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Mon, 15 Sep 2025 10:08:52 +0200 Subject: [PATCH 272/584] [Doc]: fix typos in various files (#24821) Signed-off-by: Didier Durand <durand.didier@gmail.com> --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 2 +- vllm/model_executor/layers/quantization/moe_wna16.py | 2 +- .../layers/quantization/utils/marlin_utils_test.py | 2 +- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/models/glm4_1v.py | 2 +- vllm/model_executor/models/interns1.py | 2 +- vllm/model_executor/models/ultravox.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 37e2980eea974..2ef36089b6afb 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ 
b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -8,7 +8,7 @@ This benchmark aims to: Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. -Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) +Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) ## Setup diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index d6d7ec9b15805..c25b3dd6080dc 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -190,7 +190,7 @@ class MoeWNA16Method(FusedMoEMethodBase): group_size = self.quant_config.group_size group_size_div_factor = 1 - # make intermediate_size and hidden_size diviable by group_size + # make intermediate_size and hidden_size divisible by group_size # we reduce the group size to ensure that # and we would repeat the loaded_weight later while intermediate_size_per_partition % group_size or \ diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index b2c228c242532..f5acd03cc6622 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -19,7 +19,7 @@ class MarlinWorkspace: def __init__(self, out_features, min_thread_n, max_parallel): assert (out_features % min_thread_n == 0), ( - "out_features = {} is undivisible by min_thread_n = {}".format( + "out_features = {} is indivisible by min_thread_n = {}".format( out_features, min_thread_n)) max_workspace_size = ((out_features // min_thread_n) * max_parallel) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 829dd82b0bd4d..9d93cad2420ad 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -649,7 +649,7 @@ def _sample_with_torch( else: sampled_token_ids_tensor = None - # Counterintiutively, having two loops here is actually faster. + # Counterintuitively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. for sampling_type in SamplingType: sample_indices = categorized_sample_indices[sampling_type] diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 22386a5e819ab..13c1ff8887022 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1524,7 +1524,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d998b8a0ab4f7..b59d1b88cf5ce 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -738,7 +738,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, return [] # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). 
+ # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index ad911ebedf895..371ca817d5f92 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -662,7 +662,7 @@ def pad_and_concat_to_dim3( max_len = max(f.shape[-1] for f in features) # Ensure all features have dim=3 features = [f.view(-1, *f.shape[-2:]) for f in features] - # Pad and oncatenate: + # Pad and concatenate: # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)] features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features] return torch.cat(features) From 59e17dd4a0f058e0bef38a371316b87fee3e882e Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Mon, 15 Sep 2025 17:18:42 +0800 Subject: [PATCH 273/584] [Misc] rename interval to max_recent_requests (#24229) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- vllm/v1/core/kv_cache_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index c3e1865bc55b0..f225b73264049 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -116,8 +116,8 @@ class PrefixCachingMetrics: This function is called with information gathered when new requests are being scheduled and are looking for computed blocks. - When there are more than `interval` requests, the oldest set of - requests are removed from the metrics. + When there are more than `max_recent_requests` requests, the oldest set + of requests are removed from the metrics. Args: stats: The prefix cache stats. 
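The `max_recent_requests` rename above caps a sliding window of per-step prefix-cache stats: stats are appended as requests are scheduled, and once the tracked request count exceeds the cap, the oldest sets of stats are evicted so the reported hit rate follows recent traffic rather than the whole process lifetime. The sketch below illustrates that aggregation pattern under simplified assumptions; `RecentRequestsMetrics` and this `PrefixCacheStats` shape are hypothetical stand-ins, not vLLM's actual `PrefixCachingMetrics`.

```python
from collections import deque
from dataclasses import dataclass


@dataclass
class PrefixCacheStats:
    """Stats gathered for one scheduling step (simplified, illustrative)."""
    requests: int  # requests scheduled in this step
    queries: int   # tokens looked up in the prefix cache
    hits: int      # tokens served from already-computed blocks


class RecentRequestsMetrics:
    """Hit rate over roughly the most recent `max_recent_requests` requests.

    Illustrative sketch only: eviction granularity and bookkeeping are
    simplified relative to the real implementation.
    """

    def __init__(self, max_recent_requests: int = 1000) -> None:
        self.max_recent_requests = max_recent_requests
        self.window: deque[PrefixCacheStats] = deque()
        # Running totals so hit_rate stays O(1) instead of re-summing.
        self.requests = 0
        self.queries = 0
        self.hits = 0

    def observe(self, stats: PrefixCacheStats) -> None:
        self.window.append(stats)
        self.requests += stats.requests
        self.queries += stats.queries
        self.hits += stats.hits
        # When more than `max_recent_requests` requests are tracked,
        # drop the oldest sets of stats (whole steps at a time).
        while (self.requests > self.max_recent_requests
               and len(self.window) > 1):
            old = self.window.popleft()
            self.requests -= old.requests
            self.queries -= old.queries
            self.hits -= old.hits

    @property
    def hit_rate(self) -> float:
        return self.hits / self.queries if self.queries else 0.0


if __name__ == "__main__":
    metrics = RecentRequestsMetrics(max_recent_requests=4)
    for step in range(8):
        metrics.observe(PrefixCacheStats(requests=2, queries=100,
                                         hits=60 + step))
    print(f"recent hit rate: {metrics.hit_rate:.2%}")
```

Evicting whole per-step sets at a time matches the wording of the docstring above: the window can momentarily track more than `max_recent_requests` requests before the oldest set is removed.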
From a0d8b9738dccaa70a9eb6355c9f9cf6c9bb9fc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Mon, 15 Sep 2025 11:21:09 +0200 Subject: [PATCH 274/584] [Misc] Own KVConnectors installation (#24867) Signed-off-by: NickLucche <nlucches@redhat.com> --- .github/CODEOWNERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 846b68054c0a1..c0858c309501a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -101,4 +101,7 @@ mkdocs.yaml @hmellor /vllm/v1/worker/tpu* @NickLucche /vllm/platforms/tpu.py @NickLucche /vllm/v1/sample/tpu @NickLucche -/vllm/tests/v1/tpu @NickLucche \ No newline at end of file +/vllm/tests/v1/tpu @NickLucche + +# KVConnector installation files +/requirements/kv_connectors.txt @NickLucche \ No newline at end of file From 8de261b04a0a0e916d3d25d528d0f2ddeede2a6b Mon Sep 17 00:00:00 2001 From: Chao Lei <leichao139636@163.com> Date: Mon, 15 Sep 2025 17:36:06 +0800 Subject: [PATCH 275/584] [P/D]`kv_output_aggregator` support P TP > D TP (#23917) Signed-off-by: LCAIZJ <leichao139636@163.com> Co-authored-by: leichao.lc <leichao.lc@antgroup.com> --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 11 +++++++++++ vllm/executor/executor_base.py | 7 +++++++ vllm/v1/engine/core.py | 3 +++ vllm/v1/executor/multiproc_executor.py | 3 --- vllm/v1/executor/ray_distributed_executor.py | 2 -- 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 7e0b927c5b78f..70c07eac6304b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -355,3 +355,14 @@ class KVConnectorBase_V1(ABC): raise TypeError("get_required_kvcache_layout should not be called " "on the abstract base class") return None + + def get_finished_count(self) -> Optional[int]: + """ + Get the count of requests expected to complete send/receive operations + via this connector. + + Returns: + int: expected sending or receiving completion count. 
+ """ + + return None \ No newline at end of file diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index a3c1d79a58b26..d18bef1256af5 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -13,6 +13,7 @@ from typing_extensions import TypeVar import vllm.platforms from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -54,6 +55,7 @@ class ExecutorBase(ABC): self._init_executor() self.is_sleeping = False self.sleeping_tags: set[str] = set() + self.kv_output_aggregator = None @abstractmethod def _init_executor(self) -> None: @@ -252,6 +254,11 @@ class ExecutorBase(ABC): exception.""" self.check_health() + def init_kv_output_aggregator(self, finished_count: Optional[int]) -> None: + """Init KVOutputAggregator""" + self.kv_output_aggregator = KVOutputAggregator( + finished_count or self.parallel_config.world_size) + class DistributedExecutorBase(ExecutorBase): """Abstract superclass of distributed executor implementations.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 64a67f3b438e1..a022e9c0d7058 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,6 +128,9 @@ class EngineCore: log_stats=self.log_stats, ) self.use_spec_decode = vllm_config.speculative_config is not None + if self.scheduler.connector is not None: # type: ignore + self.model_executor.init_kv_output_aggregator( + self.scheduler.connector.get_finished_count()) # type: ignore self.mm_registry = mm_registry = MULTIMODAL_REGISTRY self.mm_receiver_cache = engine_receiver_cache_from_config( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index f566c9aee0c54..3aa373f12b609 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -26,7 +26,6 @@ from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) -from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, get_pp_group, get_tp_group) from vllm.executor.multiproc_worker_utils import ( @@ -135,8 +134,6 @@ class MultiprocExecutor(Executor): self.output_rank = self._get_output_rank() self.has_connector = self.vllm_config.kv_transfer_config is not None - self.kv_output_aggregator = KVOutputAggregator( - self.parallel_config.world_size) def start_worker_monitor(self): workers = self.workers diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 59c9b56625a95..aadb5fd1dddd5 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -51,8 +51,6 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None - self.kv_output_aggregator = KVOutputAggregator( - self.parallel_config.world_size) @property def max_concurrent_batches(self) -> int: From bc0f6059a24cdebc125c984d0ca1138755285a81 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Mon, 15 Sep 2025 18:04:37 +0800 Subject: [PATCH 276/584] [UT] enhance free kv cache block queue popleft_n (#24220) Signed-off-by: Andy Xie 
<andy.xning@gmail.com> --- tests/v1/core/test_kv_cache_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 5d2517ce8c025..d1427f5eaf443 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -280,9 +280,11 @@ def test_free_kv_cache_block_queue_popleft_n(): # Pop 0 block # fake_head->b1->b3->b5->b4->b0->b2->fake_tail assert len(queue.popleft_n(0)) == 0 + assert queue.num_free_blocks == 6 # Pop 1 block # fake_head->b3->b5->b4->b0->b2->fake_tail result_blocks = queue.popleft_n(1) + assert queue.num_free_blocks == 5 assert len(result_blocks) == 1 assert result_blocks[0] is blocks[1] for block in result_blocks: @@ -292,6 +294,7 @@ def test_free_kv_cache_block_queue_popleft_n(): # fake_head->b4->b0->b2->fake_tail result_blocks = queue.popleft_n(2) assert len(result_blocks) == 2 + assert queue.num_free_blocks == 3 assert result_blocks[0] is blocks[3] assert result_blocks[1] is blocks[5] for block in result_blocks: @@ -301,6 +304,7 @@ def test_free_kv_cache_block_queue_popleft_n(): # fake_head->fake_tail result_blocks = queue.popleft_n(3) assert len(result_blocks) == 3 + assert queue.num_free_blocks == 0 assert result_blocks[0] is blocks[4] assert result_blocks[1] is blocks[0] assert result_blocks[2] is blocks[2] From 2e41f5abca796c615f4db8d9a496d037fa385653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Mon, 15 Sep 2025 12:09:34 +0200 Subject: [PATCH 277/584] [XPU] Set consistent default KV cache layout (#24745) Signed-off-by: NickLucche <nlucches@redhat.com> --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 15 +++++++++------ vllm/platforms/xpu.py | 10 ++++------ vllm/v1/attention/backends/utils.py | 14 ++++++++++---- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index c306eeb5aa7ab..1ff1407aeb99b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -56,9 +56,9 @@ except ImportError: logger.warning("NIXL is not available") NixlWrapper = None -# Supported xPUs and types of kv transfer buffer. -# {xPU: tuple of supported kv buffer types} -_NIXL_SUPPORTED_XPUS = { +# Supported platforms and types of kv transfer buffer. 
+# {device: tuple of supported kv buffer types} +_NIXL_SUPPORTED_DEVICE = { "cuda": ("cuda", ), "tpu": ("cpu", ), "xpu": ("cpu", ), @@ -458,9 +458,9 @@ class NixlConnectorWorker: self.device_type = current_platform.device_type self.kv_buffer_device: str = \ vllm_config.kv_transfer_config.kv_buffer_device - if self.device_type not in _NIXL_SUPPORTED_XPUS: + if self.device_type not in _NIXL_SUPPORTED_DEVICE: raise RuntimeError(f"{self.device_type} is not supported.") - elif self.kv_buffer_device not in _NIXL_SUPPORTED_XPUS[ + elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[ self.device_type]: raise RuntimeError( f"{self.device_type} with {self.kv_buffer_device} kv_buffer " @@ -468,7 +468,7 @@ class NixlConnectorWorker: self.device_kv_caches: dict[str, torch.Tensor] = {} # cpu kv buffer for xfer - # used when xPU memory can not be registered under nixl + # used when device memory can not be registered under nixl self.host_xfer_buffers: dict[str, torch.Tensor] = {} self.use_host_buffer = self.kv_buffer_device == "cpu" if self.kv_buffer_device == "cuda": @@ -927,6 +927,9 @@ class NixlConnectorWorker: if tp_ratio > 1: # Heterogeneous TP expects same kv_cache_layout. assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout + if self.device_type == "xpu": + raise ValueError( + "Heterogeneous TP is not supported on XPU") assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( "Remote P worker KV layer cache must be of shape [2, N, " diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 32208e7fff018..792115b33ea8d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -9,6 +9,7 @@ import torch import vllm.envs as envs from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS +from vllm.v1.attention.backends.utils import set_kv_cache_layout from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -164,12 +165,9 @@ class XPUPlatform(Platform): vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - if (envs.VLLM_KV_CACHE_LAYOUT is None - or envs.VLLM_KV_CACHE_LAYOUT != "NHD"): - os.environ["VLLM_KV_CACHE_LAYOUT"] = "NHD" - logger.info( - "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " - "only NHD layout is supported by XPU attention kernels.") + set_kv_cache_layout("NHD") + logger.info("Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " + "only NHD layout is supported by XPU attention kernels.") @classmethod def is_pin_memory_available(cls): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 009943fa743d8..ead70c910a8fa 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -5,8 +5,8 @@ import enum import functools from abc import abstractmethod from dataclasses import dataclass, fields, make_dataclass -from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Optional, Protocol, - TypeVar) +from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Literal, Optional, + Protocol, TypeVar, Union, get_args) import numpy as np import torch @@ -30,7 +30,12 @@ from vllm.logger import init_logger from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) -_KV_CACHE_LAYOUT_OVERRIDE = None +KVCacheLayoutType = Literal["NHD", "HND"] +_KV_CACHE_LAYOUT_OVERRIDE: Union[KVCacheLayoutType, None] = None + + +def is_valid_kv_cache_layout(value: str) -> bool: + return value in get_args(KVCacheLayoutType) @dataclass @@ -296,12 +301,13 @@ def get_kv_cache_layout(): if cache_layout is None: 
cache_layout = get_kv_connector_cache_layout() else: + assert is_valid_kv_cache_layout(cache_layout) logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. Setting KV cache layout to %s.", cache_layout) return cache_layout -def set_kv_cache_layout(cache_layout: str): +def set_kv_cache_layout(cache_layout: KVCacheLayoutType): global _KV_CACHE_LAYOUT_OVERRIDE _KV_CACHE_LAYOUT_OVERRIDE = cache_layout From bf214ca22625e311a2c4c0dfbf7af19128f4919c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Mon, 15 Sep 2025 19:57:30 +0800 Subject: [PATCH 278/584] [Misc] Fix examples openai_pooling_client.py (#24853) Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/pooling_models.md | 4 +- docs/models/supported_models.md | 2 +- docs/serving/openai_compatible_server.md | 10 ++--- examples/offline_inference/pooling/README.md | 33 ++++++++++++++ .../{ => pooling}/convert_model_to_seq_cls.py | 0 .../{ => pooling}/embed_jina_embeddings_v3.py | 0 .../{ => pooling}/embed_matryoshka_fy.py | 0 .../{ => pooling}/qwen3_reranker.py | 0 .../openai_embedding_long_text/service.sh | 2 +- examples/online_serving/pooling/README.md | 43 +++++++++++++++++++ .../{ => pooling}/cohere_rerank_client.py | 0 .../{ => pooling}/jinaai_rerank_client.py | 0 ...ai_chat_embedding_client_for_multimodal.py | 6 +++ .../openai_classification_client.py | 5 +++ .../{ => pooling}/openai_embedding_client.py | 5 +++ .../openai_embedding_matryoshka_fy.py | 0 .../{ => pooling}/openai_pooling_client.py | 6 ++- 17 files changed, 105 insertions(+), 11 deletions(-) create mode 100644 examples/offline_inference/pooling/README.md rename examples/offline_inference/{ => pooling}/convert_model_to_seq_cls.py (100%) rename examples/offline_inference/{ => pooling}/embed_jina_embeddings_v3.py (100%) rename examples/offline_inference/{ => pooling}/embed_matryoshka_fy.py (100%) rename examples/offline_inference/{ => pooling}/qwen3_reranker.py (100%) create mode 100644 examples/online_serving/pooling/README.md rename examples/online_serving/{ => pooling}/cohere_rerank_client.py (100%) rename examples/online_serving/{ => pooling}/jinaai_rerank_client.py (100%) rename examples/online_serving/{ => pooling}/openai_chat_embedding_client_for_multimodal.py (92%) rename examples/online_serving/{ => pooling}/openai_classification_client.py (86%) rename examples/online_serving/{ => pooling}/openai_embedding_client.py (82%) rename examples/online_serving/{ => pooling}/openai_embedding_matryoshka_fy.py (100%) rename examples/online_serving/{ => pooling}/openai_pooling_client.py (89%) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index d2fbb1870dde6..0521a22c07029 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -228,7 +228,7 @@ outputs = llm.embed(["Follow the white rabbit."], print(outputs[0].outputs) ``` -A code example can be found here: <gh-file:examples/offline_inference/embed_matryoshka_fy.py> +A code example can be found here: <gh-file:examples/offline_inference/pooling/embed_matryoshka_fy.py> ### Online Inference @@ -258,4 +258,4 @@ Expected output: 
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py> +An OpenAI client example can be found here: <gh-file:examples/online_serving/pooling/openai_embedding_matryoshka_fy.py> diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6295a2aa8dc2f..4f95280233ac7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -530,7 +530,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>. + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/pooling/qwen3_reranker.py>. ```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index dfed15d4ace97..181a874efa3cb 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -239,7 +239,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api]) which will be treated as a single prompt to the model. -Code example: <gh-file:examples/online_serving/openai_embedding_client.py> +Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py> #### Multi-modal inputs @@ -313,7 +313,7 @@ and passing a list of `messages` in the request. Refer to the examples below for `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py> +Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py> #### Extra parameters @@ -421,7 +421,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats. 
-Code example: <gh-file:examples/online_serving/openai_pooling_client.py> +Code example: <gh-file:examples/online_serving/pooling/openai_pooling_client.py> [](){ #classification-api } @@ -431,7 +431,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. -Code example: <gh-file:examples/online_serving/openai_classification_client.py> +Code example: <gh-file:examples/online_serving/pooling/openai_classification_client.py> #### Example Requests @@ -760,7 +760,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. -Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py> +Code example: <gh-file:examples/online_serving/pooling/jinaai_rerank_client.py> #### Example Request diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md new file mode 100644 index 0000000000000..8693f5e08e0ba --- /dev/null +++ b/examples/offline_inference/pooling/README.md @@ -0,0 +1,33 @@ +# Pooling models + +## Convert llm model to seq cls + +```bash +# for BAAI/bge-reranker-v2-gemma +# Caution: "Yes" and "yes" are two different tokens +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls +# for mxbai-rerank-v2 +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls +# for Qwen3-Reranker +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls +``` + +## Embed jina_embeddings_v3 usage + +Only text matching task is supported for now. 
See <gh-pr:16120> + +```bash +python examples/offline_inference/pooling/embed_jina_embeddings_v3.py +``` + +## Embed matryoshka dimensions usage + +```bash +python examples/offline_inference/pooling/embed_matryoshka_fy.py +``` + +## Qwen3 reranker usage + +```bash +python qwen3_reranker.py +``` diff --git a/examples/offline_inference/convert_model_to_seq_cls.py b/examples/offline_inference/pooling/convert_model_to_seq_cls.py similarity index 100% rename from examples/offline_inference/convert_model_to_seq_cls.py rename to examples/offline_inference/pooling/convert_model_to_seq_cls.py diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py similarity index 100% rename from examples/offline_inference/embed_jina_embeddings_v3.py rename to examples/offline_inference/pooling/embed_jina_embeddings_v3.py diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py similarity index 100% rename from examples/offline_inference/embed_matryoshka_fy.py rename to examples/offline_inference/pooling/embed_matryoshka_fy.py diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/pooling/qwen3_reranker.py similarity index 100% rename from examples/offline_inference/qwen3_reranker.py rename to examples/offline_inference/pooling/qwen3_reranker.py diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh index f356d7d4529ea..56888c8aa0e4c 100644 --- a/examples/online_serving/openai_embedding_long_text/service.sh +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -120,7 +120,7 @@ echo " - API Key: $API_KEY" echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" echo "" echo "🧪 Test the server with:" -echo " python examples/online_serving/openai_embedding_long_text_client.py" +echo " python examples/online_serving/openai_embedding_long_text/client.py" echo "" echo "📚 Enhanced features enabled:" echo " ✅ Intelligent native pooling type detection" diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md new file mode 100644 index 0000000000000..f7926542202d6 --- /dev/null +++ b/examples/online_serving/pooling/README.md @@ -0,0 +1,43 @@ +# Pooling models + +## Cohere rerank usage + +```bash +python examples/online_serving/pooling/cohere_rerank_client.py +``` + +## Jinaai rerank usage + +```bash +python examples/online_serving/pooling/jinaai_rerank_client.py +``` + +## Openai chat embedding for multimodal usage + +```bash +python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +``` + +## Openai classification usage + +```bash +python examples/online_serving/pooling/openai_classification_client.py +``` + +## Openai embedding usage + +```bash +python examples/online_serving/pooling/openai_embedding_client.py +``` + +## Openai embedding matryoshka dimensions usage + +```bash +python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py +``` + +## Openai pooling usage + +```bash +python examples/online_serving/pooling/openai_pooling_client.py +``` diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py similarity index 100% rename from examples/online_serving/cohere_rerank_client.py rename to examples/online_serving/pooling/cohere_rerank_client.py diff --git 
a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/pooling/jinaai_rerank_client.py similarity index 100% rename from examples/online_serving/jinaai_rerank_client.py rename to examples/online_serving/pooling/jinaai_rerank_client.py diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py similarity index 92% rename from examples/online_serving/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 771ad8511e972..30cb3325b9b18 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -1,5 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +"""Example Python client for multimodal embedding API using vLLM API server +NOTE: + start a supported multimodal embeddings model server with `vllm serve`, e.g. + vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024 +""" import argparse import base64 diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/pooling/openai_classification_client.py similarity index 86% rename from examples/online_serving/openai_classification_client.py rename to examples/online_serving/pooling/openai_classification_client.py index b10e7acbd26c1..d8dc2ef001112 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/pooling/openai_classification_client.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for classification API using vLLM API server +NOTE: + start a supported classification model server with `vllm serve`, e.g. + vllm serve jason9693/Qwen2.5-1.5B-apeach +""" import argparse import pprint diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/pooling/openai_embedding_client.py similarity index 82% rename from examples/online_serving/openai_embedding_client.py rename to examples/online_serving/pooling/openai_embedding_client.py index 6bc390861e2ee..f5f6820d07d73 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/pooling/openai_embedding_client.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for embedding API using vLLM API server +NOTE: + start a supported embeddings model server with `vllm serve`, e.g. 
+ vllm serve intfloat/e5-small +""" from openai import OpenAI diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py similarity index 100% rename from examples/online_serving/openai_embedding_matryoshka_fy.py rename to examples/online_serving/pooling/openai_embedding_matryoshka_fy.py diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/pooling/openai_pooling_client.py similarity index 89% rename from examples/online_serving/openai_pooling_client.py rename to examples/online_serving/pooling/openai_pooling_client.py index 95555d41cbea5..569015746b128 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/pooling/openai_pooling_client.py @@ -4,7 +4,9 @@ Example online usage of Pooling API. Run `vllm serve <model> --runner pooling` -to start up the server in vLLM. +to start up the server in vLLM. e.g. + +vllm serve internlm/internlm2-1_8b-reward --trust-remote-code """ import argparse @@ -23,7 +25,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") + parser.add_argument("--model", type=str, default="internlm/internlm2-1_8b-reward") return parser.parse_args() From 72c99f2a75ee082e9755dcddfd5a2289ff4be7d7 Mon Sep 17 00:00:00 2001 From: ant-yy <vito.yy@antgroup.com> Date: Mon, 15 Sep 2025 20:09:30 +0800 Subject: [PATCH 279/584] [Model]: support Ling2.0 (#24627) Signed-off-by: vito.yy <vito.yy@antgroup.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/bailing_moe.py | 214 +++++++++++++++++----- vllm/model_executor/models/registry.py | 1 + 4 files changed, 169 insertions(+), 49 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4f95280233ac7..cc3ee8b788ddb 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -328,6 +328,7 @@ th { | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. 
| | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index b268bf12a3f30..3424cb6e7e7d4 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -180,6 +180,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), + "BailingMoeV2ForCausalLM": _HfExamplesInfo("inclusionAI/Ling-mini-2.0", + trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", min_transformers_version="4.55.3", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a42640cef9d44..5f6025abf315c 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( @@ -68,6 +67,7 @@ class BailingAttention(nn.Module): config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, prefix: str = "", ): super().__init__() @@ -84,10 +84,11 @@ class BailingAttention(nn.Module): self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads) self.q_size_per_rank = self.head_dim * self.num_heads - self.num_kv_heads = self.total_kv_heads // tp_size self.kv_size_per_rank = self.num_kv_heads * self.head_dim self.scale = self.head_dim**-0.5 + self.use_qk_norm = getattr(config, "use_qk_norm", False) + self.use_rmsnorm = getattr(config, "use_rmsnorm", False) self.query_key_value = QKVParallelLinear( self.hidden_size, @@ -99,28 +100,45 @@ class BailingAttention(nn.Module): prefix=f"{prefix}.query_key_value", ) + if self.use_qk_norm: + self.query_layernorm = (RMSNorm( + self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm + else nn.LayerNorm(self.head_dim, eps=1e-6)) + self.key_layernorm = (RMSNorm( + self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm + else nn.LayerNorm(self.head_dim, eps=1e-6)) + self.dense = RowParallelLinear( self.total_num_heads * self.head_dim, self.hidden_size, bias=config.use_bias, quant_config=quant_config, + reduce_results=reduce_results, prefix=f"{prefix}.dense", ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scale, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - prefix=f"{prefix}.attn") + self.partial_rotary_factor = getattr(config, "partial_rotary_factor", + 1.0) + + self.rotary_dim = getattr(config, "rotary_dim", self.head_dim) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, + rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, base=config.rope_theta, is_neox_style=True, rope_scaling=config.rope_scaling, + partial_rotary_factor=self.partial_rotary_factor, + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", ) def forward( @@ -135,6 +153,14 @@ class BailingAttention(nn.Module): ], dim=-1) + if self.use_qk_norm: + q = q.view(-1, self.num_heads, self.head_dim) + k = k.view(-1, 
self.num_kv_heads, self.head_dim) + q = self.query_layernorm(q) + k = self.key_layernorm(k) + q = q.view(-1, self.q_size_per_rank) + k = k.view(-1, self.kv_size_per_rank) + q, k = self.rotary_emb(position_ids, q, k) context_layer = self.attn(q, k, v) @@ -198,24 +224,72 @@ class BailingMoE(nn.Module): self.hidden_size = config.hidden_size self.quant_config = quant_config self.num_shared_experts = config.num_shared_experts - # Gate always runs at half / full precision for now. - self.gate = ReplicatedLinear(self.hidden_size, - self.num_experts, - bias=False, - quant_config=None) + self.score_function = getattr(config, "score_function", None) + self.n_group = getattr(config, "n_group", None) + self.topk_group = getattr(config, "topk_group", None) + self.use_grouped_topk = (self.n_group is not None + and self.topk_group is not None) + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", + 1.0) - self.experts = FusedMoE(num_experts=self.num_experts, - top_k=self.top_k, - hidden_size=self.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=self.norm_expert_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts") + router_dtype = getattr(config, "router_dtype", None) + if router_dtype is None: + self.router_dtype = None + elif router_dtype == "fp32": + self.router_dtype = torch.float32 + else: + self.router_dtype = torch.bfloat16 + + self.gate = nn.Linear( + self.hidden_size, + self.num_experts, + bias=False, + dtype=self.router_dtype, + ) + + if getattr(config, "moe_router_enable_expert_bias", False): + self.gate.expert_bias = nn.Parameter( + torch.empty((config.num_experts, ), dtype=torch.float32)) + else: + self.gate.expert_bias = None + + self.correction_bias = (self.gate.expert_bias.data + if self.gate.expert_bias is not None else None) + + if self.score_function is not None: + assert ( + self.score_function == "softmax" + and self.correction_bias is None + ) or ( + self.score_function == "sigmoid" + and self.correction_bias is not None + ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501 + else: + # default value for scoring_func + self.score_function = "softmax" + + self.experts = FusedMoE( + num_experts=self.num_experts, + top_k=self.top_k, + hidden_size=self.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.norm_expert_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + scoring_func=self.score_function, + e_score_correction_bias=self.gate.expert_bias, + num_expert_group=self.n_group, + topk_group=self.topk_group, + use_grouped_topk=self.use_grouped_topk, + ) if self.num_shared_experts > 0: - intermediate_size = (config.moe_intermediate_size * - self.num_shared_experts) + if hasattr(config, "moe_shared_expert_intermediate_size"): + intermediate_size = config.moe_shared_expert_intermediate_size + else: + intermediate_size = config.moe_intermediate_size + intermediate_size *= config.num_shared_experts self.shared_experts = BailingMLP( intermediate_size=intermediate_size, config=config, @@ -228,14 +302,18 @@ class BailingMoE(nn.Module): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_size) - if self.num_shared_experts > 0: + if self.shared_experts: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) - router_logits, _ = 
self.gate(hidden_states) + router_logits = self.gate(hidden_states.to(self.router_dtype)) + router_logits = router_logits.to(hidden_states.dtype) + final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - if self.num_shared_experts > 0: + final_hidden_states *= self.routed_scaling_factor + + if self.shared_experts: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: @@ -254,20 +332,30 @@ class BailingMoeBlock(nn.Module): prefix: str = "", ): super().__init__() + layer_idx = int(prefix.split('.')[-1]) + self.config = config hidden_size = config.hidden_size intermediate_size = config.intermediate_size + self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) self.attention = BailingAttention(config, cache_config, quant_config, prefix=f"{prefix}.attention") + self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) - self.mlp = BailingMoE(intermediate_size, - config, - quant_config, - True, - prefix=f"{prefix}.mlp") + + # Choose MLP class based on the number of experts and layer index + if layer_idx < config.first_k_dense_replace: + mlp_class = BailingMLP + else: + mlp_class = BailingMoE + self.mlp = mlp_class(intermediate_size, + config, + quant_config, + True, + prefix=f"{prefix}.mlp") def forward( self, @@ -310,11 +398,17 @@ class BailingMoeModel(nn.Module): self.config = config self.vocab_size = config.vocab_size self.embed_dim = config.hidden_size + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", + False) - if get_pp_group().is_first_rank or (config.tie_word_embeddings + if get_pp_group().is_first_rank or (self.tie_word_embeddings and get_pp_group().is_last_rank): self.word_embeddings = VocabParallelEmbedding( - self.vocab_size, self.embed_dim) + self.vocab_size, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.word_embeddings", + ) else: self.word_embeddings = PPMissingLayer() @@ -372,8 +466,11 @@ class BailingMoeModel(nn.Module): "hidden_states": hidden_states, "residual": residual }) - - hidden_states, _ = self.norm(hidden_states, residual) + else: + if residual is None: + hidden_states = self.norm(hidden_states) + else: + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: @@ -396,7 +493,8 @@ class BailingMoeModel(nn.Module): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - if self.config.norm_head and "lm_head.weight" in name: + if (hasattr(self.config, "norm_head") and self.config.norm_head + and "lm_head.weight" in name): loaded_weight = F.normalize(loaded_weight, dim=0, p=2, @@ -430,13 +528,17 @@ class BailingMoeModel(nn.Module): if is_pp_missing_parameter(name, self): continue + if name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) break else: if name.endswith(".bias") and name not in params_dict: @@ -473,19 +575,30 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ) -> None: super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config.get_text_config() + vllm_config.model_config.hf_config = config quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config 
= config + self.lora_config = lora_config self.quant_config = quant_config self.max_position_embeddings = config.max_position_embeddings self.model = BailingMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", + False) + if get_pp_group().is_last_rank: - self.lm_head = (self.word_embeddings if config.tie_word_embeddings - else ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config)) + if self.tie_word_embeddings: + self.lm_head = self.model.word_embeddings + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head", + ) self.logits_processor = LogitsProcessor(config.vocab_size) else: self.lm_head = PPMissingLayer() @@ -520,10 +633,13 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."] - if self.config.tie_word_embeddings else None), + skip_prefixes=(["lm_head."] if self.tie_word_embeddings else None), ) return loader.load_weights(weights) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() + + +class BailingMoeV2ForCausalLM(BailingMoeForCausalLM): + pass diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 85759df369850..c6ea4c2050577 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -52,6 +52,7 @@ _TEXT_GENERATION_MODELS = { # baichuan-13b, lower case 'c' in the class name "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), "BailingMoeForCausalLM": ("bailing_moe", "BailingMoeForCausalLM"), + "BailingMoeV2ForCausalLM": ("bailing_moe", "BailingMoeV2ForCausalLM"), "BambaForCausalLM": ("bamba", "BambaForCausalLM"), "BloomForCausalLM": ("bloom", "BloomForCausalLM"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), From 0e219cd50b189800573e469bc8184724f52a91a1 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Mon, 15 Sep 2025 20:45:06 +0800 Subject: [PATCH 280/584] [Bugfix] Fix GLM4.1V multimodal processor with compatability for Transformers v4.56 (#24822) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- examples/offline_inference/vision_language.py | 10 ++- .../multimodal/processing/test_common.py | 9 +- .../multimodal/processing/test_glm4_1v.py | 17 +++- vllm/assets/video.py | 15 +++- vllm/model_executor/models/glm4_1v.py | 88 +++++++++++-------- vllm/multimodal/video.py | 49 ++++++----- 6 files changed, 118 insertions(+), 70 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 4b75eb19fcf94..67a978ad2aae9 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1716,6 +1716,13 @@ model_example_map = { } +MODELS_NEED_VIDEO_METADATA = [ + "glm4_1v", + "glm4_5v", + "glm4_5v_fp8", +] + + def get_multi_modal_input(args): """ return { @@ -1740,12 +1747,13 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question + needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata vid_questions = ["Why is this video funny?"] return { - "data": [(video, metadata)] if args.model_type == "glm4_1v" else 
video, + "data": ([(video, metadata)] if needs_metadata else video), "questions": vid_questions, } diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ced0ab3377a9e..8bd93bd838fed 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -32,11 +32,14 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: # Ensure video metadata is included if "video" in mm_data: video = mm_data["video"] + num_frames = len(video) mm_data["video"] = (video, { - "total_num_frames": len(video), - "fps": len(video), + "total_num_frames": num_frames, + "fps": num_frames, "duration": 1, - "video_backend": "opencv" + "frames_indices": [i for i in range(num_frames)], + "video_backend": "opencv", + "do_sample_frames": True, }) return mm_data diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index dfb8d9b2a038d..070ddcd89ee96 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -12,8 +12,19 @@ from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) @pytest.mark.parametrize("expected_toks_per_frame", [299]) -@pytest.mark.parametrize("num_frames", [32, 128]) -@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) +@pytest.mark.parametrize( + "num_frames, fps, expected_grid_t", + [ + # pre-sampled fixed frames (unexpected behavior, + # but we still expect it to work without errors) + (32, 1, 16), + (32, 2, 16), + (128, 1, 64), + (128, 2, 64), + # post-sampled frames (expected behavior) + (-1, 1, 5), + (-1, 2, 10), + ]) def test_processor_override( model_id: str, expected_toks_per_frame: int, @@ -80,7 +91,7 @@ def test_video_loader_consistency( static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( - video_bytes, requested_fps=fps) + video_bytes, fps=fps) # pre-sampled loader shouldn't read all frames assert len(dynamic_video) < len(static_video) diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 983e9114cccfb..5c9e403c4b91f 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -76,7 +76,7 @@ def video_to_pil_images_list(path: str, return [Image.fromarray(frame) for frame in frames] -def video_get_metadata(path: str) -> dict[str, Any]: +def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -85,11 +85,18 @@ def video_get_metadata(path: str) -> dict[str, Any]: fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames / fps if fps > 0 else 0 + if num_frames == -1 or num_frames > total_frames: + num_frames = total_frames + metadata = { - "total_num_frames": total_frames, + "total_num_frames": num_frames, "fps": fps, "duration": duration, - "video_backend": "opencv" + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + # extra field used to control hf processor's video + # sampling behavior + "do_sample_frames": num_frames == total_frames, } return metadata @@ -126,7 +133,7 @@ class VideoAsset: @property def metadata(self) -> dict[str, Any]: - ret = video_get_metadata(self.video_path) + ret = video_get_metadata(self.video_path, self.num_frames) return ret def get_audio(self, sampling_rate: Optional[float] = 
None) -> npt.NDArray: diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 13c1ff8887022..cbf327ce02b6b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -36,7 +36,9 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange +from packaging.version import Version from transformers import BatchFeature +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, smart_resize) @@ -1001,28 +1003,32 @@ class Glm4vProcessingInfo(BaseProcessingInfo): max_frame_idx = meta_frames - 1 duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1) - if duration <= video_processor.max_duration: - n = int(math.floor(duration * video_processor.fps)) - frame_indices = [ - min( - max_frame_idx, - int(math.ceil(i * video_fps / video_processor.fps)), - ) for i in range(n) - ] + do_sample_frames = metadata["do_sample_frames"] + if not do_sample_frames: + frame_indices = metadata["frames_indices"] else: - num_samples = int(video_processor.max_duration * - video_processor.fps) - if num_samples >= meta_frames: - frame_indices = list(range(meta_frames)) - else: - target_seconds = np.linspace(0, - duration, - num_samples, - endpoint=True) + if duration <= video_processor.max_duration: + n = int(math.floor(duration * video_processor.fps)) frame_indices = [ - min(max_frame_idx, int(math.ceil(t * video_fps))) - for t in target_seconds + min( + max_frame_idx, + int(math.ceil(i * video_fps / video_processor.fps)), + ) for i in range(n) ] + else: + num_samples = int(video_processor.max_duration * + video_processor.fps) + if num_samples >= meta_frames: + frame_indices = list(range(meta_frames)) + else: + target_seconds = np.linspace(0, + duration, + num_samples, + endpoint=True) + frame_indices = [ + min(max_frame_idx, int(math.ceil(t * video_fps))) + for t in target_seconds + ] seen, uniq = set(), [] for idx in frame_indices: @@ -1139,7 +1145,9 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): "fps": 2.0, "duration": num_frames / 2.0, "total_num_frames": num_frames, + "frames_indices": [i for i in range(num_frames)], "video_backend": "opencv", + "do_sample_frames": False, } video_item = (video.copy(), video_metadata) video_items.append(video_item) @@ -1172,34 +1180,37 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): for item in mm_data.pop("videos", []): video_array, metadata = item - if metadata["video_backend"] == "opencv_dynamic": - mm_kwargs["do_sample_frames"] = False - - elif metadata["total_num_frames"] != len(video_array): - logger.warning( - "Total frames in metadata " - "(%s) does not match the length of " - "video array %s. This can " - "be because the video is resampled " - "in advance. 
This may cause " - "a divergence with HF implementation.", - metadata["total_num_frames"], - len(video_array), - ) - metadata["total_num_frames"] = len(video_array) + # don't update mm_kwargs inplace + video_mm_kwargs = dict(**mm_kwargs) + video_mm_kwargs["do_sample_frames"] = metadata.get( + "do_sample_frames", True) video_mm_data = dict() video_mm_data["videos"] = [[video_array]] - video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]] + + # backward compatibility for Transformers 4.55 + unuse_metadata = ["do_sample_frames"] + if not hasattr( + VideoMetadata, + "frames_indices") and "frames_indices" in metadata: + unuse_metadata.append("frames_indices") + + video_mm_data["video_metadata"] = [[ + VideoMetadata( + **{ + k: metadata[k] + for k in metadata if k not in unuse_metadata + }) + ]] video_outputs = super()._call_hf_processor( prompt="<|begin_of_video|><|video|><|end_of_video|>", mm_data=video_mm_data, - mm_kwargs=mm_kwargs, + mm_kwargs=video_mm_kwargs, tok_kwargs=tok_kwargs, ) - if "do_sample_frames" in mm_kwargs and not mm_kwargs[ - "do_sample_frames"]: + if not video_mm_kwargs["do_sample_frames"] and Version( + TRANSFORMERS_VERSION) < Version("4.56.0"): # Transformers v4.55 has incorrect timestamps issue for # skip sampling. We construct the placeholder manually to # get placeholders with correct timestamps. @@ -1218,6 +1229,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): prompt = prompt.replace( "<|begin_of_video|><|video|><|end_of_video|>", video_placeholder, + 1, ) video_grid_thw_lst.append(video_outputs["video_grid_thw"]) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index df6e19da82ca2..fb2dcac49ee93 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -121,14 +121,6 @@ class OpenCVVideoBackend(VideoLoader): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 - # Use transformers transformers.video_utils.VideoMetadata format - metadata = { - "total_num_frames": total_frames_num, - "fps": original_fps, - "duration": duration, - "video_backend": "opencv" - } - # resample video to target num_frames full_read = num_frames == -1 or total_frames_num < num_frames if full_read: @@ -159,6 +151,20 @@ class OpenCVVideoBackend(VideoLoader): assert i == num_frames, (f"Expected reading {num_frames} frames, " f"but only loaded {i} frames from video.") + # Use transformers transformers.video_utils.VideoMetadata format + # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata + # can cause incorrect timestamp calculation without num_frames=-1. 
+ metadata = { + "total_num_frames": num_frames, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + # extra field used to control hf processor's video + # sampling behavior + "do_sample_frames": num_frames == total_frames_num, + } + return frames, metadata @@ -170,7 +176,7 @@ cls, data: bytes, num_frames: int = -1, - requested_fps: int = 2, + fps: int = 2, max_duration: int = 300, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: @@ -185,14 +191,6 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 - # Use transformers transformers.video_utils.VideoMetadata format - metadata = { - "total_num_frames": total_frames_num, - "fps": original_fps, - "duration": duration, - "video_backend": "opencv_dynamic" - } - # resample video to target num_frames max_frame_idx = total_frames_num - 1 duration = duration or round(max_frame_idx / original_fps) + 1 @@ -201,14 +199,13 @@ # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 frame_indices: Union[range, list[int]] if duration <= max_duration: - n = int(math.floor(duration * requested_fps)) + n = int(math.floor(duration * fps)) frame_indices = sorted({ - min(max_frame_idx, - int(math.ceil(i * original_fps / requested_fps))) + min(max_frame_idx, int(math.ceil(i * original_fps / fps))) for i in range(n) }) else: - num_samples = int(max_duration * requested_fps) + num_samples = int(max_duration * fps) if num_samples >= total_frames_num: frame_indices = range(total_frames_num) else: @@ -241,6 +238,16 @@ f"Expected reading {len(frame_indices)} frames, " f"but only loaded {i} frames from video.") + # Use transformers transformers.video_utils.VideoMetadata format + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv_dynamic", + "frames_indices": list(frame_indices), + "do_sample_frames": False, + } + return frames, metadata
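Illustration (not part of the patch): both loaders now describe the frames they actually return. `frames_indices` records which source frames survived sampling, and `do_sample_frames` tells the downstream HF processor whether it still needs to resample. A rough sketch of the metadata the dynamic loader would emit for a 10 s, 30 fps source sampled down to 2 fps; all numbers are illustrative:

# Hypothetical values for a 300-frame source kept at 20 frames (2 fps target).
metadata = {
    "total_num_frames": 300,  # frames in the source video
    "fps": 30.0,              # native fps of the source
    "duration": 10.0,         # seconds
    "video_backend": "opencv_dynamic",
    "frames_indices": [i * 15 for i in range(20)],  # ceil(i * 30 / 2) = 15 * i
    "do_sample_frames": False,  # already sampled; HF must not resample again
}

From 01413e0cf5a04da4049ffa38b6ff3df27ccabd06 Mon Sep 17 00:00:00 2001 From: xiao-llm <xiao.yu.dc@outlook.com> Date: Mon, 15 Sep 2025 10:43:26 -0400 Subject: [PATCH 281/584] Fp8 paged attention update (#22222) Signed-off-by: Xiao Yu <xiao.yu@amd.com> Signed-off-by: xiao-llm <xiao.yu.dc@outlook.com> Co-authored-by: Xiao Yu <xiao.yu@metamaterial.com> Co-authored-by: Xiao Yu <xiao.yu@amd.com> Co-authored-by: Bowen Bao <bowenbao@amd.com> --- csrc/rocm/attention.cu | 321 ++++++++++++++++++++++++++--------- csrc/rocm/ops.h | 3 +- csrc/rocm/torch_bindings.cpp | 3 +- vllm/_custom_ops.py | 3 +- vllm/envs.py | 6 + 5 files changed, 257 insertions(+), 79 deletions(-) diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index e3a0e15f5304f..dac9df6048f2a 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -30,6 +30,10 @@ #define __HIP__GFX9__ #endif +#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__FP8MFMA__ +#endif + #if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__)) #define __HIP__GFX11__ #endif @@ -51,6 +55,12 @@ #define MIN(a, b) ((a) < (b) ?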
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) +enum class MFMAType { + F16 = 0, + Fp8 = 1, + Fp4 = 2, +}; + #if defined(__HIP__GFX9__) #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32 @@ -112,6 +122,21 @@ __device__ __forceinline__ floatx4 gcn_mfma16x16x16_instr(const _B16x4& inpA, } } +template <typename T, int absz, int cbid, int blgp> +__device__ __forceinline__ floatx4 gcn_mfma16x16x32_instr(const long& inpA, + const long& inpB, + const floatx4& inpC) { + if constexpr (std::is_same<T, __hip_fp8_e4m3>::value) { + return __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(inpA, inpB, inpC, absz, + cbid, blgp); + } else if constexpr (std::is_same<T, __hip_fp8_e5m2>::value) { + return __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(inpA, inpB, inpC, absz, + cbid, blgp); + } else { + static_assert(false, "unsupported 8b dtype"); + } +} + template <typename T> __device__ __forceinline__ float to_float(const T& inp) { if constexpr (std::is_same<T, _Float16>::value) { @@ -256,12 +281,44 @@ __device__ __forceinline__ _B16x8 convert_b8x8_custom(const _B8x8 input) { return ret; } +typedef union u64_cvt { + half f16x4[4]; + int16_t b16x4[4]; + _B8x8 b8x8; + _B16x4 b64; + int64_t i64; +} _T8x8; + +__device__ __forceinline__ _B8x8 convert_b16x8(const _B16x8& input, + _T8x8& Mtemp) { + _T8x8 Qtmp8x8; + + for (int i = 0; i < 2; i++) { + floatx4 q_out = {0, 0, 0, 0}; + q_out = gcn_mfma16x16x16_instr<_Float16, 0, 0, 0>(Mtemp.b64, input.xy[i], + q_out); + Qtmp8x8.b16x4[i * 2] = + __builtin_amdgcn_cvt_pk_fp8_f32(q_out[0], q_out[1], 0, false); + Qtmp8x8.b16x4[i * 2 + 1] = + __builtin_amdgcn_cvt_pk_fp8_f32(q_out[2], q_out[3], 0, false); + } + return Qtmp8x8.b8x8; +} + +__device__ float warpReduceMax(float val) { + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val = max( + val, __shfl_down(val, offset, WARP_SIZE)); // Using max() for reduction + } + return val; +} + // grid (num_seqs, num_partitions,num_kv_heads) // block (256) // clang-format off template <typename scalar_t, typename cache_t, vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE, - int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO> + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -367,6 +424,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; int kphysical_block_number[TLOOP]; + #if defined(__HIP__FP8MFMA__) + float q_max = 0; + float q_scale = 1.0; + #endif // fetch k physical block numbers for (int token_depth = 0; token_depth < TLOOP; token_depth++) { @@ -416,6 +477,15 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( Qlocal[qkhe_depth][qkratio].xy[i] = shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO] [2 * qkratio + i]; + #if defined(__HIP__FP8MFMA__) + if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto && + MFMA_TYPE == MFMAType::Fp8) { + scalar_t* qptr = + reinterpret_cast<scalar_t*>(&Qlocal[qkhe_depth][qkratio].xy[i]); + for (int k = 0; k < 4; k++) + q_max = fmax(fabs(to_float<scalar_t>(qptr[k])), q_max); + } + #endif } } } @@ -515,6 +585,14 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) { // multiply by k_scale if fp8 kv cache 
scale2 *= *k_scale; + #if defined(__HIP__FP8MFMA__) + q_max = warpReduceMax(q_max); + constexpr float FP8_E4M3_SCALE_TARGET = 224.0f; + if constexpr (MFMA_TYPE == MFMAType::Fp8) { + q_scale = q_max > 0 ? FP8_E4M3_SCALE_TARGET / q_max : 1.0f; + scale2 /= q_scale; + } + #endif } floatx4 d_out[TLOOP]; @@ -534,12 +612,41 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( auto Ktmp = Klocal[token_depth][qkhe_depth]; _B8x16 Ktmp8x16 = *reinterpret_cast<_B8x16*>(&Ktmp); for (int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) { - _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio]; - _B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8); - for (int i = 0; i < 2; i++) { - d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>( - Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i], - d_out[token_depth]); + if constexpr (MFMA_TYPE == MFMAType::F16) { + _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio]; + _B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8); + for (int i = 0; i < 2; i++) { + d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>( + Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i], + d_out[token_depth]); + } + } else { + #if defined(__HIP__FP8MFMA__) + _T8x8 Ktmp8x8, Qtmp8x8; + Ktmp8x8.b8x8 = Ktmp8x16.xy[qkratio]; + + for (int n = 0; n < 2; n++) { + scalar_t* qptr = reinterpret_cast<scalar_t*>( + &Qlocal[qkhe_depth][qkratio].xy[n]); + + Qtmp8x8.b16x4[n * 2] = + vllm::fp8::scaled_vec_conversion<uint16_t, float2>( + make_float2(to_float<scalar_t>(qptr[0]), + to_float<scalar_t>(qptr[1])), + q_scale); + Qtmp8x8.b16x4[n * 2 + 1] = + vllm::fp8::scaled_vec_conversion<uint16_t, float2>( + make_float2(to_float<scalar_t>(qptr[2]), + to_float<scalar_t>(qptr[3])), + q_scale); + } + + d_out[token_depth] = + gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>( + Ktmp8x8.i64, Qtmp8x8.i64, d_out[token_depth]); + #else + UNREACHABLE_CODE + #endif } } } @@ -629,17 +736,36 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // disable rtz conversion due to its impact on accuracy. 
constexpr bool LOGITS_RTZ_CONVERSION = false; + #if defined(__HIP__FP8MFMA__) + int rowid_8x8 = rowid / 2; + int offset = rowid % 2; + #endif + // write logits to shared mem for (int token_depth = 0; token_depth < TLOOP; token_depth++) { d_out[token_depth] *= inv_sum_scale; - if constexpr (LOGITS_RTZ_CONVERSION) { - // use rtz conversion for better performance, with negligible impact on - // accuracy - shared_logits[warpid][token_depth][lane16id][rowid] = - from_floatx4_rtz<scalar_t>(d_out[token_depth]); + if constexpr (MFMA_TYPE != MFMAType::Fp8) { + if constexpr (LOGITS_RTZ_CONVERSION) { + // use rtz conversion for better performance, with negligible impact on + // accuracy + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx4_rtz<scalar_t>(d_out[token_depth]); + } else { + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx4<scalar_t>(d_out[token_depth]); + } } else { - shared_logits[warpid][token_depth][lane16id][rowid] = - from_floatx4<scalar_t>(d_out[token_depth]); + #if defined(__HIP__FP8MFMA__) + // cast _B16x4* to _B8x8* + _T8x8& logits_8x8 = *reinterpret_cast<_T8x8*>( + &shared_logits[warpid][token_depth][lane16id][rowid_8x8]); + logits_8x8.b16x4[offset * 2] = __builtin_amdgcn_cvt_pk_fp8_f32( + d_out[token_depth][0], d_out[token_depth][1], 0, false); + logits_8x8.b16x4[offset * 2 + 1] = __builtin_amdgcn_cvt_pk_fp8_f32( + d_out[token_depth][2], d_out[token_depth][3], 0, false); + #else + UNREACHABLE_CODE + #endif } } @@ -692,19 +818,42 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( _B8x16 Vtmp8x16 = *reinterpret_cast<_B8x16*>(&Vtmp); for (int j = 0; j < ELEMS16_ELEMS8_RATIO; j++) { _B8x8 Vtmp8x8 = Vtmp8x16.xy[j]; - _B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8); - for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) { - const int offset = - rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + - j * ELEMS8_ELEMS4_RATIO + i; - const int offset1 = offset % ROWS_PER_WARP; - const int offset2 = offset / ROWS_PER_WARP; - // output format is 16 qheads across 16 lanes, 16 head elems - // spread across 4 rows - tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>( - Vlocaltmp.xy[i], - shared_logits[vtoken_depth][offset2][lane16id][offset1], - tmp_out); + if constexpr (MFMA_TYPE == MFMAType::F16) { + _B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8); + for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) { + const int offset = + rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>( + Vlocaltmp.xy[i], + shared_logits[vtoken_depth][offset2][lane16id][offset1], + tmp_out); + } + } else { + #if defined(__HIP__FP8MFMA__) + for (int i = 0; i < ELEMS8_ELEMS4_RATIO / 2; i++) { + const int offset = + rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = (offset % ROWS_PER_WARP) / 2; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>( + reinterpret_cast<_T8x8*>(&Vtmp8x8)->i64, + reinterpret_cast<_T8x8*>( + &shared_logits[vtoken_depth][offset2][lane16id] + [offset1]) + ->i64, + tmp_out); + } + #else + UNREACHABLE_CODE + #endif } } } @@ -1570,7 +1719,8 @@ __device__ 
__forceinline__ _B16x8 from_floatx8(const floatx8& inp) { // clang-format off template <typename scalar_t, typename cache_t, vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE, - int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO> + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, + MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -2337,7 +2487,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { // clang-format off template <typename scalar_t, typename cache_t, vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE, - int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO> + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, + MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -2969,7 +3120,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( template <typename scalar_t, typename cache_t, vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE, int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, - int GQA_RATIO> + int GQA_RATIO, MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -3041,7 +3192,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( #define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \ HEAD_SIZE, NTHR, ALIBI_ENABLED, \ - GQA_RATIO> \ + GQA_RATIO, MFMA_TYPE> \ <<<grid, block, 0, stream>>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ @@ -3069,7 +3220,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE, int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD, - bool ALIBI_ENABLED> + bool ALIBI_ENABLED, MFMAType MFMA_TYPE> void paged_attention_custom_launcher( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, @@ -3225,7 +3376,7 @@ void paged_attention_custom_launcher( template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE, int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD, - bool ALIBI_ENABLED> + bool ALIBI_ENABLED, MFMAType MFMA_TYPE> void paged_attention_custom_launcher_navi( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, @@ -3397,74 +3548,77 @@ void paged_attention_custom_launcher_navi( } #define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \ - PSIZE, ALIBI_ENABLED) \ + PSIZE, ALIBI_ENABLED, MFMA_TYPE) \ if (!is_navi) { \ paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ - OUTT, PSIZE, ALIBI_ENABLED>( \ + OUTT, PSIZE, ALIBI_ENABLED, MFMA_TYPE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ } else { \ - paged_attention_custom_launcher_navi< \ - T, KVT, KV_DTYPE, 
BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ + paged_attention_custom_launcher_navi<T, KVT, KV_DTYPE, BLK_SIZE, \ + HEAD_SIZE, OUTT, PSIZE, \ + ALIBI_ENABLED, MFMA_TYPE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ max_seq_len, alibi_slopes, k_scale, v_scale); \ } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ - OUTT, PSIZE) \ + OUTT, PSIZE, MFMA_TYPE) \ if (alibi_slopes) { \ CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \ - true); \ + true, MFMA_TYPE); \ } else { \ CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \ - false); \ + false, MFMA_TYPE); \ } #if defined(__HIPCC__) && defined(__gfx90a__) - #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \ + #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ + MFMA_TYPE) \ if (fp8_out_scale) { \ TORCH_CHECK(false, "fp8 out scale unsupported for gfx90a"); \ } else { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \ - 256); \ + 256, MFMA_TYPE); \ } #else - #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \ + #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ + MFMA_TYPE) \ if (fp8_out_scale) { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ - uint8_t, 256); \ + uint8_t, 256, MFMA_TYPE); \ } else { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \ - 256); \ + 256, MFMA_TYPE); \ } #endif -#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE) \ - switch (block_size) { \ - case 16: \ - CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE); \ - break; \ - case 32: \ - CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE, MFMA_TYPE) \ + switch (block_size) { \ + case 16: \ + CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE, MFMA_TYPE); \ + break; \ + case 32: \ + CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE, MFMA_TYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } -#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE) \ - switch (head_size) { \ - case 64: \ - CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64); \ - break; \ - case 128: \ - CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported head size: ", head_size); \ - break; \ +#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE, MFMA_TYPE) \ + switch (head_size) { \ + case 64: \ + CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64, MFMA_TYPE); \ + break; \ + case 128: \ + CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128, MFMA_TYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported head size: ", head_size); \ + break; \ } bool is_navi_gpu() { @@ -3503,28 +3657,43 @@ void paged_attention( const std::optional<torch::Tensor>& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, - const std::optional<torch::Tensor>& fp8_out_scale) { + const std::optional<torch::Tensor>& fp8_out_scale, + const std::string& mfma_type) { // clang-format on bool is_navi = is_navi_gpu(); - const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { - 
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, _Float16, - vllm::Fp8KVCacheDataType::kAuto); + CALL_CUSTOM_LAUNCHER_BLK_HEAD( + _Float16, _Float16, vllm::Fp8KVCacheDataType::kAuto, MFMAType::F16); } else if (query.dtype() == at::ScalarType::BFloat16) { CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, __hip_bfloat16, - vllm::Fp8KVCacheDataType::kAuto); + vllm::Fp8KVCacheDataType::kAuto, + MFMAType::F16); } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") { if (query.dtype() == at::ScalarType::Half) { - CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, - vllm::Fp8KVCacheDataType::kFp8E4M3); + if (mfma_type == "fp8") { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::Fp8); + } else { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::F16); + } } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, - vllm::Fp8KVCacheDataType::kFp8E4M3); + if (mfma_type == "fp8") { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::Fp8); + } else { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::F16); + } } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 34dcc9401aae8..b6ee2656746c1 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -19,4 +19,5 @@ void paged_attention( const std::optional<torch::Tensor>& query_start_loc, int64_t block_size, int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale); + torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale, + const std::string& mfma_type); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 66bdc448da3ca..c0c4daef64f05 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -48,7 +48,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor? alibi_slopes," " str kv_cache_dtype," " Tensor k_scale, Tensor v_scale," - " Tensor? fp8_out_scale) -> ()"); + " Tensor? 
fp8_out_scale," + " str mfma_type) -> ()"); rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention); } diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 93b4f87ed260c..f3e5fe73d4e26 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -117,13 +117,14 @@ def paged_attention_rocm( k_scale: torch.Tensor, v_scale: torch.Tensor, fp8_out_scale: Optional[torch.Tensor] = None, + mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16", ) -> None: torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, query_start_loc, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, k_scale, - v_scale, fp8_out_scale) + v_scale, fp8_out_scale, mfma_type) def mla_decode_kvcache_cpu( diff --git a/vllm/envs.py b/vllm/envs.py index bb10c7cc2ac27..7ea18064c9446 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -167,6 +167,7 @@ if TYPE_CHECKING: VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None @@ -1219,6 +1220,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), + # If set, use the fp8 mfma in rocm paged attention. + "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": + lambda: bool(int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))), + # Whether to use pytorch symmetric memory for allreduce "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), @@ -1340,6 +1345,7 @@ def compute_hash() -> str: "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", + "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", ] for key in environment_variables_to_hash: # if this goes out of sync with environment_variables, From 740f0647b1d1cff7d2def13179b92e0452c9cb46 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:43:40 +0100 Subject: [PATCH 282/584] Reinstate existing torch script (#24729) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- use_existing_torch.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/use_existing_torch.py b/use_existing_torch.py index b5aafdde16c28..76480f3e58fee 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,5 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -print("vLLM is now using 'uv' to disable build isolation for 'torch'.") -print("Please instead install vLLM with 'uv pip install -e .' 
(must use 'uv')") +import glob + +requires_files = glob.glob('requirements/*.txt') +requires_files += ["pyproject.toml"] +for file in requires_files: + print(f">>> cleaning {file}") + with open(file) as f: + lines = f.readlines() + if "torch" in "".join(lines).lower(): + print("removed:") + with open(file, 'w') as f: + for line in lines: + if 'torch' not in line.lower(): + f.write(line) + else: + print(line.strip()) + print(f"<<< done cleaning {file}") + print() \ No newline at end of file From b834b4cbf1d5094affdf231df2be86920610d83e Mon Sep 17 00:00:00 2001 From: Rafael Marcelino Koike <koiker@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:45:49 -0400 Subject: [PATCH 283/584] =?UTF-8?q?[USAGE]=20Improve=20error=20handling=20?= =?UTF-8?q?for=20weight=20initialization=20in=20Unquantized=E2=80=A6=20(#2?= =?UTF-8?q?0321)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rafael Marcelino Koike <rafael.koike@oracle.com> Signed-off-by: Rafael Koike <koike.rafael@gmail.com> --- vllm/attention/layer.py | 25 +++++++++++++++++++++---- vllm/model_executor/layers/linear.py | 26 ++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 44cb2c7c6b642..22dc6dcbc8d62 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.platforms import _Backend, current_platform -from vllm.utils import direct_register_custom_op +from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None @@ -225,9 +225,26 @@ class Attention(nn.Module, AttentionLayerBase): ).parallel_config.pipeline_parallel_size) ] - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + try: + self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, + dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, + dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, + dtype=torch.float32) + except torch.cuda.OutOfMemoryError as e: + logger.error( + "Failed to initialize attention q/k/v range constants: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to initialize q/k/v range constants. 
" + "This may be caused by insufficient memory to allocate " + "kv cache.") from e def forward( self, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 773dfeae25d93..cd05136520977 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -29,6 +29,7 @@ from vllm.model_executor.parameter import (BasevLLMParameter, # yapf: enable from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.utils import GiB_bytes logger = init_logger(__name__) @@ -190,10 +191,27 @@ class UnquantizedLinearMethod(LinearMethodBase): output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + # This method creates unquantized linear weights. + # The weights are not quantized, and they are not sharded. + # The amount of memory allocated for the weights is + # sum(output_partition_sizes) * input_size_per_partition. + try: + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + except torch.cuda.OutOfMemoryError as e: + logger.error("Failed to create unquantized linear weights: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to create unquantized linear weights. " + "This may be caused by insufficient memory to allocate " + "the weight.") from e set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) From c4afdb69cc22a23d23886528147b956796c1000c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 15 Sep 2025 18:43:16 +0100 Subject: [PATCH 284/584] Move `MultiModalConfig` from `config/__init__.py` to `config/multimodal.py` (#24659) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../entrypoints/openai/test_lora_resolvers.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 2 +- tests/multimodal/test_cache.py | 21 +- tests/test_config.py | 4 +- .../test_processor_multi_modal_uuids.py | 2 +- vllm/config/__init__.py | 277 ++++-------------- vllm/config/multimodal.py | 120 ++++++++ vllm/config/utils.py | 18 ++ vllm/engine/arg_utils.py | 14 +- vllm/entrypoints/chat_utils.py | 9 +- vllm/model_executor/models/transformers.py | 7 +- 11 files changed, 229 insertions(+), 247 deletions(-) create mode 100644 vllm/config/multimodal.py diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 818efd825640c..2bf29ecf087f3 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -9,7 +9,7 @@ from unittest.mock import MagicMock import pytest -from vllm.config import MultiModalConfig +from vllm.config.multimodal import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse from 
vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index d219a1f311f15..502704c9bbdff 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -12,7 +12,7 @@ from unittest.mock import MagicMock import pytest import pytest_asyncio -from vllm.config import MultiModalConfig +from vllm.config.multimodal import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 3c61ee26e092e..3c737acfbfe28 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -7,6 +7,7 @@ import pytest import torch from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import (MultiModalCache, MultiModalProcessorCacheItem, MultiModalProcessorCacheItemMetadata, @@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField) from vllm.multimodal.processing import PromptInsertion -from vllm.multimodal.registry import MultiModalRegistry def _dummy_elem( @@ -96,7 +96,9 @@ def _create_vllm_config( enable_ipc: bool, ): return VllmConfig( - model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb), + model_config=ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb), parallel_config=ParallelConfig( data_parallel_size=1 if enable_ipc else 2), ) @@ -113,15 +115,16 @@ def _compare_caches( n_iter: int = 100, seed: int = 0, ): - mm_registry = MultiModalRegistry() - cache_0_p0 = processor_cache_from_config(config_0, mm_registry) - cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry) - cache_1_p0 = processor_cache_from_config(config_1, mm_registry) - cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry) + cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY) + cache_0_p1 = engine_receiver_cache_from_config(config_0, + MULTIMODAL_REGISTRY) + cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY) + cache_1_p1 = engine_receiver_cache_from_config(config_1, + MULTIMODAL_REGISTRY) cache_size_gb = max( - config_0.model_config.mm_processor_cache_gb, - config_1.model_config.mm_processor_cache_gb, + config_0.model_config.multimodal_config.mm_processor_cache_gb, + config_1.model_config.multimodal_config.mm_processor_cache_gb, ) item_size_gb = int(cache_size_gb / item_capacity) diff --git a/tests/test_config.py b/tests/test_config.py index 373fbd267539a..8db04de5469ad 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field import pytest from vllm.compilation.backends import VllmBackend -from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field, - update_config) +from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config from vllm.config.load import LoadConfig +from vllm.config.utils import get_field from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py 
b/tests/v1/engine/test_processor_multi_modal_uuids.py index 955c74d262a09..bdd41eece2317 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -31,7 +31,7 @@ def _mk_processor(monkeypatch, raising=True) monkeypatch.setattr(ModelConfig, "__post_init__", - lambda self: None, + lambda self, *args: None, raising=True) monkeypatch.setattr(UnspecifiedPlatform, "is_async_output_supported", diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 85e58c290b792..599eeb63bf804 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -11,13 +11,12 @@ import json import os import textwrap import warnings -from collections.abc import Mapping from contextlib import contextmanager -from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from dataclasses import InitVar, field, fields, is_dataclass, replace from functools import cached_property, lru_cache from importlib.util import find_spec -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, - Protocol, TypeVar, Union, cast, get_args) +from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol, + TypeVar, Union, cast, get_args) import regex as re import torch @@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]: return out -def get_field(cls: ConfigType, name: str) -> Field: - """Get the default factory field of a dataclass by name. Used for getting - default factory fields in `EngineArgs`.""" - if not is_dataclass(cls): - raise TypeError("The given class is not a dataclass.") - cls_fields = {f.name: f for f in fields(cls)} - if name not in cls_fields: - raise ValueError(f"Field '{name}' not found in {cls.__name__}.") - named_field: Field = cls_fields[name] - if (default_factory := named_field.default_factory) is not MISSING: - return field(default_factory=default_factory) - if (default := named_field.default) is not MISSING: - return field(default=default) - raise ValueError( - f"{cls.__name__}.{name} must have a default value or default factory.") - - def is_init_field(cls: ConfigType, name: str) -> bool: return next(f for f in fields(cls) if f.name == name).init TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -MMEncoderTPMode = Literal["weights", "data"] -MMCacheType = Literal["shm", "lru"] class LogprobsMode(enum.Enum): @@ -407,20 +389,6 @@ class ModelConfig: that this name(s) will also be used in `model_name` tag content of prometheus metrics, if multiple names provided, metrics tag will take the first one.""" - limit_mm_per_prompt: dict[str, int] = field(default_factory=dict) - """Maximum number of data items per modality per prompt. Only applicable - for multimodal models.""" - interleave_mm_strings: bool = False - """Enable fully interleaved support for multimodal prompts, while using - --chat-template-content-format=string. 
Defaults to False.""" - skip_mm_profiling: bool = False - """When enabled, skips multimodal memory profiling and only profiles with - language backbone model during engine initialization. - """ - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ use_async_output_proc: bool = True """Whether to use async output processor.""" config_format: Union[str, ConfigFormat] = "auto" @@ -436,41 +404,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.""" - mm_processor_kwargs: Optional[dict[str, Any]] = None - """Arguments to be forwarded to the model's processor for multi-modal data, - e.g., image processor. Overrides for the multi-modal processor obtained - from `AutoProcessor.from_pretrained`. The available overrides depend on the - model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. - """ - mm_processor_cache_gb: float = 4 - """The size (in GiB) of the multi-modal processor cache, which is used to - avoid re-processing past multi-modal inputs. - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended).""" - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, - use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" - mm_shm_cache_max_object_size_mb: int = 128 - """Size limit (in MiB) for each object stored in the multi-modal processor - shared memory cache. Only effective when `mm_processor_cache_type` is - `"shm"`.""" - mm_encoder_tp_mode: MMEncoderTPMode = "weights" - """Indicates how to optimize multi-modal encoder inference using - tensor parallelism (TP). - - - `"weights"`: Within the same vLLM engine, split the weights of - each layer across TP ranks. (default TP behavior) - - `"data"`: Within the same vLLM engine, split the batched input data - across TP ranks to process the data in parallel, while hosting - the full weights on each TP rank. - This batch-level DP is not to be confused with API request-level - DP (which is controlled by `--data-parallel-size`). 
- This is only supported on a per-model basis and falls back to - `"weights"` if the encoder does not support DP.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" @@ -513,6 +446,18 @@ class ModelConfig: io_processor_plugin: Optional[str] = None """IOProcessor plugin name to load at model startup""" + # Multimodal config and init vars + multimodal_config: Optional[MultiModalConfig] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None + mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None + mm_processor_cache_gb: InitVar[Optional[float]] = None + mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None + mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None + mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None + interleave_mm_strings: InitVar[Optional[bool]] = None + skip_mm_profiling: InitVar[Optional[bool]] = None + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -546,7 +491,18 @@ class ModelConfig: assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() - def __post_init__(self) -> None: + def __post_init__( + self, + # Multimodal config init vars + limit_mm_per_prompt: Optional[dict[str, int]], + media_io_kwargs: Optional[dict[str, dict[str, Any]]], + mm_processor_kwargs: Optional[dict[str, Any]], + mm_processor_cache_gb: Optional[float], + mm_processor_cache_type: Optional[MMCacheType], + mm_shm_cache_max_object_size_mb: Optional[int], + mm_encoder_tp_mode: Optional[MMEncoderTPMode], + interleave_mm_strings: Optional[bool], + skip_mm_profiling: Optional[bool]) -> None: # Set the default seed to 0 in V1. # NOTE(woosuk): In V0, we set the default seed to None because the # driver worker shares the same process as the user process, and thus @@ -777,7 +733,33 @@ class ModelConfig: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) - self.multimodal_config = self._init_multimodal_config() + # Init multimodal config if needed + if self._model_info.supports_multimodal: + if (mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. 
" + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len @@ -875,30 +857,6 @@ class ModelConfig: ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) self.tokenizer = object_storage_tokenizer.dir - def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: - if self._model_info.supports_multimodal: - if (self.mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. " - "Falling back to `--mm-encoder-tp-mode weights`.") - self.mm_encoder_tp_mode = "weights" - - return MultiModalConfig( - limit_per_prompt=self.limit_mm_per_prompt, - media_io_kwargs=self.media_io_kwargs, - mm_processor_kwargs=self.mm_processor_kwargs, - mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processor_cache_type=self.mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=self. - mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=self.mm_encoder_tp_mode, - interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling, - ) - - return None - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -2417,129 +2375,6 @@ class SpeculativeConfig: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" -@config -@dataclass -class MultiModalConfig: - """Controls the behavior of multimodal models.""" - - limit_per_prompt: dict[str, int] = \ - cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt")) - """ - The maximum number of input items allowed per prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. - - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}` - """ - - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - - mm_processor_kwargs: Optional[dict[str, object]] = None - """ - Overrides for the multi-modal processor obtained from - `transformers.AutoProcessor.from_pretrained`. - - The available overrides depend on the model that is being run. - - For example, for Phi-3-Vision: - `{"num_crops": 4}`. - """ - - mm_processor_cache_gb: float = 4 - """ - The size (in GiB) of the multi-modal processor cache, which is used to - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended). - """ - - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. 
If `shm`, - use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" - - mm_shm_cache_max_object_size_mb: int = 128 - """Size limit (in MiB) for each object stored in the multi-modal processor - shared memory cache. Only effective when `mm_processor_cache_type` is - `"shm"`.""" - - mm_encoder_tp_mode: MMEncoderTPMode = "weights" - """ - Indicates how to optimize multi-modal encoder inference using - tensor parallelism (TP). - - - `"weights"`: Within the same vLLM engine, split the weights of - each layer across TP ranks. (default TP behavior) - - `"data"`: Within the same vLLM engine, split the batched input data - across TP ranks to process the data in parallel, while hosting - the full weights on each TP rank. - This batch-level DP is not to be confused with API request-level - DP (which is controlled by `--data-parallel-size`). - This is only supported on a per-model basis and falls back to - `"weights"` if the encoder does not support DP. - """ - - interleave_mm_strings: bool = False - """ - Enable fully interleaved support for multimodal prompts. - """ - - skip_mm_profiling: bool = False - """ - When enabled, skips multimodal memory profiling and only profiles with - language backbone model during engine initialization. - - This reduces engine startup time but shifts the responsibility to users for - estimating the peak memory usage of the activation of multimodal encoder and - embedding cache. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def get_limit_per_prompt(self, modality: str) -> int: - """ - Get the maximum number of input items allowed per prompt - for the given modality. - """ - return self.limit_per_prompt.get( - modality, - 999 if envs.VLLM_USE_V1 else 1, - ) - - def merge_mm_processor_kwargs( - self, - inference_kwargs: Mapping[str, object], - ) -> dict[str, object]: - """ - Get the keyword arguments to pass to the multi-modal processor - according to the extra arguments passed during inference. 
- """ - kwargs = self.mm_processor_kwargs or {} - return kwargs | dict(inference_kwargs) - - @config @dataclass class PoolerConfig: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py new file mode 100644 index 0000000000000..1b93b520f33f9 --- /dev/null +++ b/vllm/config/multimodal.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections.abc import Mapping +from dataclasses import field +from typing import Any, Literal, Optional + +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config + +MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] + + +@config +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: dict[str, int] = field(default_factory=dict) + """The maximum number of input items allowed per prompt for each modality. + Defaults to 1 (V0) or 999 (V1) for each modality. + + For example, to allow up to 16 images and 2 videos per prompt: + `{"image": 16, "video": 2}`""" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + mm_processor_kwargs: Optional[dict[str, object]] = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. + + For example, for Phi-3-Vision: + `{"num_crops": 4}`.""" + mm_processor_cache_gb: float = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using tensor + parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior)\n + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). 
+ This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" + interleave_mm_strings: bool = False + """Enable fully interleaved support for multimodal prompts, while using + --chat-template-content-format=string.""" + skip_mm_profiling: bool = False + """When enabled, skips multimodal memory profiling and only profiles with + language backbone model during engine initialization. + + This reduces engine startup time but shifts the responsibility to users for + estimating the peak memory usage of the activation of multimodal encoder and + embedding cache.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def get_limit_per_prompt(self, modality: str) -> int: + """ + Get the maximum number of input items allowed per prompt + for the given modality. + """ + return self.limit_per_prompt.get( + modality, + 999 if envs.VLLM_USE_V1 else 1, + ) + + def merge_mm_processor_kwargs( + self, + inference_kwargs: Mapping[str, object], + ) -> dict[str, object]: + """ + Get the keyword arguments to pass to the multi-modal processor + according to the extra arguments passed during inference. + """ + kwargs = self.mm_processor_kwargs or {} + return kwargs | dict(inference_kwargs)
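Illustration (not part of the patch): what the two helper methods above give callers, with illustrative values:

from vllm.config.multimodal import MultiModalConfig

mm_config = MultiModalConfig(
    limit_per_prompt={"image": 16, "video": 2},
    mm_processor_kwargs={"num_crops": 4},
)
mm_config.get_limit_per_prompt("image")  # -> 16
mm_config.get_limit_per_prompt("audio")  # -> 999 on V1, 1 on V0 (unset modality)
# Inference-time kwargs override config-time ones:
mm_config.merge_mm_processor_kwargs({"num_crops": 8})  # -> {"num_crops": 8}

diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 98fbeb1fa86aa..db8c05ef8be4a 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import MISSING, Field, field, fields, is_dataclass from typing import TYPE_CHECKING, TypeVar if TYPE_CHECKING: @@ -27,3 +28,20 @@ def config(cls: ConfigT) -> ConfigT: script, which is invoked during the pre-commit checks. """ return cls + + +def get_field(cls: ConfigType, name: str) -> Field: + """Get the default factory field of a dataclass by name. Used for getting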
Used for getting + default factory fields in `EngineArgs`.""" + if not is_dataclass(cls): + raise TypeError("The given class is not a dataclass.") + cls_fields = {f.name: f for f in fields(cls)} + if name not in cls_fields: + raise ValueError(f"Field '{name}' not found in {cls.__name__}.") + named_field: Field = cls_fields[name] + if (default_factory := named_field.default_factory) is not MISSING: + return field(default_factory=default_factory) + if (default := named_field.default) is not MISSING: + return field(default=default) + raise ValueError( + f"{cls.__name__}.{name} must have a default value or default factory.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab43c0edc98d7..595d318fbaafe 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,12 +27,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode, - ModelConfig, ModelDType, ModelImpl, MultiModalConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, - SchedulerPolicy, SpeculativeConfig, TaskOption, - TokenizerMode, VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + RunnerOption, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs) +from vllm.config.multimodal import MMCacheType, MultiModalConfig +from vllm.config.utils import get_field from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aa231de93c0c3..00ef39f134653 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -800,9 +800,10 @@ class MultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector = MediaConnector( - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, ) @@ -883,8 +884,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector = MediaConnector( - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, ) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a386f47e1929f..4f51441e28efa 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -229,7 +229,8 @@ class MultiModalProcessingInfo(BaseProcessingInfo): def get_max_image_tokens(self) -> int: width, height = self.get_max_image_size() processor = self.get_hf_processor() - mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {} + multimodal_config = 
self.ctx.model_config.multimodal_config + mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {} mm_tokens = processor._get_num_multimodal_tokens( image_sizes=([height, width], ), **mm_processor_kwargs) image_tokens = mm_tokens["num_image_tokens"][0] @@ -380,8 +381,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 mm_positions = torch.where(mm_token_type_ids == 1)[1] images = mm_items.get_items("image", ImageProcessorItems) - mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs - or {}) + multimodal_config = self.info.ctx.model_config.multimodal_config + mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {} image_sizes = [] for item_idx in range(len(images)): image_size = images.get_image_size(item_idx) From a0b26701c91a386e38105816057c1b3e6c1d080f Mon Sep 17 00:00:00 2001 From: Kyle Sayers <kylesayrs@gmail.com> Date: Mon, 15 Sep 2025 19:59:31 +0100 Subject: [PATCH 285/584] [Transform] Deterministic Hadacore Transforms (#24106) Signed-off-by: Kyle Sayers <kylesayrs@gmail.com> --- CMakeLists.txt | 11 + csrc/ops.h | 2 + .../hadacore/hadamard_transform_cuda.cu | 817 ++++++++++++++++++ csrc/torch_bindings.cpp | 3 + tests/kernels/quantization/test_hadacore.py | 25 + vllm/_custom_ops.py | 24 + .../compressed_tensors/compressed_tensors.py | 2 +- .../compressed_tensors/transform/linear.py | 25 +- .../compressed_tensors/transform/module.py | 76 +- .../transform/schemes/linear_qutlass_nvfp4.py | 37 +- 10 files changed, 979 insertions(+), 43 deletions(-) create mode 100644 csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu create mode 100644 tests/kernels/quantization/test_hadacore.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 8df349ce14fda..009c224dc7735 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -783,6 +783,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # Hadacore kernels + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + if(HADACORE_ARCHS) + set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${HADACORE_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + message(STATUS "Building hadacore") + endif() + # if CUDA endif endif() diff --git a/csrc/ops.h b/csrc/ops.h index c65bf431640d5..fd9c55b948959 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -347,6 +347,8 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle( int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); +torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace); + #ifdef USE_ROCM fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional<int64_t> qr_max_size = std::nullopt); diff --git a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu new file mode 100644 index 0000000000000..5369d409f9b21 --- /dev/null +++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu @@ -0,0 +1,817 @@ +// clang-format off +// Adapted from: https://github.com/meta-pytorch/applied-ai/blob/main/kernels/cuda/inference/hadamard_transform/hadamard_transform_cuda.cu + +/*********** +Copyright 2024 Meta + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +***********/ + +#include <torch/all.h> +#include <stdint.h> +#include <cuda_runtime.h> +#include <mma.h> +#include <cuda/annotated_ptr> +#include <c10/cuda/CUDAException.h> + +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> + +#include "core/registration.h" +#include "dispatch_utils.h" + +namespace hadacore { + +#ifndef __CUDACC__ +#define __launch_bounds__(x,y) +#endif + +#define MAX_WARPS_PER_SM 48 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +using b16 = uint16_t; +using b32 = uint32_t; + +constexpr int launch_configs_big[7][3] = { + // default + {2, 1, 24}, + {2, 2, 16}, + {2, 4, 8}, + {2, 8, 4}, + {2, 16, 3}, + {4, 16, 2}, + {8, 16, 1} + // // extra coalescing + // {2, 1, 24}, + // {2, 2, 16}, + // {2, 4, 8}, + // {2, 8, 4}, + // {4, 8, 3}, + // {8, 8, 2}, + // {16, 8, 1} + // // less coalescing + // {2, 1, 24}, + // {2, 2, 16}, + // {2, 4, 8}, + // {2, 8, 4}, + // {1, 32, 1}, + // {2, 32, 1}, + // {4, 32, 1} +}; + +// a 4x2, b 2x2, c 2x2 +template <torch::ScalarType dtype> +__device__ __forceinline__ void mma_m16_n8_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32& c0, b32& c1){ + static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16); + // d, a, b, c + b32 zero = 0; + if constexpr(dtype == torch::ScalarType::Half) { + asm ( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n\t" + : "=r"(c0), "=r"(c1) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero) + ); + } else { + b32 temp0, temp1, temp2, temp3; + asm ( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n\t" + : "=r"(temp0), "=r"(temp1), "=r"(temp2), "=r"(temp3) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero), "r"(zero), "r"(zero) + ); + asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c0) : "r"(temp1), "r"(temp0)); + asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c1) : "r"(temp3), "r"(temp2)); + } +} + +// a 4x2, b 4x2, c 4x2 +template <torch::ScalarType dtype> +__device__ __forceinline__ void mma_m16_n16_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32 b2, b32 b3, b32& c0, b32& c1, 
b32& c2, b32& c3){ + mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b0, b1, c0, c1); + mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b2, b3, c2, c3); +} + +__device__ __forceinline__ void matrix_transpose_m8_n8_b16_inplace(b32& a0) { + asm ( + "movmatrix.sync.aligned.m8n8.trans.b16 " + "%0, %1;\n\t" + : "=r"(a0) : "r"(a0) + ); +} + +#define p_p(i) ((val_1p[i] & 0x0000FFFF) | val_1p[i] << 16) +#define p_n(i) ((val_1p[i] & 0x0000FFFF) | val_1n[i] << 16) +#define n_p(i) ((val_1n[i] & 0x0000FFFF) | val_1p[i] << 16) +#define n_n(i) ((val_1n[i] & 0x0000FFFF) | val_1n[i] << 16) + +template<int64_t num_chunks, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool enable_mask, torch::ScalarType dtype> +__global__ void __launch_bounds__(32 * warps_per_block, blocks_per_sm) +// a is column major, b is row major +hadamard_transform_kernel(b16* a, b16* out, int total_num_chunks) { + static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently"); + + b32 b_frag_all[num_chunks][4]; // for all chunks, holds matrix fragment (which takes 4 regs of b16x2 * 32 threads) + + int64_t blockid = blockIdx.x * warps_per_block + threadIdx.x / 32; + int64_t threadid = threadIdx.x % 32; + extern __shared__ b32 bfrag_arr[]; // num_chunks * warps_per_block * 128 + int64_t real_num_chunks = ((blockid + 1) * num_chunks) > total_num_chunks ? (total_num_chunks - (blockid * num_chunks)) : num_chunks; + int64_t diff_num_chunks = real_num_chunks - num_chunks; + + b32* a_start_ptr = (b32*) (a + blockid * num_chunks * 256); // offset a to where this warp starts + b32* out_start_ptr = (b32*) (out + blockid * num_chunks * 256); + b32* a_ptr = a_start_ptr + threadid * 4; + b32* b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128 + threadid * 4; + + #if (__CUDA_ARCH__ < 900) // SM80, SM89 + uint64_t cache_policy; + asm volatile( + "createpolicy.fractional.L2::evict_first.b64 %0, 1.0;\n" + : "=l"(cache_policy) + ); + #endif + + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + size_t shared_ptr = __cvta_generic_to_shared(b_frag_ptr); + #if (__CUDA_ARCH__ >= 900) // SM90 + asm volatile( + "cp.async.cg.shared.global [%0], [%1], 16;\n" + "cp.async.commit_group;\n" + :: "l"(shared_ptr), "l"(a_ptr) + ); + #else // SM80, SM89 + asm volatile( + "cp.async.cg.shared.global.L2::cache_hint.L2::256B [%0], [%1], 16, %2;\n" + "cp.async.commit_group;\n" + :: "l"(shared_ptr), "l"(a_ptr), "l"(cache_policy) + ); + #endif + + a_ptr += 128; + b_frag_ptr += 128; + } + + // generate hadamard 16x16 (up to 2 of them) + constexpr b16 fp16_1p[4] = {0b0011100110101000, 0b0011100000000000, 0b0011010110101000, 0b0011010000000000}; + constexpr b16 fp16_1n[4] = {0b1011100110101000, 0b1011100000000000, 0b1011010110101000, 0b1011010000000000}; + constexpr b16 bf16_1p[4] = {0b0011111100110101, 0b0011111100000000, 0b0011111010110101, 0b0011111010000000}; + constexpr b16 bf16_1n[4] = {0b1011111100110101, 0b1011111100000000, 0b1011111010110101, 0b1011111010000000}; + + #define val_type_1p(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1p[i]) : (bf16_1p[i])) + #define val_type_1n(i) (((dtype) == torch::ScalarType::Half) ? 
(fp16_1n[i]) : (bf16_1n[i])) + constexpr b16 val_1p[4] = {val_type_1p(0), val_type_1p(1), val_type_1p(2), val_type_1p(3)}; + constexpr b16 val_1n[4] = {val_type_1n(0), val_type_1n(1), val_type_1n(2), val_type_1n(3)}; + + constexpr b32 p_p[4] = {p_p(0), p_p(1), p_p(2), p_p(3)}; + constexpr b32 p_n[4] = {p_n(0), p_n(1), p_n(2), p_n(3)}; + constexpr b32 n_p[4] = {n_p(0), n_p(1), n_p(2), n_p(3)}; + constexpr b32 n_n[4] = {n_n(0), n_n(1), n_n(2), n_n(3)}; + const b32 had_16_p1[4][4] = { + { + 0b10001000010001000010001000010001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10001000010001000010001000010001 + }, + { + 0b11001100100010000011001100100010, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11001100100010000011001100100010 + }, + { + 0b11111111101010101100110010011001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11111111101010101100110010011001 + }, + { + 0b11111111101010101100110010011001, + 0b11111111101010101100110010011001, + 0b11111111101010101100110010011001, + 0b00000000010101010011001101100110 + } + }; + const b32 had_16_p2[4][4] = { + { + 0b10000000010000000010000000010000, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10000000010000000010000000010000 + }, + { + 0b11000000100001000011000000100001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11000000100001000011000000100001 + }, + { + 0b11110000101001011100001110010110, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11110000101001011100001110010110 + }, + { + 0b11110000101001011100001110010110, + 0b11110000101001011100001110010110, + 0b11110000101001011100001110010110, + 0b00001111010110100011110001101001 + } + }; + const b32 had_16_mask[3][4] = { + { + 0b10001000010001000010001000010001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10001000010001000010001000010001 + }, + { + 0b11001100110011000011001100110011, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11001100110011000011001100110011 + }, + { + 0b11111111111111111111111111111111, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11111111111111111111111111111111 + } + }; + b32 had_frag[8]; + #pragma unroll + for (int64_t i = 0; i < 2; i++) { + int64_t c_log_h = (i == 0) ? MIN(4, log_had_size) : log_had_size % 4; + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + if (c_log_h < 4) { + bool mask = had_16_mask[c_log_h - 1][j] & (1 << (31 - threadid)); + if (!mask) { + had_frag[i * 4 + j] = 0; + continue; + } + } + bool pred1 = had_16_p1[c_log_h - 1][j] & (1 << (31 - threadid)); + bool pred2 = had_16_p2[c_log_h - 1][j] & (1 << (31 - threadid)); + b32 val = pred1 ? (pred2 ? p_p[c_log_h - 1] : p_n[c_log_h - 1]) : (pred2 ? 
n_p[c_log_h - 1] : n_n[c_log_h - 1]); + had_frag[i * 4 + j] = val; + } + if constexpr(log_had_size <= 4 || log_had_size % 4 == 0) break; + } + + // log had size above 8, only used for above 2^8 = 256 size + constexpr int64_t part8_log_had_size = log_had_size - 8; + + b32* a_chunk_ptr = a_start_ptr; // first chunk starts at this warp's data starts + b32* out_chunk_ptr = out_start_ptr; + + #pragma unroll + for (int64_t l = 0; l < 2; l++) { + if constexpr(log_had_size <= 8) { // l == 0 guaranteed, redundant simplified version of else body, to help compiler warnings + b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128; + } else { + b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * (l == 0 ? 128 : (128 >> part8_log_had_size)); + } + + if (l == 1) { + if constexpr(log_had_size > 8) { + __syncthreads(); // sync between first and second iterations if above size 256 + + if constexpr(log_had_size >= 12) { + // sizes 4k and above + + // a + threadblock offset + warp offset + // can then index into all chunks owned by this warp + b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block)); + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + // here, j represents register, and k represents 8-offset/chunk + uint64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data + + int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread # + int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data) + int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads) + int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads + int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register + int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index + + // fix idx for majorness + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + + // store[rowidx * 128 + colidx] = data; + b32 data = store[rowidx * 128 + colidx]; + + // compiler generates excessive instructions, so we manually do the if statement + #pragma unroll + for (uint64_t i = 0; i < num_chunks; i++) { + asm volatile ( + "{\n\t" + " .reg .pred p0;\n\t" + " setp.eq.s64 p0, %1, %2;\n\t" + " @p0 mov.b32 %0, %3;\n\t" + "}\n\t" + : "+r"(b_frag_all[i][j]) // Output operand %0 + : "l"(real_chunk_num), "l"(i), "r"(data) // Input operands %1, %2, %3 + ); + } + } + } + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 1; k < num_chunks; k++) { + int64_t threadid_contig = threadid % num_chunks; + int64_t threadid_mul = threadid / num_chunks; + int64_t threadid2 = (threadid_contig + num_chunks - k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to + b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2); + } + } + } + } + } + + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + if constexpr(enable_mask) { + if (k >= real_num_chunks) + break; + } + if (l == 0) { + // bad fix for k not being recognized as a constexpr by compiler + // asm("cp.async.wait_group %0;\n" :: "n"(num_chunks - k - 1)); + #define 
SWITCH_WAIT_ASYNC_LOAD_GROUP(i) case i: asm volatile("cp.async.wait_group %0;\n" :: "n"(num_chunks - i - 1)); break; + if constexpr(enable_mask) { + switch(k + diff_num_chunks) { + SWITCH_WAIT_ASYNC_LOAD_GROUP(0) + SWITCH_WAIT_ASYNC_LOAD_GROUP(1) + SWITCH_WAIT_ASYNC_LOAD_GROUP(2) + SWITCH_WAIT_ASYNC_LOAD_GROUP(3) + SWITCH_WAIT_ASYNC_LOAD_GROUP(4) + SWITCH_WAIT_ASYNC_LOAD_GROUP(5) + SWITCH_WAIT_ASYNC_LOAD_GROUP(6) + SWITCH_WAIT_ASYNC_LOAD_GROUP(7) + SWITCH_WAIT_ASYNC_LOAD_GROUP(8) + SWITCH_WAIT_ASYNC_LOAD_GROUP(9) + SWITCH_WAIT_ASYNC_LOAD_GROUP(10) + SWITCH_WAIT_ASYNC_LOAD_GROUP(11) + SWITCH_WAIT_ASYNC_LOAD_GROUP(12) + SWITCH_WAIT_ASYNC_LOAD_GROUP(13) + SWITCH_WAIT_ASYNC_LOAD_GROUP(14) + SWITCH_WAIT_ASYNC_LOAD_GROUP(15) + SWITCH_WAIT_ASYNC_LOAD_GROUP(16) + SWITCH_WAIT_ASYNC_LOAD_GROUP(17) + SWITCH_WAIT_ASYNC_LOAD_GROUP(18) + SWITCH_WAIT_ASYNC_LOAD_GROUP(19) + SWITCH_WAIT_ASYNC_LOAD_GROUP(20) + SWITCH_WAIT_ASYNC_LOAD_GROUP(21) + SWITCH_WAIT_ASYNC_LOAD_GROUP(22) + SWITCH_WAIT_ASYNC_LOAD_GROUP(23) + SWITCH_WAIT_ASYNC_LOAD_GROUP(24) + SWITCH_WAIT_ASYNC_LOAD_GROUP(25) + SWITCH_WAIT_ASYNC_LOAD_GROUP(26) + SWITCH_WAIT_ASYNC_LOAD_GROUP(27) + SWITCH_WAIT_ASYNC_LOAD_GROUP(28) + SWITCH_WAIT_ASYNC_LOAD_GROUP(29) + SWITCH_WAIT_ASYNC_LOAD_GROUP(30) + SWITCH_WAIT_ASYNC_LOAD_GROUP(31) + } + } else { + switch(k) { + SWITCH_WAIT_ASYNC_LOAD_GROUP(0) + SWITCH_WAIT_ASYNC_LOAD_GROUP(1) + SWITCH_WAIT_ASYNC_LOAD_GROUP(2) + SWITCH_WAIT_ASYNC_LOAD_GROUP(3) + SWITCH_WAIT_ASYNC_LOAD_GROUP(4) + SWITCH_WAIT_ASYNC_LOAD_GROUP(5) + SWITCH_WAIT_ASYNC_LOAD_GROUP(6) + SWITCH_WAIT_ASYNC_LOAD_GROUP(7) + SWITCH_WAIT_ASYNC_LOAD_GROUP(8) + SWITCH_WAIT_ASYNC_LOAD_GROUP(9) + SWITCH_WAIT_ASYNC_LOAD_GROUP(10) + SWITCH_WAIT_ASYNC_LOAD_GROUP(11) + SWITCH_WAIT_ASYNC_LOAD_GROUP(12) + SWITCH_WAIT_ASYNC_LOAD_GROUP(13) + SWITCH_WAIT_ASYNC_LOAD_GROUP(14) + SWITCH_WAIT_ASYNC_LOAD_GROUP(15) + SWITCH_WAIT_ASYNC_LOAD_GROUP(16) + SWITCH_WAIT_ASYNC_LOAD_GROUP(17) + SWITCH_WAIT_ASYNC_LOAD_GROUP(18) + SWITCH_WAIT_ASYNC_LOAD_GROUP(19) + SWITCH_WAIT_ASYNC_LOAD_GROUP(20) + SWITCH_WAIT_ASYNC_LOAD_GROUP(21) + SWITCH_WAIT_ASYNC_LOAD_GROUP(22) + SWITCH_WAIT_ASYNC_LOAD_GROUP(23) + SWITCH_WAIT_ASYNC_LOAD_GROUP(24) + SWITCH_WAIT_ASYNC_LOAD_GROUP(25) + SWITCH_WAIT_ASYNC_LOAD_GROUP(26) + SWITCH_WAIT_ASYNC_LOAD_GROUP(27) + SWITCH_WAIT_ASYNC_LOAD_GROUP(28) + SWITCH_WAIT_ASYNC_LOAD_GROUP(29) + SWITCH_WAIT_ASYNC_LOAD_GROUP(30) + SWITCH_WAIT_ASYNC_LOAD_GROUP(31) + } + } + } + + if (l == 0) { + // loading for the first iteration + + // thread 0 loads [t0r0, t16r1, t0r2, t16r3] + // thread 16 loads [t0r1, t16r0, t0r3, t16r2] + // allows full coalescing, same for t1/t17, t2/t18, etc. + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2)); + int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16); + int64_t real_row = real_thread_id % 4; + int64_t real_col = real_thread_id / 4; + b_frag_all[k][j] = b_frag_ptr[(real_row + (reg % 2) * 4) + (real_col + (j / 2) * 8) * 8]; + } + + // for t16 swap r0/r1 and r2/r3 to have [t16r0, t0r1, t16r2, t0r3] + // so registers are in right order, same for t17, t18, etc. + if ((threadid & 16) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][1]; + b_frag_all[k][1] = temp; + + temp = b_frag_all[k][2]; + b_frag_all[k][2] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + // t0 and t16 swap r1 and r3 to have their own data, + // same for t1/t17, t2/18, etc. 
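+            // (descriptive note: __shfl_xor_sync with lane mask 16 swaps the
+            // register value between lane pairs t and t^16 entirely within
+            // the warp, avoiding a shared-memory round trip)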
+ #pragma unroll + for (int64_t j = 1; j < 4; j += 2) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16); + } + } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings + if constexpr(log_had_size < 12) { + // sizes 512, 1k, and 2k + + // for 512: + // thread 0 loads [t0r0, t0r1, t16r2, t16r3] + // thread 16 loads [t0r2, t0r3, t16r0, t16r1] + // same for t1/t17, t2/t18, etc. + // for 1k and 2k: + // thread 0 loads [t0r0, t0r1, t1r2, t1r3] + // thread 1 loads [t0r2, t0r3, t1r0, t1r1] + // same for t2/t3, t4/t5, etc. + // allows full coalescing for 512 and 1k, 16x coalescing for 2k + constexpr int64_t xor_val = log_had_size == 9 ? 16 : 1; + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4; + int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val); + int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2); + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + b_frag_all[k][j] = b_frag_ptr[rowidx * 128 + colidx]; + } + + if ((threadid & xor_val) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + + temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + #pragma unroll + for (int64_t j = 2; j < 4; j++) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val); + } + } + } + + if (l == 1) { + // for second iteration, we load 2 consecutive b16s (1 b32) per register, + // but tensor core register layout requires 2 b16s that are in the + // same column/consecutive rows to be in the same register, so do the swap + b32 f0 = ((b_frag_all[k][1] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF); + b32 f1 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][2] & 0xFFFF); + b32 f2 = (b_frag_all[k][1] & 0xFFFF0000) | (b_frag_all[k][0] >> 16); + b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][2] >> 16); + b_frag_all[k][0] = f0; + b_frag_all[k][1] = f1; + b_frag_all[k][2] = f2; + b_frag_all[k][3] = f3; + } + + #pragma unroll + for(int64_t i = 0, remaining_log_had_size = log_had_size - l * 8; i < 2 && remaining_log_had_size > 0; i++) { + int64_t had_off = ((remaining_log_had_size < 4) && !(log_had_size <= 4 || log_had_size % 4 == 0)) ? 
4 : 0; + mma_m16_n16_k16_b16_b16_b16_noacc<dtype>(had_frag[had_off + 0], had_frag[had_off + 1], had_frag[had_off + 2], had_frag[had_off + 3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3]); + + remaining_log_had_size -= 4; + if (remaining_log_had_size <= 0 && i == 0) { + // TODO: consider different storing so no need for transpose + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][0]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][1]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][2]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][3]); + } else { + // swap and use output directly as b_frag for next iteration as an actually free transpose + b32 temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + } + } + + if (l == 1) { + // invert swap from above for second iteration + b32 f0 = ((b_frag_all[k][2] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF); + b32 f1 = (b_frag_all[k][2] & 0xFFFF0000) | (b_frag_all[k][0] >> 16); + b32 f2 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][1] & 0xFFFF); + b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][1] >> 16); + b_frag_all[k][0] = f0; + b_frag_all[k][1] = f1; + b_frag_all[k][2] = f2; + b_frag_all[k][3] = f3; + } + + if (l == 0) { + // inverse of coalesced load for first iteration to store result + #pragma unroll + for (int64_t j = 1; j < 4; j += 2) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16); + } + + if ((threadid & 16) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][1]; + b_frag_all[k][1] = temp; + + temp = b_frag_all[k][2]; + b_frag_all[k][2] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + // if only going up to 256 size, store directly back to global memory, + // otherwise store back to shared memory for next iteration + b32* store = (log_had_size <= 8) ? out_chunk_ptr : b_frag_ptr; + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2)); + int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16); + int64_t real_row = real_thread_id % 4; + int64_t real_col = real_thread_id / 4; + store[(real_row + (reg % 2) * 4) + (real_col + (reg / 2) * 8) * 8] = b_frag_all[k][j]; + } + } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings + if (log_had_size < 12) { + // inverse of coalesced load for sizes 512, 1k and 2k to store result + constexpr int xor_val = log_had_size == 9 ? 16 : 1; + #pragma unroll + for (int64_t j = 2; j < 4; j++) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val); + } + + if ((threadid & xor_val) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + + temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + b32* store = (b32*)(out + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 256 + (256 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block) + k)); + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4; + b32 data = b_frag_all[k][j]; + int64_t real_thread_id = reg < 2 ? 
threadid : (threadid ^ xor_val); + int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2); + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + store[rowidx * 128 + colidx] = data; + } + } + // for size 4k and above, wait to process all chunks so a final store can be performed coalesced + } + + a_chunk_ptr += 128; // (only affects first 256 size) move on to next chunk by skipping 256 elements in b16 (= 128 in b32) + out_chunk_ptr += 128; + if constexpr(log_had_size > 8) { + b_frag_ptr += (l == 0 ? 128 : (128 >> part8_log_had_size)); + } else { // else is redundant, simplified version of if body, to help compiler warnings + b_frag_ptr += 128; + } + } + if (log_had_size <= 8) + break; + } + + if constexpr(log_had_size >= 12) { + // for sizes 4k and above, perform final coalesced store after processing all chunks + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 1; k < num_chunks; k++) { + int64_t threadid_contig = threadid % num_chunks; + int64_t threadid_mul = threadid / num_chunks; + int64_t threadid2 = (threadid_contig + k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to + b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2); + } + } + + // a + threadblock offset + warp offset + // can then index into all chunks owned by this warp + b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block)); + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + // here, j represents register, and k represents 8-offset/chunk + int64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data + + // b32 data = b_frag_all[real_chunk_num][j]; // target thread data + b32 data; + #pragma unroll + for (int64_t i = 0; i < num_chunks; i++) { + if (real_chunk_num == i) data = b_frag_all[i][j]; + } + + int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread # + int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data) + int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads) + int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads + int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register + int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index + + // fix idx for majorness + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + + store[rowidx * 128 + colidx] = data; + } + } + + __syncthreads(); + store = ((b32*) out) + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 128; + int4* store4 = (int4*) store; + int4* bfrag_arr4 = (int4*) bfrag_arr; + // flush smem, simply linearly write to store + // always divisible by 128*32b, so (32*4)*32b is ok + #pragma unroll + for (int64_t warp_off = 0; warp_off < (num_chunks * warps_per_block * 128 / 4); warp_off += 32 * warps_per_block) { + int64_t total_off = warp_off + threadid + (blockid % warps_per_block) * 32; + store4[total_off] = bfrag_arr4[total_off]; + } + } + +} + +constexpr int64_t ceil_div(int64_t a, int64_t b) { + return (a + b - 1) / 
b; +} + +template <torch::ScalarType dtype, int64_t chunks_per_warp, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool check_masking = false> +void __forceinline__ run_kernel(b16* a_mat, b16* out, int64_t num_chunks, cudaStream_t stream) { + int64_t shared_size = chunks_per_warp * warps_per_block * 128 * 4; + dim3 block_size = 32 * warps_per_block; + + #define CHECK_SHARED_LIM() { \ + if (shared_size > 48 * 1024) { \ + C10_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536)); \ + } \ + } \ + + if constexpr(check_masking) { + if (num_chunks % (chunks_per_warp * warps_per_block) != 0) { + dim3 grid_size = ceil_div(ceil_div(num_chunks, chunks_per_warp), warps_per_block); + auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, true, dtype>; + CHECK_SHARED_LIM(); + kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks); + } else { + dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block; + auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>; + CHECK_SHARED_LIM(); + kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks); + } + } else { + dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block; + auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>; + CHECK_SHARED_LIM(); + kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template <torch::ScalarType dtype> +void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream) { + int64_t num_chunks = numel / 256; // caller required to ensure divisible by 256 + // for size 256, use (2, 1) + // for size 32k use (8, 16) + constexpr int64_t chunks_per_warp_small = 1;// 8; + constexpr int64_t warps_per_block_small = 1;//2;//16; + constexpr int64_t blocks_per_sm_small = 24; + constexpr int64_t chunks_per_warp_large = 2; + constexpr int64_t warps_per_block_large = 1; + constexpr int64_t blocks_per_sm_large = 24; + + b16* a_mat = (b16*) a_mat_ptr; + b16* out = (b16*) out_ptr; + + if (numel <= 256) { + switch (had_size) { + case (1<<1): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 1, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<2): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 2, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<3): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 3, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<4): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 4, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<5): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 5, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<6): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 6, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<7): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 7, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + case (1<<8): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 8, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break; + } + } else { + switch (had_size) { + case (1<<1): run_kernel<dtype, 
chunks_per_warp_large, warps_per_block_large, 1, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<2): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 2, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<3): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 3, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<4): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 4, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<5): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 5, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<6): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 6, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<7): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 7, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<8): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 8, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break; + case (1<<9): run_kernel<dtype, launch_configs_big[0][0], launch_configs_big[0][1], 9 , launch_configs_big[0][2]>(a_mat, out, num_chunks, stream); break; + case (1<<10): run_kernel<dtype, launch_configs_big[1][0], launch_configs_big[1][1], 10, launch_configs_big[1][2]>(a_mat, out, num_chunks, stream); break; + case (1<<11): run_kernel<dtype, launch_configs_big[2][0], launch_configs_big[2][1], 11, launch_configs_big[2][2]>(a_mat, out, num_chunks, stream); break; + case (1<<12): run_kernel<dtype, launch_configs_big[3][0], launch_configs_big[3][1], 12, launch_configs_big[3][2]>(a_mat, out, num_chunks, stream); break; + case (1<<13): run_kernel<dtype, launch_configs_big[4][0], launch_configs_big[4][1], 13, launch_configs_big[4][2]>(a_mat, out, num_chunks, stream); break; + case (1<<14): run_kernel<dtype, launch_configs_big[5][0], launch_configs_big[5][1], 14, launch_configs_big[5][2]>(a_mat, out, num_chunks, stream); break; + case (1<<15): run_kernel<dtype, launch_configs_big[6][0], launch_configs_big[6][1], 15, launch_configs_big[6][2]>(a_mat, out, num_chunks, stream); break; + } + } +} + +template void run_fht<torch::ScalarType::Half>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream); +template void run_fht<torch::ScalarType::BFloat16>(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream); + +} // namespace hadacore + +constexpr bool is_power_of_two(int x) { return x && !(x & (x - 1)); } + +torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) { + auto dtype = x.scalar_type(); + TORCH_CHECK(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently"); + TORCH_CHECK(x.is_cuda()); + + const int had_size = x.size(-1); + TORCH_CHECK(is_power_of_two(had_size) && (had_size <= (1U << 15)), + "Only power of two Hadamard sizes up to 2^15 are supported, got ", had_size); + + const auto res_shape = x.sizes(); + x = x.reshape({-1, had_size}); + + auto numel = x.numel(); + if (numel % 256 != 0) { + x = torch::nn::functional::pad(x, torch::nn::functional::PadFuncOptions({0, 0, 0, (256 - numel % 256) / had_size})); + } + + if (x.stride(-1) != 1) { + x = x.contiguous(); + } + torch::Tensor out = inplace ? 
x : torch::empty_like(x);
+
+  at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+  VLLM_DISPATCH_HALF_TYPES(x.scalar_type(), "hadacore_transform_runfht", [&] {
+    auto constexpr SCALAR_TYPE = c10::CppTypeToScalarType<scalar_t>::value;
+    hadacore::run_fht<SCALAR_TYPE>(x.data_ptr(), x.data_ptr(), x.numel(),
+                                   had_size, stream);
+  });
+
+  if (numel % 256 != 0) {
+    out = out.index({torch::indexing::Slice(0, numel / had_size)});
+  }
+
+  if (inplace && out.data_ptr() != x.data_ptr()) {
+    x.copy_(out.view(res_shape));
+    return x;
+  }
+  return out.reshape(res_shape);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("hadacore_transform", &hadacore_transform);
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 81aca7b8860d5..f22e23519831f 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -613,6 +613,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int pad_slot_id) -> ()");
   ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
+  // Hadamard transforms
+  ops.def("hadacore_transform(Tensor! x, bool inplace) -> Tensor");
+
 #ifndef USE_ROCM
   // Compute per-token-group FP8 quantized tensor and scaling factor.
   ops.def(
diff --git a/tests/kernels/quantization/test_hadacore.py b/tests/kernels/quantization/test_hadacore.py
new file mode 100644
index 0000000000000..127d68072e3f3
--- /dev/null
+++ b/tests/kernels/quantization/test_hadacore.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import math
+
+import pytest
+import torch
+from compressed_tensors.transform import deterministic_hadamard_matrix
+
+from vllm import _custom_ops as ops
+
+
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("hidden_dim", [2**n for n in range(10)])
+def test_hadacore(batch_size, hidden_dim, dtype=torch.bfloat16, device="cuda"):
+    x = torch.eye(hidden_dim, dtype=dtype, device=device)
+    hadamard = deterministic_hadamard_matrix(
+        hidden_dim, dtype=torch.float64, device="cuda") / math.sqrt(hidden_dim)
+
+    y = ops.hadacore_transform(x.clone())
+    y_true = (x.to(hadamard.dtype) @ hadamard.T).to(y.dtype)
+    assert torch.allclose(y, y_true)
+
+    y = ops.hadacore_transform(y)
+    assert torch.allclose(y, x)
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index f3e5fe73d4e26..456c6b3ba9234 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2011,3 +2011,27 @@ def onednn_scaled_mm(
                               input_zp_adj, bias, dnnl_handler.handler)
 
     return output
+
+
+def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
+    """
+    Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832)
+    kernels. Note that these kernels exploit the recursive properties of
+    Sylvester Hadamards, and therefore do not require transform weight data.
+
+    Note that Sylvester Hadamard transforms are also symmetric, which means
+    that this function also applies the (transpose <=> inverse) transform.
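+
+    A minimal illustrative round trip (the normalized transform is its own
+    inverse, up to floating point error):
+
+        y = hadacore_transform(x, inplace=False)
+        x_roundtrip = hadacore_transform(y, inplace=False)  # ~= x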
+ + :param x: value to be transformed inplace + :param inplace: modify value in place + :return: value after transformation + """ + return torch.ops._C.hadacore_transform(x, inplace) + + +if hasattr(torch.ops._C, "hadacore_transform"): + + @register_fake("_C::hadacore_transform") + def _hadacore_transform_fake(x: torch.Tensor, + inplace: bool) -> torch.Tensor: + return torch.empty_like(x) if not inplace else x diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 97041a5a050f1..b56a691311774 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -129,7 +129,7 @@ class CompressedTensorsConfig(QuantizationConfig): # choose transform method if any((input_tfms, output_tfms)): return CompressedTensorsLinearTransformMethod.from_schemes( - quant_method, input_tfms, output_tfms) + quant_method, quant_scheme, input_tfms, output_tfms) else: return quant_method diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py index 2fc94b3c257e6..d098185146e41 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py @@ -12,6 +12,8 @@ from compressed_tensors.utils import is_match from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, LinearMethodBase, QKVCrossParallelLinear) +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme) from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import ( # noqa: E501 HadamardTransform) from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 @@ -26,14 +28,22 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase): @classmethod def from_schemes( - cls, quant_method: LinearMethodBase, input_tfms: dict[int, - TransformTuple], - output_tfms: dict[int, TransformTuple] + cls, + quant_method: LinearMethodBase, + quant_scheme: Optional[CompressedTensorsScheme], + input_tfms: dict[int, TransformTuple], + output_tfms: dict[int, TransformTuple], ) -> "CompressedTensorsLinearTransformMethod": + from vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4 import ( # noqa: E501 + QutlassNvFP4LinearMethod, is_qutlass_fp4_scheme) + assert input_tfms or output_tfms - # TODO (@ksayers): implement QutlassLinearMethodNvFP4 - # hadacore and fwht can be selected by Transform module + if is_qutlass_fp4_scheme(quant_scheme, input_tfms): + return QutlassNvFP4LinearMethod(quant_method, input_tfms, + output_tfms) + + # hadacore or dense gemm is selected by Transform module return cls(quant_method, input_tfms, output_tfms) @@ -129,11 +139,12 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase): assert bias is None x = self.quant_method.apply(layer, x, bias) - # TODO (@ksayers): Write a triton kernel to do this in parallel + # In most cases, input transforms are preferred over output transforms + # (@ksayers): confirm that this is done concurrently if self.output_transform is not None: for part_id, (start, length) in enumerate(self.partition_ranges): x[:, start:start + length] = 
self.output_transform( - x[:, start:start + length], part_id=part_id) + x[:, start:start + length].contiguous(), part_id=part_id) return x diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py index 48ab2582a3b26..5e863354715e4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Hashable -from typing import Callable, Optional +from typing import Callable import torch -from compressed_tensors.transform import TransformLocation, TransformScheme +from compressed_tensors.transform import (TransformArgs, TransformLocation, + TransformScheme) from torch import Tensor +import vllm._custom_ops as ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import LinearBase @@ -28,16 +30,12 @@ class HadamardTransform(torch.nn.Module): transforms: dict[int, TransformTuple] # info parsed from transforms config weight: SharedWeightParameter # container for shared tensors - kernel: Callable # function used during application scales: dict[int, float] # hadamard scale, usually sqrt(matrix.size(0)) - def __init__(self, - transforms: dict[int, TransformTuple], - layer: torch.nn.Module, - weight_loader: Callable, + def __init__(self, transforms: dict[int, TransformTuple], + layer: torch.nn.Module, weight_loader: Callable, input_size_per_partition: int, - output_partition_sizes: list[int], - kernel: Optional[Callable] = None): + output_partition_sizes: list[int]): super().__init__() self.transforms = transforms self.scales = {} @@ -55,7 +53,7 @@ class HadamardTransform(torch.nn.Module): for part_index, (_scheme_name, scheme, args) in self.transforms.items(): output_size = output_partition_sizes[part_index] - weight_size = self._get_weight_size(layer, args.location, + weight_size = self._get_weight_size(layer, scheme, args, input_size, output_size) data_key = self._get_data_key(scheme, weight_size) @@ -69,9 +67,6 @@ class HadamardTransform(torch.nn.Module): # validate that shared tensors and schemes are correct self._validate_input_transforms() - # select kernel based on transform schemes - self.kernel = self._infer_kernel() if kernel is None else kernel - def process_weights_after_loading(self): for part_id in self.weight.partitions: data = self.weight.partitions[part_id].data @@ -90,32 +85,59 @@ class HadamardTransform(torch.nn.Module): if part_id not in self.weight.partitions: return value - weight = self.weight.partitions[part_id] - weight = weight if self.transforms[ - part_id].args.inverse else weight.T # linear := x(W.T) - scale = self.scales[part_id] - return self.kernel(self, value.to(weight.dtype), weight, None).to( - value.dtype) * scale + # use hadacore if possible + if self.transforms[part_id].scheme.type == "hadamard": + if self.transforms[part_id].scheme.head_dim is not None: + weight_size = self.transforms[part_id].scheme.head_dim + value = value.unflatten(-1, (-1, weight_size)) + value = ops.hadacore_transform(value) + value = value.flatten(-2, -1) + + return value + + # sylvester transforms are symmetric, inv => transpose => original + return ops.hadacore_transform(value) + + # fall back to dense + else: + weight = self.weight.partitions[part_id] + weight = 
weight if self.transforms[ + part_id].args.inverse else weight.T # linear := x(W.T) + scale = self.scales[part_id] + + if self.transforms[part_id].scheme.head_dim is not None: + value = value.unflatten(-1, (-1, weight.size(0))) + value = dispatch_unquantized_gemm()(self, value.to( + weight.dtype), weight, None).to(value.dtype) * scale + value = value.flatten(-2, -1) + + return value + + return dispatch_unquantized_gemm()(self, value.to( + weight.dtype), weight, None).to(value.dtype) * scale def _get_data_key(self, scheme: TransformScheme, weight_size: int) -> Hashable: return (id(scheme), weight_size) - def _get_weight_size(self, layer: torch.nn.Module, - location: TransformLocation, input_size: int, + def _get_weight_size(self, layer: torch.nn.Module, scheme: TransformScheme, + args: TransformArgs, input_size: int, output_size: int) -> int: + if scheme.head_dim is not None: + return scheme.head_dim + if isinstance(layer, LinearBase): - if location == TransformLocation.INPUT: + if args.location == TransformLocation.INPUT: return input_size - elif location == TransformLocation.OUTPUT: + elif args.location == TransformLocation.OUTPUT: return output_size elif isinstance(layer, VocabParallelEmbedding): - if location == TransformLocation.INPUT: + if args.location == TransformLocation.INPUT: return output_size - elif location == TransformLocation.OUTPUT: + elif args.location == TransformLocation.OUTPUT: return input_size raise ValueError() @@ -129,7 +151,3 @@ class HadamardTransform(torch.nn.Module): for partition in self.weight.partitions.values(): if partition.data.data_ptr() != first_data.data_ptr(): raise ValueError("") - - def _infer_kernel(self) -> Callable: - # TODO (@ksayers): use fwht, hadacore - return dispatch_unquantized_gemm() diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py index f42258f9f9d7f..69b39f31eec1e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py @@ -4,18 +4,43 @@ from typing import Optional import torch +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme, CompressedTensorsW4A4Fp4) from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501 - CompressedTensorsLinearTransformMethod) + CompressedTensorsLinearTransformMethod, TransformTuple) + +__all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"] -# Because qutlass fuses hadamard with quantization, it cannot automatically be -# composed with kernels in the way CompressedTensorsLinearTransformMethod does. 
-# Therefore, a separate scheme must be created for each quantized dtype -class QutlassLinearMethodNvFP4(CompressedTensorsLinearTransformMethod): +def is_qutlass_fp4_scheme(quant_scheme: Optional[CompressedTensorsScheme], + input_tfms: dict[int, TransformTuple]) -> bool: + return isinstance( + quant_scheme, + (CompressedTensorsW4A4Fp4, )) and len(input_tfms) == 1 and input_tfms[ + 0].scheme.head_dim == quant_scheme.group_size + + +class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod): + + def create_weights(self, layer, input_size_per_partition, + output_partition_sizes, input_size, output_size, + params_dtype, **extra_weight_attrs): + # initializes fp4 qparams + assert isinstance(layer.scheme, (CompressedTensorsW4A4Fp4, )) + ret = super().create_weights(layer, input_size_per_partition, + output_partition_sizes, input_size, + output_size, params_dtype, + **extra_weight_attrs) + + assert self.input_transform is not None + assert len(self.input_transform.weight) == 1 + assert self.input_transform.weight[0].size( + 0) == layer.scheme.group_size + + return ret def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - # fused hadamard quant linear method raise NotImplementedError() From 49bfc538e4047fce58d3a43b975aef0bab795b25 Mon Sep 17 00:00:00 2001 From: Sage Moore <sage@neuralmagic.com> Date: Mon, 15 Sep 2025 12:05:48 -0700 Subject: [PATCH 286/584] Update num_tokens_across_dp to use nccl instead of gloo (#24105) Signed-off-by: Sage Moore <sage@neuralmagic.com> --- vllm/envs.py | 8 ++++++++ vllm/forward_context.py | 21 +++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 7ea18064c9446..24133f79b09b7 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,6 +92,7 @@ if TYPE_CHECKING: VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: list[str] = [] + VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False VLLM_USE_V1: bool = True VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False @@ -745,6 +746,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ "VLLM_DISABLED_KERNELS"].split(","), + # Swaps the all reduce backend that we use to coordinate the DP padding + # information from NCCL to gloo. + "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION": + lambda: + (os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower() in + ("true", "1")), + # If set, use the V1 code path. "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), diff --git a/vllm/forward_context.py b/vllm/forward_context.py index c57c51d289ac8..b3ddd7b9a7392 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -13,6 +13,7 @@ import torch.distributed as dist import vllm.envs as envs from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -75,14 +76,26 @@ class DPMetadata: Gather the num_tokens across all DP ranks and return results in a CPU tensor of size dp_size. 
""" + from vllm.distributed.parallel_state import get_dp_group + device = current_platform.device_type + group = get_dp_group().device_group + + # Transfering this tensor from GPU to CPU will introduce a GPU sync + # point that could adversely affect performance of vllm with asynch + # scheduling. This environment variable exists to quickly disable + # this optimization if we run into this case. + if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: + logger.info_once( + "Using CPU all reduce to syncronize DP padding between ranks.") + device = "cpu" + group = get_dp_group().cpu_group num_tokens_across_dp = [0] * dp_size num_tokens_across_dp[dp_rank] = num_tokens num_tokens_tensor = torch.tensor(num_tokens_across_dp, - device="cpu", + device=device, dtype=torch.int32) - from vllm.distributed.parallel_state import get_dp_group - dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) - return num_tokens_tensor + dist.all_reduce(num_tokens_tensor, group=group) + return num_tokens_tensor.cpu() @staticmethod def make( From 94b03f88dd4d69ccf65e52439f4d2dacff97862f Mon Sep 17 00:00:00 2001 From: Benjamin Bartels <benjamin@bartels.dev> Date: Mon, 15 Sep 2025 20:45:55 +0100 Subject: [PATCH 287/584] Bump Flashinfer to 0.3.1 (#24868) Signed-off-by: bbartels <benjamin@bartels.dev> --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 307e9658f7175..4d0779d5f648f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -375,7 +375,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.3.0" +ARG FLASHINFER_GIT_REF="v0.3.1" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... 
diff --git a/setup.py b/setup.py index eb313b7d219c7..9fec424d9cfe6 100644 --- a/setup.py +++ b/setup.py @@ -664,7 +664,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.3.0"], + "flashinfer": ["flashinfer-python==0.3.1"], # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, From 25aba2b6a323e083004705c05077ef5327beaf39 Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Mon, 15 Sep 2025 13:07:55 -0700 Subject: [PATCH 288/584] [gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561) Signed-off-by: Andrew Xia <axia@meta.com> --- .../openai/test_response_api_with_harmony.py | 14 +++++++++++ vllm/entrypoints/context.py | 25 +++++++++++-------- vllm/entrypoints/harmony_utils.py | 4 ++- vllm/entrypoints/openai/protocol.py | 17 ++++++++++--- vllm/entrypoints/openai/serving_responses.py | 18 ++++++++++--- vllm/v1/core/sched/utils.py | 8 +++--- vllm/v1/engine/output_processor.py | 6 ++--- 7 files changed, 67 insertions(+), 25 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 0d5836fab5a7c..88b3795abe73e 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): assert response.status == "completed" +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_max_tokens(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is the first paragraph of Moby Dick?", + reasoning={"effort": "low"}, + max_output_tokens=30, + ) + assert response is not None + assert response.status == "incomplete" + assert response.incomplete_details.reason == "max_output_tokens" + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_chat(client: OpenAI, model_name: str): diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 9012639457cad..6658f91595e51 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext): available_tools: list[str], ): self._messages = messages + self.finish_reason: Optional[str] = None self.available_tools = available_tools self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self.called_tools: set[str] = set() @@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext): if self.parser.current_channel in {"analysis", "commentary"}: self.num_reasoning_tokens += 1 - def append_output(self, output) -> None: + def append_output(self, output: Union[RequestOutput, + list[Message]]) -> None: if isinstance(output, RequestOutput): output_token_ids = output.outputs[0].token_ids self.parser = get_streamable_parser_for_assistant() @@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext): # Move current turn to previous turn for next turn's calculations self.previous_turn = self.current_turn.copy() output_msgs = self.parser.messages + # The responses finish reason is set in the last message + self.finish_reason = output.outputs[0].finish_reason else: # Tool output. 
output_msgs = output @@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext): def _update_prefill_token_usage(self, output: RequestOutput) -> None: """Update token usage statistics for the prefill phase of generation. - + The prefill phase processes the input prompt tokens. This method: 1. Counts the prompt tokens for this turn 2. Calculates tool output tokens for multi-turn conversations 3. Updates cached token counts 4. Tracks state for next turn calculations - + Tool output tokens are calculated as: - current_prompt_tokens - last_turn_prompt_tokens - + current_prompt_tokens - last_turn_prompt_tokens - last_turn_output_tokens This represents tokens added between turns (typically tool responses). - + Args: output: The RequestOutput containing prompt token information """ @@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext): def _update_decode_token_usage(self, output: RequestOutput) -> int: """Update token usage statistics for the decode phase of generation. - + The decode phase processes the generated output tokens. This method: 1. Counts output tokens from all completion outputs 2. Updates the total output token count 3. Tracks tokens generated in the current turn - + In streaming mode, this is called for each token generated. In non-streaming mode, this is called once with all output tokens. - + Args: output: The RequestOutput containing generated token information - + Returns: int: Number of output tokens processed in this call """ @@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext): def messages(self) -> list: return self.parser.messages - def append_output(self, output) -> None: + def append_output(self, output: Union[RequestOutput, + list[Message]]) -> None: if isinstance(output, RequestOutput): # append_output is called for each output token in streaming case, # so we only want to add the prompt tokens once for each message. 
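The docstrings in the context.py diff above define the key invariant for usage reporting: tool output tokens for a turn are current_prompt_tokens - last_turn_prompt_tokens - last_turn_output_tokens, i.e. whatever appeared in the prompt that neither the previous prompt nor the model's own output accounts for. A toy sketch of that bookkeeping follows, with all names invented for illustration rather than taken from vLLM:

from dataclasses import dataclass


@dataclass
class TurnTokens:
    input_tokens: int = 0
    output_tokens: int = 0


class UsageTracker:
    """Toy model of the per-turn accounting described above."""

    def __init__(self) -> None:
        self.is_first_turn = True
        self.previous_turn = TurnTokens()
        self.num_tool_output_tokens = 0

    def record_turn(self, prompt_tokens: int, output_tokens: int) -> None:
        if not self.is_first_turn:
            # Tokens present in this prompt that were neither last turn's
            # prompt nor last turn's output were inserted between turns,
            # typically by a tool response.
            delta = (prompt_tokens - self.previous_turn.input_tokens -
                     self.previous_turn.output_tokens)
            self.num_tool_output_tokens += max(0, delta)
        self.previous_turn = TurnTokens(prompt_tokens, output_tokens)
        self.is_first_turn = False


tracker = UsageTracker()
for prompt, out in [(5, 3), (15, 4), (20, 5)]:  # three hypothetical turns
    tracker.record_turn(prompt, out)
assert tracker.num_tool_output_tokens == (15 - 5 - 3) + (20 - 15 - 4)  # 8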
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index f7528ba81dce5..1364a41be950d 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -387,7 +387,9 @@ def parse_remaining_state( id=f"msg_{random_uuid()}", content=[output_text], role="assistant", - status="completed", + # if the parser still has messages (ie if the generator got cut + # abruptly), this should be incomplete + status="incomplete", type="message", ) return [text_item] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 4dcb1f3f1c89f..8ecb1a8239c35 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0) from openai.types.responses import (ResponseFormatTextConfig as ResponseTextConfig) -from openai.types.responses.response import ToolChoice +from openai.types.responses.response import IncompleteDetails, ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, @@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") created_at: int = Field(default_factory=lambda: int(time.time())) # error: Optional[ResponseError] = None - # incomplete_details: Optional[IncompleteDetails] = None + incomplete_details: Optional[IncompleteDetails] = None instructions: Optional[str] = None metadata: Optional[Metadata] = None model: str @@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel): status: ResponseStatus, usage: Optional[ResponseUsage] = None, ) -> "ResponsesResponse": + + incomplete_details: Optional[IncompleteDetails] = None + if status == 'incomplete': + incomplete_details = IncompleteDetails(reason='max_output_tokens') + # TODO: implement the other reason for incomplete_details, + # which is content_filter + # incomplete_details = IncompleteDetails(reason='content_filter') + return cls( id=request.request_id, created_at=created_time, + incomplete_details=incomplete_details, instructions=request.instructions, metadata=request.metadata, model=model_name, @@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel): class TokenizerInfoResponse(OpenAIBaseModel): """ - Response containing tokenizer configuration + Response containing tokenizer configuration equivalent to tokenizer_config.json """ @@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel): to_language: Optional[str] = None """The language of the output audio we transcribe to. - Please note that this is not currently used by supported models at this + Please note that this is not currently used by supported models at this time, but it is a placeholder for future use, matching translation api. 
""" diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 401ba6c53331c..088051f783a8e 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent, ResponseReasoningItem, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, - response_text_delta_event) + ResponseStatus, response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing): # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + # NOTE: Implementation of stauts is still WIP, but for now + # we guarantee that if the status is not "completed", it is accurate. + # "completed" is implemented as the "catch-all" for now. + status: ResponseStatus = "completed" + if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) num_tool_output_tokens = context.num_tool_output_tokens + if len(output) > 0: + if context.finish_reason == "length": + status = "incomplete" + elif context.finish_reason == "abort": + status = "cancelled" + else: + status = "incomplete" else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing): model_name=model_name, created_time=created_time, output=output, - status="completed", + status=status, usage=usage, ) @@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing): self, context: HarmonyContext, ) -> list[ResponseOutputItem]: - output_items = [] + output_items: list[ResponseOutputItem] = [] num_init_messages = context.num_init_messages for msg in context.messages[num_init_messages:]: output_items.extend(parse_output_message(msg)) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42d3e5c68b4c8..c431843de6baa 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus def remove_all(lst: list, items_to_remove: set) -> list: """Remove all items from a list that are in the items_to_remove set. - + This method optimizes for the common case of removing a single item, falling back to list comprehension for multiple items. - + Args: lst: The list to remove items from items_to_remove: Set of items to remove - + Returns: Either the modified original list (for single item removal) or a new list (for multiple item removal). Callers should use the returned value. - + Note: For single item removal, this modifies the original list in-place and returns it. For multiple items, it creates and returns a new list. diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 02c8c61cb9093..14ac1e3e5afa8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -373,17 +373,17 @@ class OutputProcessor: 1) Compute stats for logging 2) Detokenize 3) Create and handle RequestOutput objects: - * If there is a queue (for usage with AsyncLLM), + * If there is a queue (for usage with AsyncLLM), put the RequestOutput objects into the queue for handling by the per-request generate() tasks. - * If there is no queue (for usage with LLMEngine), + * If there is no queue (for usage with LLMEngine), return a list of RequestOutput objects. 
NOTE FOR DEVELOPERS vLLM V1 minimizes the number of python loops over the full - batch to ensure system overheads are minimized. This is the + batch to ensure system overheads are minimized. This is the only function that should loop over EngineCoreOutputs. If you need to touch every element of the batch, do it from From 73df49ef3a220c79abfffc36bdfb4e8dee61226b Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Mon, 15 Sep 2025 13:08:08 -0700 Subject: [PATCH 289/584] [gpt-oss][1a] create_responses stream outputs BaseModel type, api server is SSE still (#24759) Signed-off-by: Andrew Xia <axia@meta.com> --- vllm/entrypoints/openai/api_server.py | 27 +++- vllm/entrypoints/openai/serving_responses.py | 134 ++++++++++--------- 2 files changed, 90 insertions(+), 71 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c159bcee315f2..2e4aa7f3d5a6f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -15,7 +15,7 @@ import socket import tempfile import uuid from argparse import Namespace -from collections.abc import AsyncIterator, Awaitable +from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus @@ -29,6 +29,7 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from openai import BaseModel from prometheus_client import make_asgi_app from prometheus_fastapi_instrumentator import Instrumentator from starlette.concurrency import iterate_in_threadpool @@ -577,6 +578,18 @@ async def show_version(): return JSONResponse(content=ver) +async def _convert_stream_to_sse_events( + generator: AsyncGenerator[BaseModel, + None]) -> AsyncGenerator[str, None]: + """Convert the generator to a stream of events in SSE format""" + async for event in generator: + event_type = getattr(event, 'type', 'unknown') + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format + event_data = (f"event: {event_type}\n" + f"data: {event.model_dump_json(indent=None)}\n\n") + yield event_data + + @router.post("/v1/responses", dependencies=[Depends(validate_json_request)], responses={ @@ -612,7 +625,9 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): status_code=generator.error.code) elif isinstance(generator, ResponsesResponse): return JSONResponse(content=generator.model_dump()) - return StreamingResponse(content=generator, media_type="text/event-stream") + + return StreamingResponse(content=_convert_stream_to_sse_events(generator), + media_type="text/event-stream") @router.get("/v1/responses/{response_id}") @@ -640,10 +655,10 @@ async def retrieve_responses( if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), status_code=response.error.code) - elif stream: - return StreamingResponse(content=response, - media_type="text/event-stream") - return JSONResponse(content=response.model_dump()) + elif isinstance(response, ResponsesResponse): + return JSONResponse(content=response.model_dump()) + return StreamingResponse(content=_convert_stream_to_sse_events(response), + media_type="text/event-stream") @router.post("/v1/responses/{response_id}/cancel") diff --git 
a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 088051f783a8e..9e285e6e51756 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -10,7 +10,7 @@ from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Callable, Final, Optional, Union +from typing import Callable, Final, Optional, TypeVar, Union import jinja2 import openai.types.responses as openai_responses_types @@ -175,7 +175,8 @@ class OpenAIServingResponses(OpenAIServing): # HACK(wuhang): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove events from the store. - self.event_store: dict[str, tuple[deque[str], asyncio.Event]] = {} + self.event_store: dict[str, tuple[deque[BaseModel], + asyncio.Event]] = {} self.background_tasks: dict[str, asyncio.Task] = {} @@ -185,7 +186,8 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, raw_request: Optional[Request] = None, - ) -> Union[AsyncGenerator[str, None], ResponsesResponse, ErrorResponse]: + ) -> Union[AsyncGenerator[BaseModel, None], ResponsesResponse, + ErrorResponse]: error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) @@ -812,7 +814,7 @@ class OpenAIServingResponses(OpenAIServing): *args, **kwargs, ): - event_deque: deque[str] = deque() + event_deque: deque[BaseModel] = deque() new_event_signal = asyncio.Event() self.event_store[request.request_id] = (event_deque, new_event_signal) response = None @@ -827,8 +829,6 @@ class OpenAIServingResponses(OpenAIServing): request.request_id) response = self.create_error_response(str(e)) finally: - # Mark as finished with a special marker - event_deque.append("__STREAM_END__") new_event_signal.set() if response is not None and isinstance(response, ErrorResponse): @@ -867,7 +867,7 @@ class OpenAIServingResponses(OpenAIServing): self, response_id: str, starting_after: Optional[int] = None, - ): + ) -> AsyncGenerator[BaseModel, None]: if response_id not in self.event_store: raise ValueError(f"Unknown response_id: {response_id}") @@ -881,9 +881,9 @@ class OpenAIServingResponses(OpenAIServing): # Yield existing events from start_index while current_index < len(event_deque): event = event_deque[current_index] - if event == "__STREAM_END__": - return yield event + if getattr(event, 'type', 'unknown') == "response.completed": + return current_index += 1 await new_event_signal.wait() @@ -893,7 +893,8 @@ class OpenAIServingResponses(OpenAIServing): response_id: str, starting_after: Optional[int], stream: Optional[bool], - ) -> Union[ErrorResponse, ResponsesResponse]: + ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[BaseModel, + None]]: if not response_id.startswith("resp_"): return self._make_invalid_id_error(response_id) @@ -976,8 +977,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _send_event: Callable[[BaseModel], str], - ) -> AsyncGenerator[str, None]: + _increment_sequence_number_and_return: Callable[[BaseModel], + BaseModel], + ) -> AsyncGenerator[BaseModel, None]: current_content_index = 0 current_output_index = 0 current_item_id = "" @@ -1014,7 +1016,7 @@ class OpenAIServingResponses(OpenAIServing): if not first_delta_sent: 
current_item_id = str(uuid.uuid4()) if delta_message.reasoning_content: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1029,7 +1031,7 @@ class OpenAIServingResponses(OpenAIServing): ), )) else: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1044,7 +1046,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, @@ -1072,7 +1074,7 @@ class OpenAIServingResponses(OpenAIServing): reason_content = ''.join( pm.reasoning_content for pm in previous_delta_messages if pm.reasoning_content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1094,14 +1096,14 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, output_index=current_output_index, item=reasoning_item, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, @@ -1116,7 +1118,7 @@ class OpenAIServingResponses(OpenAIServing): )) current_output_index += 1 current_item_id = str(uuid.uuid4()) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, @@ -1135,7 +1137,7 @@ class OpenAIServingResponses(OpenAIServing): previous_delta_messages = [] if delta_message.reasoning_content is not None: - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDeltaEvent( type="response.reasoning_text.delta", sequence_number=-1, @@ -1145,7 +1147,7 @@ class OpenAIServingResponses(OpenAIServing): delta=delta_message.reasoning_content, )) elif delta_message.content is not None: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, @@ -1168,7 +1170,7 @@ class OpenAIServingResponses(OpenAIServing): reason_content = ''.join(pm.reasoning_content for pm in previous_delta_messages if pm.reasoning_content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1190,7 +1192,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1201,7 +1203,7 @@ class OpenAIServingResponses(OpenAIServing): final_content = ''.join(pm.content for pm in previous_delta_messages if pm.content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, @@ -1217,7 +1219,7 @@ class OpenAIServingResponses(OpenAIServing): type="output_text", annotations=[], ) - yield 
_send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartDoneEvent( type="response.content_part.done", sequence_number=-1, @@ -1237,7 +1239,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1255,8 +1257,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _send_event: Callable[[BaseModel], str], - ) -> AsyncGenerator[str, None]: + _increment_sequence_number_and_return: Callable[[BaseModel], + BaseModel], + ) -> AsyncGenerator[BaseModel, None]: current_content_index = 0 # FIXME: this number is never changed current_output_index = 0 current_item_id = "" # FIXME: this number is never changed @@ -1288,7 +1291,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1297,7 +1300,7 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1310,7 +1313,7 @@ class OpenAIServingResponses(OpenAIServing): text=previous_item.content[0].text, annotations=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, @@ -1320,7 +1323,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseContentPartDoneEvent( type="response.content_part.done", @@ -1330,7 +1333,7 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, part=text_content, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1344,12 +1347,13 @@ class OpenAIServingResponses(OpenAIServing): ), )) + # stream the output of a harmony message if ctx.parser.last_content_delta: if (ctx.parser.current_channel == "final" and ctx.parser.current_recipient is None): if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1364,7 +1368,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseContentPartAddedEvent( type="response.content_part.added", @@ -1379,7 +1383,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, @@ -1394,7 +1398,7 @@ class OpenAIServingResponses(OpenAIServing): and ctx.parser.current_recipient is None): if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1408,7 +1412,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseContentPartAddedEvent( type="response.content_part.added", @@ -1423,7 +1427,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], ), )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDeltaEvent( type="response.reasoning_text.delta", item_id=current_item_id, @@ -1440,7 +1444,7 @@ class OpenAIServingResponses(OpenAIServing): ) and ctx.parser.current_recipient == "python": if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1456,7 +1460,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallInProgressEvent( type= @@ -1465,7 +1469,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallCodeDeltaEvent( type="response.code_interpreter_call_code.delta", @@ -1474,6 +1478,8 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, delta=ctx.parser.last_content_delta, )) + + # stream tool call outputs if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0: previous_item = ctx.parser.messages[-1] if (self.tool_server is not None @@ -1510,7 +1516,7 @@ class OpenAIServingResponses(OpenAIServing): raise ValueError( f"Unknown function name: {function_name}") - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, @@ -1525,7 +1531,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseWebSearchCallInProgressEvent( type="response.web_search_call.in_progress", @@ -1533,7 +1539,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseWebSearchCallSearchingEvent( type="response.web_search_call.searching", @@ -1543,7 +1549,7 @@ class OpenAIServingResponses(OpenAIServing): )) # enqueue - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseWebSearchCallCompletedEvent( type="response.web_search_call.completed", @@ -1551,7 +1557,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1569,7 +1575,7 @@ class OpenAIServingResponses(OpenAIServing): and self.tool_server.has_tool("python") and previous_item.recipient is not None and previous_item.recipient.startswith("python")): - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseCodeInterpreterCallCodeDoneEvent( type="response.code_interpreter_call_code.done", @@ -1578,7 +1584,7 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, code=previous_item.content[0].text, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallInterpretingEvent( type="response.code_interpreter_call.interpreting", @@ -1586,7 +1592,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallCompletedEvent( type="response.code_interpreter_call.completed", @@ -1594,7 +1600,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1621,7 +1627,7 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: Optional[int] = None, - ) -> AsyncGenerator[str, None]: + ) -> AsyncGenerator[BaseModel, None]: # TODO: # 1. Handle disconnect @@ -1629,16 +1635,15 @@ class OpenAIServingResponses(OpenAIServing): sequence_number = 0 - def _send_event(event: BaseModel): + T = TypeVar("T", bound=BaseModel) + + def _increment_sequence_number_and_return(event: T) -> T: nonlocal sequence_number # Set sequence_number if the event has this attribute if hasattr(event, 'sequence_number'): event.sequence_number = sequence_number sequence_number += 1 - # Get event type from the event's type field if it exists - event_type = getattr(event, 'type', 'unknown') - return (f"event: {event_type}\n" - f"data: {event.model_dump_json(indent=None)}\n\n") + return event async with AsyncExitStack() as exit_stack: processer = None @@ -1658,24 +1663,23 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", usage=None, ).model_dump() - yield _send_event( + yield _increment_sequence_number_and_return( ResponseCreatedEvent( type="response.created", sequence_number=-1, response=initial_response, )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseInProgressEvent( type="response.in_progress", sequence_number=-1, response=initial_response, )) - async for event_data in processer(request, sampling_params, - result_generator, context, - model_name, tokenizer, - request_metadata, created_time, - _send_event): + async for event_data in processer( + request, sampling_params, result_generator, context, + model_name, tokenizer, request_metadata, created_time, + _increment_sequence_number_and_return): yield event_data async def empty_async_generator(): @@ -1694,7 +1698,7 @@ class OpenAIServingResponses(OpenAIServing): request_metadata, created_time=created_time, ) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseCompletedEvent( type="response.completed", sequence_number=-1, From aae725af7cee79628e576e7b394811050f651e49 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:21:53 -0400 Subject: [PATCH 290/584] [Performance] Remove redundant clone() calls in cutlass_mla (#24891) --- vllm/v1/attention/backends/mla/cutlass_mla.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 6017445402eca..78af8d28f8892 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -210,9 +210,14 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): sm_scale, num_kv_splits, ) - returned_lse = lse[:, :H].contiguous( - ) if self.need_to_return_lse_for_decode else lse - return out[:, :H].contiguous(), returned_lse + + if H < MAX_HEADS: + # Extract the subsets of the outputs + returned_lse = lse[:, :H].contiguous( + ) if self.need_to_return_lse_for_decode else lse + out = out[:, :H] + + return out, returned_lse def _sm100_forward_decode( self, @@ -228,11 +233,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): self._workspace.ensure_size(attn_metadata, self._num_kv_splits) # Run MLA - # Clone q_nope and q_pe to make sure strides computation is correct. - # TODO: Check if we really need it - q_nope = q_nope.clone() - q_pe = q_pe.clone() - o, lse = self._sm100_cutlass_mla_decode( q_nope, q_pe, From e757a629e7336c4e436dac62daee20fd81776aeb Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:21:17 -0400 Subject: [PATCH 291/584] [Bug] Fix Cutlass Scaled MM Compilation Error (#24887) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 42 ++++++++++--------- ...scaled_mm_blockwise_sm120_fp8_dispatch.cuh | 24 ++++++----- .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 24 ++++++----- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index 939879b2c59fa..dbf79a0651159 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -146,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -166,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast<ElementAB*>(a.data_ptr()); - auto b_ptr = static_cast<ElementAB*>(b.data_ptr()); - auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr()); - auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr()); + auto a_ptr = static_cast<ElementAB const*>(a.data_ptr()); + auto b_ptr = static_cast<ElementAB const*>(b.data_ptr()); + auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr()); + auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr()); - auto mainloop_args = [&](){ - // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
- if (swap_ab) { - return typename GemmKernel::MainloopArguments{ - b_ptr, b_stride, a_ptr, a_stride, - b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB - }; - } - else { - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - } - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.layout_SFB = layout_SFB; + if (swap_ab) { + mainloop_args.ptr_A = b_ptr; + mainloop_args.dA = b_stride; + mainloop_args.ptr_B = a_ptr; + mainloop_args.dB = a_stride; + mainloop_args.ptr_SFA = b_scales_ptr; + mainloop_args.ptr_SFB = a_scales_ptr; + } else { + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.ptr_SFB = b_scales_ptr; + } auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast<ElementD*>(out.data_ptr()); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh index 78d5cf37fa6d0..811741aee58b3 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -125,6 +125,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -143,17 +144,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast<ElementAB*>(a.data_ptr()); - auto b_ptr = static_cast<ElementAB*>(b.data_ptr()); - auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr()); - auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr()); + auto a_ptr = static_cast<ElementAB const*>(a.data_ptr()); + auto b_ptr = static_cast<ElementAB const*>(b.data_ptr()); + auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr()); + auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr()); - auto mainloop_args = [&](){ - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.ptr_SFB = b_scales_ptr; + mainloop_args.layout_SFB = layout_SFB; auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast<ElementD*>(out.data_ptr()); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh index 86220264151e7..147eb8efc0778 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -115,6 +115,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = 
typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -135,17 +136,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast<ElementAB*>(a.data_ptr()); - auto b_ptr = static_cast<ElementAB*>(b.data_ptr()); - auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr()); - auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr()); + auto a_ptr = static_cast<ElementAB const*>(a.data_ptr()); + auto b_ptr = static_cast<ElementAB const*>(b.data_ptr()); + auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr()); + auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr()); - auto mainloop_args = [&](){ - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.ptr_SFB = b_scales_ptr; + mainloop_args.layout_SFB = layout_SFB; auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast<ElementD*>(out.data_ptr()); From fd2f10546c88a8accc199703b67ea31b427705b1 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Mon, 15 Sep 2025 14:39:08 -0700 Subject: [PATCH 292/584] [ci] fix wheel names for arm wheels (#24898) Signed-off-by: simon-mo <simon.mo@hey.com> --- .buildkite/release-pipeline.yaml | 16 ++++---------- .buildkite/scripts/annotate-release.sh | 29 +++++++++++++++++++------- docker/Dockerfile | 2 ++ setup.py | 6 ++---- vllm/envs.py | 6 ++++++ 5 files changed, 36 insertions(+), 23 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index a1de41652c9a6..8c6ef7817aaf8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,24 +1,22 @@ steps: # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 - label: "Build arm64 wheel - CUDA 12.9" + depends_on: ~ id: build-wheel-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_postmerge commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 12.8 wheel" - key: block-build-cu128-wheel - - label: "Build wheel - CUDA 12.8" - depends_on: block-build-cu128-wheel + depends_on: ~ id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge @@ -30,12 +28,8 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 12.6 wheel" - key: block-build-cu126-wheel - depends_on: ~ - - label: "Build wheel - CUDA 12.6" - depends_on: block-build-cu126-wheel + depends_on: ~ id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge @@ -102,8 +96,6 @@ steps: depends_on: - create-multi-arch-manifest - build-wheel-cuda-12-8 - - build-wheel-cuda-12-6 - - build-wheel-cuda-12-9 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 94e0ac2398f34..fde48603ad3cd 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel: \`\`\` aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . 
\`\`\` To download and upload the image: \`\`\` -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai -docker tag vllm/vllm-openai vllm/vllm-openai:latest -docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} -docker push vllm/vllm-openai:latest -docker push vllm/vllm-openai:v${RELEASE_VERSION} +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest push vllm/vllm-openai:latest +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` EOF \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 4d0779d5f648f..17f8e6043f895 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels ARG VLLM_USE_PRECOMPILED="" +ARG VLLM_MAIN_CUDA_VERSION="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \ && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ diff --git a/setup.py b/setup.py index 9fec424d9cfe6..e4c40d22b928d 100644 --- a/setup.py +++ b/setup.py @@ -56,8 +56,6 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None # fallback to cpu VLLM_TARGET_DEVICE = "cpu" -MAIN_CUDA_VERSION = "12.8" - def is_sccache_available() -> bool: return which("sccache") is not None and \ @@ -507,7 +505,7 @@ def get_vllm_version() -> str: version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: + if cuda_version != envs.VLLM_MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] # skip this for source tarball, required for pypi if "sdist" not in sys.argv: @@ -515,7 +513,7 @@ def get_vllm_version() -> str: elif _is_hip(): # Get the Rocm Version rocm_version = get_rocm_version() or torch.version.hip - if rocm_version and rocm_version != MAIN_CUDA_VERSION: + if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION: version += 
f"{sep}rocm{rocm_version.replace('.', '')[:3]}" elif _is_tpu(): version += f"{sep}tpu" diff --git a/vllm/envs.py b/vllm/envs.py index 24133f79b09b7..d2006979ea81c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" + VLLM_MAIN_CUDA_VERSION: str = "12.8" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False @@ -249,6 +250,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), + # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], + # 12.8 is the default. This follows PyTorch but can be overridden. + "VLLM_MAIN_CUDA_VERSION": + lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8", + # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": From 45bfa49cb804b0090f32980aaa68f9cad409d15b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= <mickael@mistral.ai> Date: Mon, 15 Sep 2025 23:48:27 +0200 Subject: [PATCH 293/584] [Tests] fix initialization of kv hash in tests (#24273) Signed-off-by: Mickael Seznec <mickael@mistral.ai> --- tests/v1/core/test_kv_cache_utils.py | 17 ++++++++++------- tests/v1/core/test_prefix_caching.py | 15 ++++++++++----- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index d1427f5eaf443..319e6e84fba1e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -30,6 +30,16 @@ from vllm.v1.request import Request # yapf: enable +@pytest.fixture(autouse=True) +def _auto_init_hash_fn(request): + hash_fn: Callable + if "hash_fn" in request.fixturenames: + hash_fn = init_none_hash(request.getfixturevalue("hash_fn")) + else: + hash_fn = sha256 + init_none_hash(hash_fn) + + def make_request( request_id: str, prompt_token_ids: list[int], @@ -424,7 +434,6 @@ def test_generate_block_hash_extra_keys_cache_salt(): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_block_tokens(hash_fn): - init_none_hash(hash_fn) parent_block_hash = BlockHash(b"123") curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") @@ -437,8 +446,6 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_request_block_hasher(hash_fn): - kv_cache_utils.init_none_hash(hash_fn) - request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -461,8 +468,6 @@ def test_request_block_hasher(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_tokens_different_mm_input(hash_fn): - init_none_hash(hash_fn) - request1 = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -491,8 +496,6 @@ def test_hash_tokens_different_mm_input(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_request_tokens_no_mm_inputs(hash_fn): - kv_cache_utils.init_none_hash(hash_fn) - request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 659d768bcf2e9..3cf9d93696767 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -25,6 +25,16 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, 
SlidingWindowSpec) +@pytest.fixture(autouse=True) +def _auto_init_hash_fn(request): + hash_fn: Callable + if "hash_fn" in request.fixturenames: + hash_fn = init_none_hash(request.getfixturevalue("hash_fn")) + else: + hash_fn = sha256 + init_none_hash(hash_fn) + + def make_request( request_id: str, prompt_token_ids: list[int], @@ -105,7 +115,6 @@ def make_kv_cache_config_hybrid_model(block_size: int, @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_prefill(hash_fn): - init_none_hash(hash_fn) block_size = 16 manager = KVCacheManager( @@ -736,7 +745,6 @@ def test_cache_blocks(hash_fn): This is a unit test that tests the correctness of the _cache_full_blocks function of KVCacheManager. """ - init_none_hash(hash_fn) block_size = 4 block_pool = BlockPool( @@ -849,7 +857,6 @@ def test_mm_prefix_caching(): """ This tests that the multi-modal prefix caching is correct. """ - kv_cache_utils.init_none_hash(sha256) block_size = 16 manager = KVCacheManager( @@ -942,8 +949,6 @@ def test_cache_key_salting(): This tests that cache salts are applied during hashing and the cache is separated cache as expected. """ - kv_cache_utils.init_none_hash(sha256) - block_size = 16 manager = KVCacheManager( make_kv_cache_config(block_size, 11), From 5bcc153d7bf69ef34bc5788a33f60f1792cf2861 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu <riverclouds.zhu@qq.com> Date: Tue, 16 Sep 2025 07:33:18 +0800 Subject: [PATCH 294/584] [Compile] Fix noop_elimination pass and add tests for noop_elimination (#24880) Signed-off-by: zjy0516 <riverclouds.zhu@qq.com> --- .buildkite/test-pipeline.yaml | 1 + tests/compile/backend.py | 6 +- tests/compile/test_noop_elimination.py | 106 +++++++++++++++++++++++++ vllm/compilation/noop_elimination.py | 40 +++++----- 4 files changed, 130 insertions(+), 23 deletions(-) create mode 100644 tests/compile/test_noop_elimination.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index adb5c862eecd9..c3b5aa2907a49 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -394,6 +394,7 @@ steps: - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 diff --git a/tests/compile/backend.py b/tests/compile/backend.py index ace4d25534cdd..2c4287950dcfe 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -64,4 +64,8 @@ class TestBackend: num_pre = len(list(find_op_nodes(op, self.graph_pre_pass))) num_post = len(list(find_op_nodes(op, self.graph_post_pass))) assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph" - assert num_post > 0, f"Op {op.name()} not found in post-pass graph" \ No newline at end of file + assert num_post > 0, f"Op {op.name()} not found in post-pass graph" + + def op_count(self, op: OpOverload, before=False) -> int: + graph = self.graph_pre_pass if before else self.graph_post_pass + return len(list(find_op_nodes(op, graph))) diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py new file mode 100644 index 0000000000000..242d531312675 --- /dev/null +++ b/tests/compile/test_noop_elimination.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +import vllm +from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.config import 
(CompilationConfig, CompilationLevel, PassConfig, + VllmConfig) + +from .backend import TestBackend + + +@pytest.mark.parametrize("dtype", + [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("num_tokens", [256, 1024]) +@pytest.mark.parametrize("hidden_size", [64, 4096]) +def test_noop_elimination(dtype, num_tokens, hidden_size): + torch.set_default_device("cuda") + torch.set_default_dtype(dtype) + torch.manual_seed(1) + + class Model(torch.nn.Module): + + def forward(self, x): + # Chain of reshapes + y = x.reshape(-1, 128, 32) + z = y.reshape(-1, 4096) + # No-op reshape + a = z.reshape(-1, 4096) + # Final reshape that should remain + b = a.reshape(-1, 128, 32) + # No-op slice + c = b[0:b.shape[0]] + # The pass should replace the result of this op with `c`. + d = torch.slice_scatter( + torch.ones_like(c), # Dummy tensor to be scattered into + c, # Source tensor + 0, # dim + 0, # start + c.shape[0], # end + ) + return d + + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + pass_config=PassConfig(enable_noop=True), + )) + with vllm.config.set_current_vllm_config(vllm_config): + noop_pass = NoOpEliminationPass(vllm_config) + + backend = TestBackend(noop_pass) + + model = Model() + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + ATOL, RTOL = (2e-3, 2e-3) + torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) + + # The no-op reshape and slice should be eliminated. + # The chain of reshapes should be fused into a single reshape. + assert backend.op_count(torch.ops.aten.reshape.default) == 1 + assert backend.op_count(torch.ops.aten.slice.Tensor) == 0 + assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0 + + +def test_non_noop_slice_preserved(): + """Ensure that a slice with end=-1 (dropping last row) is NOT eliminated. + + Regression test for a bug where end=-1 was treated like an inferred + dimension (reshape semantics) leading to incorrect elimination. + """ + torch.set_default_device("cuda") + x = torch.randn(16, 16) + + class SliceModel(torch.nn.Module): + + def forward(self, x): + base = x.clone() + src = torch.ones(15, 16) + y = torch.slice_scatter(base, src, dim=0, start=0, end=-1) + return x[0:-1, :], y + + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + pass_config=PassConfig(enable_noop=True), + )) + with vllm.config.set_current_vllm_config(vllm_config): + noop_pass = NoOpEliminationPass(vllm_config) + backend = TestBackend(noop_pass) + model = SliceModel() + ref = model(x) + compiled = torch.compile(model, backend=backend) + out = compiled(x) + torch.testing.assert_close(ref, out) + # The slice should remain (not a no-op). + assert backend.op_count(torch.ops.aten.slice.Tensor) == 1 + assert backend.op_count(torch.ops.aten.slice_scatter.default) == 1 diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 4888d4d1298e3..17e85e70218da 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -62,9 +62,6 @@ class NoOpEliminationPass(VllmInductorPass): scaled_mm: "f16[s0, 4096]" = ... at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...) out: "f16[s0, 4096]" = at[1] - - TODO(luka): This is currently tested in test_fusion, - but separate tests could be good. 
""" def __call__(self, graph: torch.fx.Graph): @@ -96,17 +93,19 @@ class NoOpEliminationPass(VllmInductorPass): # Invalid reshape args, skip continue - if self.all_dims_equivalent(shape, input_shape): + if self.reshape_all_dims_equivalent(shape, input_shape): node.replace_all_uses_with(input) graph.erase_node(node) count += 1 elif is_func(node, torch.ops.aten.slice.Tensor): + # python slicing semantics are different from reshape + # Don't treat -1 as inferred dimension input, dim_index, start, end = node.args[:4] input_shape = input.meta["val"].shape - i_dim = input_shape[dim_index] + output_shape = node.meta["val"].shape - if start == 0 and self.dims_equivalent(end, i_dim): + if output_shape == input_shape: node.replace_all_uses_with(input) graph.erase_node(node) count += 1 @@ -116,14 +115,7 @@ class NoOpEliminationPass(VllmInductorPass): base_shape = base.meta["val"].shape view_shape = view.meta["val"].shape - view_dim = view_shape[dim_index] - - # Check that view fully covers base and the full view is used - # (if the view fully covered the base after slicing but was not - # fully used, we could replace slice_scatter with a simple slice - # but that's a niche case). - if (base_shape == view_shape and start == 0 - and self.dims_equivalent(end, view_dim)): + if base_shape == view_shape: node.replace_all_uses_with(view) graph.erase_node(node) count += 1 @@ -132,13 +124,9 @@ class NoOpEliminationPass(VllmInductorPass): self.dump_graph(graph, "after_noop_elimination") self.end_and_log() - def all_dims_equivalent(self, dims: Iterable[Union[int, torch.fx.Node]], - i_dims: Iterable[Union[int, SymInt]]): - return all( - self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims)) - - def dims_equivalent(self, dim: Union[int, torch.fx.Node], - i_dim: Union[int, SymInt]) -> bool: + # ---------------------- Reshape helpers ---------------------- + def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node], + i_dim: Union[int, SymInt]) -> bool: """ This function checks if two dimensions are equivalent. :param dim: The dimension arg to reshape/slice @@ -156,10 +144,18 @@ class NoOpEliminationPass(VllmInductorPass): In case 3, the reshape dimension is a torch.fx.Node, and its value is a SymInt. That value is equal to the input dimension. - """ # Case 1 and 2 if dim == i_dim or dim == -1: return True # Case 3 return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim + + def reshape_all_dims_equivalent( + self, + dims: Iterable[Union[int, torch.fx.Node]], + i_dims: Iterable[Union[int, SymInt]], + ) -> bool: + return all( + self.reshape_dims_equivalent(s, i_s) + for s, i_s in zip(dims, i_dims)) From 7f6f2c1182b4f6e23d9d10715e25f60fbfd8e6ac Mon Sep 17 00:00:00 2001 From: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com> Date: Tue, 16 Sep 2025 02:28:35 +0200 Subject: [PATCH 295/584] `HuggingFace` -> `Hugging Face` in `Integration with Hugging Face` docs (#24889) --- docs/design/huggingface_integration.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/design/huggingface_integration.md b/docs/design/huggingface_integration.md index 5a7582c86d49f..412ce658b92a2 100644 --- a/docs/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -1,31 +1,31 @@ # Integration with Hugging Face -This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. +This document describes how vLLM integrates with Hugging Face libraries. 
We will explain step by step what happens under the hood when we run `vllm serve`. -Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. +Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`. 1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. - - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + - If the `model` argument is a Hugging Face model ID consisting of a username and model name, vLLM will first try to use the config file from the Hugging Face local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the Hugging Face cache works. + - If the `model` argument is a Hugging Face model ID but it is not found in the cache, vLLM will download the config file from the Hugging Face model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. 
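To make this resolution flow concrete before the details below, here is a minimal, illustrative sketch of the fallback logic. It is not vLLM's actual implementation, and `VLLM_SUPPORTED_CONFIGS` is a hypothetical stand-in for vLLM's internal `model_type` mapping:

```python
from transformers import AutoConfig, PretrainedConfig

# Hypothetical stand-in for vLLM's internal model_type -> config class map.
VLLM_SUPPORTED_CONFIGS: dict[str, type[PretrainedConfig]] = {}


def resolve_config(model: str, revision: str | None = None,
                   trust_remote_code: bool = False) -> PretrainedConfig:
    # Steps 1-2: fetch config.json (local path, HF cache, or hub download).
    config_dict, _ = PretrainedConfig.get_config_dict(model, revision=revision)
    # Step 3: prefer a model_type that is directly supported...
    config_cls = VLLM_SUPPORTED_CONFIGS.get(config_dict.get("model_type"))
    if config_cls is not None:
        return config_cls.from_pretrained(model, revision=revision)
    # ...otherwise fall back to Transformers' own resolution, which may
    # consult `auto_map` and, with trust_remote_code=True, run repo code.
    return AutoConfig.from_pretrained(model, revision=revision,
                                      trust_remote_code=trust_remote_code)
```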
There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: - - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. - - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + - Hugging Face also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, Hugging Face will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, Hugging Face will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. 4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. 5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs. -Beyond that, there are two more things vLLM depends on HuggingFace for. 
+Beyond that, there are two more things vLLM depends on Hugging Face for. -1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). +1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). -2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. +2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. 
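As a rough illustration of the safety point, safetensors files can be loaded without any pickle execution. This minimal sketch assumes a local `model.safetensors` file and is not vLLM's actual loader:

```python
import torch
from safetensors.torch import load_file

# Tensors are memory-mapped and materialized on the target device; unlike
# torch.load on a pickle checkpoint, no code embedded in the file can
# execute during loading.
state_dict: dict[str, torch.Tensor] = load_file("model.safetensors",
                                                device="cpu")
print(sorted(state_dict)[:5])  # peek at the first few parameter names
```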
 This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385).
 
-This completes the integration between vLLM and HuggingFace.
+This completes the integration between vLLM and Hugging Face.
 
-In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
+In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the Hugging Face model hub or a local directory. It uses the config class from either vLLM, Hugging Face transformers, or loads the config class from the model's repository.

From e95084308b2032748e226f420e93becaf3af64d5 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Mon, 15 Sep 2025 22:01:28 -0400
Subject: [PATCH 296/584] Updated CODEOWNERS for flashinfer, mla, fused_moe (#24906)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .github/CODEOWNERS | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c0858c309501a..e3dbd28fa91e9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,17 +2,20 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -30,6 +33,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @heheda12345
 /vllm/v1/kv_cache_interface.py @heheda12345
@@ -41,7 +45,8 @@
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/prefix_caching @comaniac @KuntaiDu
@@ -104,4 +109,4 @@ mkdocs.yaml @hmellor
/vllm/tests/v1/tpu @NickLucche # KVConnector installation files -/requirements/kv_connectors.txt @NickLucche \ No newline at end of file +/requirements/kv_connectors.txt @NickLucche From de2cc3d8678623f0b00be9526db888c1a1496660 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:03:29 -0400 Subject: [PATCH 297/584] [Deprecation] Remove DeepGEMM Old Symbol Wrapper (#24902) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/utils/deep_gemm.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 90cdd396209c7..38d92f01192b1 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -67,23 +67,6 @@ def _missing(*_: Any, **__: Any) -> NoReturn: "package to enable FP8 kernels.") -def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: - """Return the *new* symbol if it exists, otherwise the *old* one.""" - if hasattr(module, new): - return getattr(module, new) - if hasattr(module, old): - # TODO(wentao): deprecate old symbol in the future. - logger.warning_once( - "Found legacy DeepGEMM symbol `%s`. Please upgrade the `deep_gemm` " - "package so that `%s` is available. Support for the legacy symbol " - "will be removed in a future vLLM release.", - old, - new, - ) - return getattr(module, old) - return None - - _fp8_gemm_nt_impl: Callable[..., Any] | None = None _grouped_impl: Callable[..., Any] | None = None _grouped_masked_impl: Callable[..., Any] | None = None @@ -109,14 +92,9 @@ def _lazy_init() -> None: _dg = importlib.import_module("deep_gemm") - _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", - "gemm_fp8_fp8_bf16_nt") - _grouped_impl = _resolve_symbol( - _dg, "m_grouped_fp8_gemm_nt_contiguous", - "m_grouped_gemm_fp8_fp8_bf16_nt_contiguous") - _grouped_masked_impl = _resolve_symbol( - _dg, "fp8_m_grouped_gemm_nt_masked", - "m_grouped_gemm_fp8_fp8_bf16_nt_masked") + _fp8_gemm_nt_impl = getattr(_dg, "fp8_gemm_nt", None) + _grouped_impl = getattr(_dg, "m_grouped_fp8_gemm_nt_contiguous", None) + _grouped_masked_impl = getattr(_dg, "fp8_m_grouped_gemm_nt_masked", None) def fp8_gemm_nt(*args, **kwargs): From 2891603efdbd38ac7197d5f41b0e245fb6a53b82 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:05:12 -0400 Subject: [PATCH 298/584] [ROCm][Bugfix] Fix the case where there's bias (#24895) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- .../quantization/test_rocm_skinny_gemms.py | 31 +++++++++++++++++++ .../layers/quantization/utils/w8a8_utils.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 03d5d98739c50..a9b1c71ef0718 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -5,6 +5,8 @@ import torch import vllm._custom_ops as ops from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + rocm_per_tensor_w8a8_scaled_mm_impl) from vllm.platforms import current_platform DTYPES = [torch.bfloat16, torch.float16] @@ -116,3 +118,32 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): current_platform.get_cu_count()) assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n,k,m", 
NKM_FACTORS_WVSPLITK_FP8) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.skipif( + not (current_platform.is_rocm() and current_platform.supports_fp8()), + reason="only test for rocm fp8") +def test_rocm_per_tensor_w8a8_scaled_mm_impl(n, k, m, dtype, seed, use_bias): + torch.manual_seed(seed) + + A = torch.rand(n, k, device="cuda") + B = torch.rand(m, k, device="cuda") + + A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) + B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) + + bias = torch.rand(1, m, dtype=dtype, device="cuda") if use_bias else None + + output = rocm_per_tensor_w8a8_scaled_mm_impl(A, B.t(), dtype, scale_a, + scale_b, bias) + ref_out = torch._scaled_mm(A, + B.t(), + out_dtype=dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + assert torch.allclose(output, ref_out, rtol=0.01) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index e89a5e643b0e5..8cda1789e6c97 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -179,7 +179,7 @@ def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor, bias: torch.Tensor) -> torch.Tensor: from vllm.platforms.rocm import on_mi3xx if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( - ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0: + ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0 and bias is None: output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b, current_platform.get_cu_count()) else: From d96e11167dd269d092fb188bf164ab8bf33807bc Mon Sep 17 00:00:00 2001 From: Reza Barazesh <3146276+rzabarazesh@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:08:46 -0400 Subject: [PATCH 299/584] Add pytest-cov and .coveragerc (#24778) Signed-off-by: Reza Barazesh <rezabarazesh@meta.com> --- .coveragerc | 32 ++++++++++++++++++++++++++++++++ requirements/test.in | 1 + requirements/test.txt | 9 ++++++++- 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000000..bc6342956109b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,32 @@ +[run] +source = vllm +omit = + */tests/* + */test_* + */__pycache__/* + */build/* + */dist/* + */vllm.egg-info/* + */third_party/* + */examples/* + */benchmarks/* + */docs/* + +[report] +exclude_lines = + pragma: no cover + def __repr__ + if self.debug: + if settings.DEBUG + raise AssertionError + raise NotImplementedError + if 0: + if __name__ == .__main__.: + class .*\bProtocol\): + @(abc\.)?abstractmethod + +[html] +directory = htmlcov + +[xml] +output = coverage.xml diff --git a/requirements/test.in b/requirements/test.in index 744cfbe885278..451bd73879107 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -6,6 +6,7 @@ pytest-asyncio pytest-rerunfailures pytest-shard pytest-timeout +pytest-cov # testing utils backoff # required for phi4mm test diff --git a/requirements/test.txt b/requirements/test.txt index 5eebdc788aa3d..39040f210b2fd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -135,6 +135,8 @@ colorful==0.5.6 # via ray contourpy==1.3.0 # via matplotlib +coverage==7.10.6 + # via pytest-cov cramjam==2.9.0 # via fastparquet cupy-cuda12x==13.6.0 @@ -686,7 +688,9 @@ platformdirs==4.3.6 plotly==5.24.1 # via genai-perf pluggy==1.5.0 - # via pytest + # via + # pytest + # pytest-cov 
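As a usage note, with `pytest-cov` installed the `.coveragerc` above is picked up automatically; a coverage run could be driven from Python roughly like this (the target paths are illustrative):

```python
import sys

import pytest

# Equivalent to `pytest --cov=vllm --cov-report=term-missing tests/`;
# the [run] and [report] sections of .coveragerc apply automatically.
sys.exit(pytest.main(["--cov=vllm", "--cov-report=term-missing", "tests/"]))
```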
polars==1.29.0 # via mteb pooch==1.8.2 @@ -786,6 +790,7 @@ pytest==8.3.5 # buildkite-test-collector # genai-perf # pytest-asyncio + # pytest-cov # pytest-forked # pytest-mock # pytest-rerunfailures @@ -796,6 +801,8 @@ pytest==8.3.5 # terratorch pytest-asyncio==0.24.0 # via -r requirements/test.in +pytest-cov==6.3.0 + # via -r requirements/test.in pytest-forked==1.6.0 # via -r requirements/test.in pytest-mock==3.14.0 From b42566f44039ed8dc5f6c124203d5d71740c6fe0 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:10:55 -0400 Subject: [PATCH 300/584] [Bug] Fix `is_flashmla_supported` Check Error (#24774) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/attention/backends/flashmla.py | 15 ++------------- vllm/v1/attention/backends/mla/flashmla.py | 15 ++------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 411eb5413f53c..aeaa0ab631cfb 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -17,7 +17,6 @@ from vllm.attention.backends.mla.common import (MLACommonBackend, from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, get_mla_metadata, is_flashmla_supported) -from vllm.platforms.cuda import CudaPlatform class FlashMLABackend(MLACommonBackend): @@ -179,18 +178,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - assert is_flashmla_supported(), \ - "FlashMLA is not supported on this device" - - # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs - # context: - # https://github.com/deepseek-ai/FlashMLA/issues/83 - # https://github.com/vllm-project/vllm/issues/24513 - if CudaPlatform.has_device_capability(100): - raise NotImplementedError( - "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " - "Please use CUTLASS_MLA or TRITON_MLA instead. " - "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + is_supported, reason = is_flashmla_supported() + assert is_supported, reason unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 549af1a062252..150e38553e4bb 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -12,7 +12,6 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, is_flashmla_supported) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.platforms.cuda import CudaPlatform from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonDecodeMetadata, MLACommonImpl, @@ -156,18 +155,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - assert is_flashmla_supported(), \ - "FlashMLA is not supported on this device" - - # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs - # context: - # https://github.com/deepseek-ai/FlashMLA/issues/83 - # https://github.com/vllm-project/vllm/issues/24513 - if CudaPlatform.has_device_capability(100): - raise NotImplementedError( - "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " - "Please use CUTLASS_MLA or TRITON_MLA instead. 
" - "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + is_supported, reason = is_flashmla_supported() + assert is_supported, reason unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): From 3c96e7b8a1ffac31b947c753f4fe1d0cd14abf87 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:14:50 -0400 Subject: [PATCH 301/584] [CI] Small Accuracy Eval Test for Deepseek Model (#24259) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- .../evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml | 6 ++++++ tests/evals/gsm8k/configs/models-small.txt | 1 + 2 files changed, 7 insertions(+) create mode 100644 tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml new file mode 100644 index 0000000000000..7ec6a1e0be27f --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml @@ -0,0 +1,6 @@ +model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" +accuracy_threshold: 0.72 +num_questions: 1319 +num_fewshot: 5 +max_model_len: 4096 + diff --git a/tests/evals/gsm8k/configs/models-small.txt b/tests/evals/gsm8k/configs/models-small.txt index afd1065b9191b..7bce3f0004f7d 100644 --- a/tests/evals/gsm8k/configs/models-small.txt +++ b/tests/evals/gsm8k/configs/models-small.txt @@ -3,3 +3,4 @@ Llama-3.2-1B-Instruct-INT8-CT.yaml Llama-3-8B-Instruct-nonuniform-CT.yaml Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Qwen1.5-MoE-W4A16-CT.yaml +DeepSeek-V2-Lite-Instruct-FP8.yaml From 2942970d4462c1d938db4f142008e106702c0dc5 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin <markmc@redhat.com> Date: Tue, 16 Sep 2025 03:15:57 +0100 Subject: [PATCH 302/584] [Metrics] Hide deprecated metrics with gpu_ prefix (#24245) Signed-off-by: Mark McLoughlin <markmc@redhat.com> --- tests/entrypoints/openai/test_metrics.py | 21 ++++-- vllm/v1/metrics/loggers.py | 84 +++++++++++++----------- 2 files changed, 61 insertions(+), 44 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index a4e1aca8bcac2..0c9e0f3a51429 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [ "vllm:gpu_cache_usage_perc", "vllm:gpu_prefix_cache_queries", "vllm:gpu_prefix_cache_hits", + "vllm:kv_cache_usage_perc", + "vllm:prefix_cache_queries", + "vllm:prefix_cache_hits", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", @@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [ ] HIDDEN_DEPRECATED_METRICS: list[str] = [ + "vllm:gpu_cache_usage_perc", + "vllm:gpu_prefix_cache_queries", + "vllm:gpu_prefix_cache_hits", "vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_count", @@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool): running_requests, waiting_requests, kv_cache_usage = ( - _get_running_metrics_from_api(server)) + _get_running_metrics_from_api(server, use_v1)) # Expect no running requests or kvcache usage assert running_requests == 0 @@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, # Check that we have running requests running_requests, waiting_requests, kv_cache_usage = ( - _get_running_metrics_from_api(server)) + 
_get_running_metrics_from_api(server, use_v1)) # Expect running requests and kvcache usage assert running_requests > 0 @@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, # Verify running and waiting requests counts and KV cache usage are zero running_requests_after, waiting_requests_after, kv_cache_usage_after = ( - _get_running_metrics_from_api(server)) + _get_running_metrics_from_api(server, use_v1)) assert running_requests_after == 0,\ (f"Expected 0 running requests after abort, got " @@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, f"{kv_cache_usage_after}") -def _get_running_metrics_from_api(server: RemoteOpenAIServer): +def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): """Return (running_count, waiting_count, kv_cache_usage)""" response = requests.get(server.url_for("metrics")) @@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer): # Verify running and waiting requests counts and KV cache usage are zero running_requests, waiting_requests, kv_cache_usage = None, None, None + kv_cache_usage_metric = ("vllm:kv_cache_usage_perc" + if use_v1 else "vllm:gpu_cache_usage_perc") + for family in text_string_to_metric_families(response.text): if family.name == "vllm:num_requests_running": for sample in family.samples: @@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer): if sample.name == "vllm:num_requests_waiting": waiting_requests = sample.value break - elif family.name == "vllm:gpu_cache_usage_perc": + elif family.name == kv_cache_usage_metric: for sample in family.samples: - if sample.name == "vllm:gpu_cache_usage_perc": + if sample.name == kv_cache_usage_metric: kv_cache_usage = sample.value break diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 3f4699281ccea..b30036a6f8e80 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase): # # GPU cache # - # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc - # TODO: in 0.10, only enable if show_hidden_metrics=True - gauge_gpu_cache_usage = self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), - multiprocess_mode="mostrecent", - labelnames=labelnames) - self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, - engine_indexes, - model_name) + # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + gauge_gpu_cache_usage = self._gauge_cls( + name="vllm:gpu_cache_usage_perc", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine( + gauge_gpu_cache_usage, engine_indexes, model_name) - # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries - # TODO: in 0.10, only enable if show_hidden_metrics=True - counter_gpu_prefix_cache_queries = self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation=( - "GPU prefix cache queries, in terms of number of queried" - "tokens. 
DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames) - self.counter_gpu_prefix_cache_queries = make_per_engine( - counter_gpu_prefix_cache_queries, engine_indexes, model_name) + # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + counter_gpu_prefix_cache_queries = self._counter_cls( + name="vllm:gpu_prefix_cache_queries", + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." + ), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) - # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits - # TODO: in 0.10, only enable if show_hidden_metrics=True - counter_gpu_prefix_cache_hits = self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached " - "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames) - self.counter_gpu_prefix_cache_hits = make_per_engine( - counter_gpu_prefix_cache_hits, engine_indexes, model_name) + # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + counter_gpu_prefix_cache_hits = self._counter_cls( + name="vllm:gpu_prefix_cache_hits", + documentation=( + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", @@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_waiting[engine_idx].set( scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage[engine_idx].set( - scheduler_stats.kv_cache_usage) + if self.show_hidden_metrics: + self.gauge_gpu_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) self.gauge_kv_cache_usage[engine_idx].set( scheduler_stats.kv_cache_usage) - self.counter_gpu_prefix_cache_queries[engine_idx].inc( - scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits[engine_idx].inc( - scheduler_stats.prefix_cache_stats.hits) + if self.show_hidden_metrics: + self.counter_gpu_prefix_cache_queries[engine_idx].inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits[engine_idx].inc( + scheduler_stats.prefix_cache_stats.hits) self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) From e1279ef00fe2e21350600e17afe4164f7fdc46b5 Mon Sep 17 00:00:00 2001 From: Richard Zou <zou3519@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:25:50 -0400 Subject: [PATCH 303/584] [Docs] Update instructions for how to using existing torch binary (#24892) Signed-off-by: Richard Zou <zou3519@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/gpu/cuda.inc.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 
01c5f5fc02f3e..9e64c6f2540af 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -165,7 +165,19 @@ There are scenarios where the PyTorch dependency cannot be easily installed with - Building vLLM with PyTorch nightly or a custom PyTorch build. - Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. -To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation. +To build vLLM using an existing PyTorch installation: + +```bash +# install PyTorch first, either from PyPI or from source +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +uv pip install -r requirements/build.txt +uv pip install --no-build-isolation -e . +``` + +Alternatively: if you are exclusively using `uv` to create and manage virtual environments, it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) +for disabling build isolation for specific packages. vLLM can leverage this mechanism to specify `torch` as the package to disable build isolation for: ```bash # install PyTorch first, either from PyPI or from source From 0af3ce13558056b3637dd4661c89f4b037faa85a Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 15 Sep 2025 19:36:09 -0700 Subject: [PATCH 304/584] Upgrade flashinfer to 0.3.1 (#24470) Signed-off-by: Lu Fang <lufang@fb.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- docker/Dockerfile.nightly_torch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index e147b97f0e056..ae12ed0f7cabb 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. # build flashinfer for torch nightly from source around 10 mins -# release version: v0.2.2.post1 +# release version: v0.3.1 # todo(elainewy): cache flashinfer build result for faster build ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ @@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ echo "git clone flashinfer..." \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && cd flashinfer \ - && git checkout v0.2.2.post1 \ + && git checkout v0.3.1 \ && git submodule update --init --recursive \ && echo "finish git clone flashinfer..." \ && rm -rf build \ From 5206ab20ba7477e5457c2e64469590d548fa15e6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji <kunshang.ji@intel.com> Date: Tue, 16 Sep 2025 11:35:36 +0800 Subject: [PATCH 305/584] [XPU] Fix circular import error. 
(#24927) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/platforms/xpu.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 792115b33ea8d..67ef058df10f1 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -9,7 +9,6 @@ import torch import vllm.envs as envs from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS -from vllm.v1.attention.backends.utils import set_kv_cache_layout from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -164,11 +163,16 @@ class XPUPlatform(Platform): vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + from vllm.v1.attention.backends.utils import set_kv_cache_layout set_kv_cache_layout("NHD") logger.info("Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " "only NHD layout is supported by XPU attention kernels.") + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True + @classmethod def is_pin_memory_available(cls): return True From 759ef49b15149daaa0b2ba8900c1983e7e5e4514 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Mon, 15 Sep 2025 21:17:14 -0700 Subject: [PATCH 306/584] Remove V0 Encoder-Decoder Support (#24907) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- .../scripts/hardware_ci/run-cpu-test.sh | 1 - .buildkite/test-pipeline.yaml | 9 - docs/contributing/model/multimodal.md | 1 - docs/models/supported_models.md | 8 - docs/usage/v1_guide.md | 2 +- examples/offline_inference/dolphin.py | 311 --- examples/offline_inference/encoder_decoder.py | 195 -- .../encoder_decoder_multimodal.py | 114 +- examples/offline_inference/vision_language.py | 62 - .../vision_language_multi_image.py | 21 - tests/core/block/test_block_manager.py | 155 +- tests/core/test_scheduler_encoder_decoder.py | 105 - tests/distributed/test_pipeline_parallel.py | 3 - tests/encoder_decoder/__init__.py | 0 tests/encoder_decoder/test_e2e_correctness.py | 131 -- .../openai/test_encoder_decoder.py | 56 - tests/entrypoints/test_chat_utils.py | 199 -- .../attention/test_encoder_decoder_attn.py | 1105 ----------- tests/models/language/generation/test_bart.py | 222 --- .../models/language/generation/test_mbart.py | 123 -- .../multimodal/generation/test_florence2.py | 147 -- .../multimodal/generation/test_mllama.py | 768 -------- .../multimodal/processing/test_common.py | 5 - .../multimodal/processing/test_mllama.py | 72 - .../processing/test_tensor_schema.py | 1 - tests/models/registry.py | 16 +- tests/models/test_initialization.py | 4 - tests/models/test_registry.py | 1 - tests/test_config.py | 3 +- tests/utils_/test_utils.py | 28 - tests/v1/test_oracle.py | 21 - .../test_encoder_decoder_model_runner.py | 648 ------- vllm/config/__init__.py | 11 +- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/models/bart.py | 1319 ------------- vllm/model_executor/models/donut.py | 381 ---- vllm/model_executor/models/florence2.py | 1097 ----------- vllm/model_executor/models/mllama.py | 1697 ----------------- vllm/model_executor/models/registry.py | 11 +- vllm/multimodal/profiling.py | 2 +- vllm/test_utils.py | 1 - .../chat_templates/registry.py | 1 - vllm/transformers_utils/config.py | 5 - vllm/v1/engine/processor.py | 2 +- vllm/worker/enc_dec_model_runner.py | 553 ------ vllm/worker/utils.py | 49 - vllm/worker/worker.py | 6 +- 47 files changed, 13 insertions(+), 9661 deletions(-) delete mode 100644 
examples/offline_inference/dolphin.py delete mode 100644 examples/offline_inference/encoder_decoder.py delete mode 100644 tests/core/test_scheduler_encoder_decoder.py delete mode 100644 tests/encoder_decoder/__init__.py delete mode 100644 tests/encoder_decoder/test_e2e_correctness.py delete mode 100644 tests/entrypoints/openai/test_encoder_decoder.py delete mode 100644 tests/kernels/attention/test_encoder_decoder_attn.py delete mode 100644 tests/models/language/generation/test_bart.py delete mode 100644 tests/models/language/generation/test_mbart.py delete mode 100644 tests/models/multimodal/generation/test_florence2.py delete mode 100644 tests/models/multimodal/generation/test_mllama.py delete mode 100644 tests/models/multimodal/processing/test_mllama.py delete mode 100644 tests/worker/test_encoder_decoder_model_runner.py delete mode 100644 vllm/model_executor/models/bart.py delete mode 100644 vllm/model_executor/models/donut.py delete mode 100644 vllm/model_executor/models/florence2.py delete mode 100644 vllm/model_executor/models/mllama.py delete mode 100644 vllm/worker/enc_dec_model_runner.py delete mode 100644 vllm/worker/utils.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 0f734763f13fd..64943d2a15a79 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -66,7 +66,6 @@ function cpu_tests() { pytest -x -v -s tests/models/language/pooling -m cpu_model pytest -x -v -s tests/models/multimodal/generation \ - --ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_pixtral.py \ -m cpu_model" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c3b5aa2907a49..df95fcaa04382 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -549,15 +549,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/encoder_decoder - commands: - - pytest -v -s encoder_decoder - - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index dc742c8fcf2cd..87d34d207cde3 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -840,7 +840,6 @@ Some HF processors directly insert feature tokens without replacing anything in Examples: - BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py> -- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py> - Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py> ### Handling prompt updates unrelated to multi-modal data diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index cc3ee8b788ddb..73834ddd0c5d6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -331,8 +331,6 @@ th { | `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. 
| | ✅︎ | ✅︎ | -| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | -| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | @@ -426,9 +424,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -!!! note - Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. - ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. @@ -625,9 +620,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | -| `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | | | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | -| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | @@ -654,7 +647,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index d404c87e8f5a7..340aaf54bb720 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -120,7 +120,7 @@ Please note that prefix caching is not yet supported for any of the above models Whisper is supported. Other models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, -`MllamaForConditionalGeneration`) are not yet supported. +`MllamaForConditionalGeneration`) are not supported. ### Features diff --git a/examples/offline_inference/dolphin.py b/examples/offline_inference/dolphin.py deleted file mode 100644 index d2ba27cd1e027..0000000000000 --- a/examples/offline_inference/dolphin.py +++ /dev/null @@ -1,311 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import os -from dataclasses import dataclass - -import cv2 -import numpy as np -import regex as re -from PIL import Image -from transformers import DonutProcessor - -from vllm import LLM, SamplingParams -from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt -from vllm.multimodal.utils import fetch_image - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -@dataclass -class ImageDimensions: - original_w: int - original_h: int - padded_w: int - padded_h: int - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def map_to_original_coordinates( - x1, y1, x2, y2, dims: ImageDimensions -) -> tuple[int, int, int, int]: - try: - top = (dims.padded_h - dims.original_h) // 2 - left = (dims.padded_w - dims.original_w) // 2 - orig_x1 = max(0, x1 - left) - orig_y1 = max(0, y1 - top) - orig_x2 = min(dims.original_w, x2 - left) - orig_y2 = min(dims.original_h, y2 - top) - if orig_x2 <= orig_x1: - orig_x2 = min(orig_x1 + 1, dims.original_w) - if orig_y2 <= orig_y1: - orig_y2 = min(orig_y1 + 1, dims.original_h) - return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2) - except Exception as e: - print(f"map_to_original_coordinates error: {str(e)}") - return 0, 0, min(100, dims.original_w), min(100, dims.original_h) - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def adjust_box_edges(image, boxes: list[list[float]], max_pixels=15, threshold=0.2): - if isinstance(image, str): - image = cv2.imread(image) - img_h, img_w = image.shape[:2] - new_boxes = [] - for box in boxes: - best_box = copy.deepcopy(box) - - def check_edge(img, current_box, i, is_vertical): - edge = current_box[i] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold( - gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU - ) - if is_vertical: - line = binary[current_box[1] : current_box[3] + 1, edge] - else: - line = binary[edge, current_box[0] : current_box[2] + 1] - transitions = np.abs(np.diff(line)) - return np.sum(transitions) / len(transitions) - - edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)] - current_box = copy.deepcopy(box) - current_box[0] = min(max(current_box[0], 0), img_w - 1) - current_box[1] = min(max(current_box[1], 0), img_h - 1) - current_box[2] = min(max(current_box[2], 0), img_w - 1) - current_box[3] = min(max(current_box[3], 0), img_h - 1) - - for i, direction, is_vertical in edges: - 
best_score = check_edge(image, current_box, i, is_vertical) - if best_score <= threshold: - continue - for step in range(max_pixels): - current_box[i] += direction - if i == 0 or i == 2: - current_box[i] = min(max(current_box[i], 0), img_w - 1) - else: - current_box[i] = min(max(current_box[i], 0), img_h - 1) - score = check_edge(image, current_box, i, is_vertical) - if score < best_score: - best_score = score - best_box = copy.deepcopy(current_box) - if score <= threshold: - break - new_boxes.append(best_box) - return new_boxes - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None): - try: - x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h) - x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h) - x1, y1, x2, y2 = ( - max(0, min(x1, dims.padded_w - 1)), - max(0, min(y1, dims.padded_h - 1)), - max(0, min(x2, dims.padded_w)), - max(0, min(y2, dims.padded_h)), - ) - if x2 <= x1: - x2 = min(x1 + 1, dims.padded_w) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]]) - x1, y1, x2, y2 = new_boxes[0] - x1, y1, x2, y2 = ( - max(0, min(x1, dims.padded_w - 1)), - max(0, min(y1, dims.padded_h - 1)), - max(0, min(x2, dims.padded_w)), - max(0, min(y2, dims.padded_h)), - ) - if x2 <= x1: - x2 = min(x1 + 1, dims.padded_w) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - if previous_box is not None: - prev_x1, prev_y1, prev_x2, prev_y2 = previous_box - if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1): - y1 = prev_y2 - y1 = min(y1, dims.padded_h - 1) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - new_previous_box = [x1, y1, x2, y2] - orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates( - x1, y1, x2, y2, dims - ) - return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box - except Exception as e: - print(f"process_coordinates error: {str(e)}") - orig_x1, orig_y1, orig_x2, orig_y2 = ( - 0, - 0, - min(100, dims.original_w), - min(100, dims.original_h), - ) - return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100] - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def prepare_image(image) -> tuple[np.ndarray, ImageDimensions]: - try: - image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) - original_h, original_w = image_cv.shape[:2] - max_size = max(original_h, original_w) - top = (max_size - original_h) // 2 - bottom = max_size - original_h - top - left = (max_size - original_w) // 2 - right = max_size - original_w - left - padded_image = cv2.copyMakeBorder( - image_cv, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0) - ) - padded_h, padded_w = padded_image.shape[:2] - dimensions = ImageDimensions( - original_w=original_w, - original_h=original_h, - padded_w=padded_w, - padded_h=padded_h, - ) - return padded_image, dimensions - except Exception as e: - print(f"prepare_image error: {str(e)}") - h, w = image.height, image.width - dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h) - return np.zeros((h, w, 3), dtype=np.uint8), dimensions - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def parse_layout_string(bbox_str): - """Parse layout string using regular expressions""" - pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)" - matches = re.finditer(pattern, bbox_str) - - parsed_results = [] - for match in 
matches: - coords = [float(match.group(i)) for i in range(1, 5)] - label = match.group(5).strip() - parsed_results.append((coords, label)) - - return parsed_results - - -model_id = "ByteDance/Dolphin" - -# The input image size for Dolphin is 896 x 896, -# and the patch_size is 4 x 4. -# Therefore, the initial number of patches is: -# Height: 896 / 4 = 224 patches -# Width: 896 / 4 = 224 patches - -# The Dolphin model uses a staged downsampling approach, -# defined by the "depths": [2, 2, 14, 2] configuration. -# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed, -# which halves the feature map's dimensions (dividing both height and width by 2). -# Before Stage 2: The size changes from 224 x 224 to (224/2) x (224/2) = 112 x 112. -# Before Stage 3: The size changes from 112 x 112 to (112/2) x (112/2) = 56 x 56. -# Before Stage 4: The size changes from 56 x 56 to (56/2) x (56/2) = 28 x 28. - -# Because vLLM needs to fill the image features with an encoder_prompt, -# and the encoder_prompt will have `<pad>` tokens added when tokenized, -# we need to construct an encoder_prompt with a length of 28 x 28 - 1 = 783. -encoder_prompt = "".join(["0"] * 783) -sampling_params = SamplingParams( - temperature=0.0, - max_tokens=2048, -) - -processor = DonutProcessor.from_pretrained(model_id) -llm = LLM( - model=model_id, - dtype="float16", - max_num_seqs=8, - hf_overrides={"architectures": ["DonutForConditionalGeneration"]}, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--image_path", type=str, default=None, help="Path to a local image file." -) -args = parser.parse_args() - -if args.image_path: - if not os.path.exists(args.image_path): - raise FileNotFoundError(f"Error: File not found at {args.image_path}") - image = Image.open(args.image_path).convert("RGB") -else: - image = fetch_image( - "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg" - ) - - -prompt = "Parse the reading order of this document. " -decoder_prompt = f"<s>{prompt}<Answer/>" -decoder_prompt_tokens = TokensPrompt( - prompt_token_ids=processor.tokenizer(decoder_prompt, add_special_tokens=False)[ - "input_ids" - ] -) -enc_dec_prompt = ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt(prompt=encoder_prompt, multi_modal_data={"image": image}), - decoder_prompt=decoder_prompt_tokens, -) -layout_outputs = llm.generate(prompts=enc_dec_prompt, sampling_params=sampling_params) -layout_result_str = layout_outputs[0].outputs[0].text -print(f"Layout analysis output:\n{layout_result_str}") - -padded_image, dims = prepare_image(image) -layout_results = parse_layout_string(layout_result_str) -text_table_elements = [] -previous_box = None -reading_order = 0 -for bbox_coords, label in layout_results: - if label == "fig": - continue - try: - x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = ( - process_coordinates(bbox_coords, padded_image, dims, previous_box) - ) - cropped = padded_image[y1:y2, x1:x2] - if cropped.size > 0 and cropped.shape[0] > 3 and cropped.shape[1] > 3: - pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)) - prompt_ocr = ( - "Parse the table in the image. " - if label == "tab" - else "Read text in the image. 
" - ) - text_table_elements.append( - { - "crop": pil_crop, - "prompt": prompt_ocr, - "reading_order": reading_order, - } - ) - reading_order += 1 - except Exception as e: - print(f"Error processing bbox (label: {label}): {str(e)}") - continue - -if text_table_elements: - batch_prompts = [] - for elem in text_table_elements: - decoder_prompt_str = f"<s>{elem['prompt']}<Answer/>" - decoder_prompt_tokens = TokensPrompt( - prompt_token_ids=processor.tokenizer( - decoder_prompt_str, add_special_tokens=False - )["input_ids"] - ) - enc_dec_prompt = ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt( - prompt=encoder_prompt, multi_modal_data={"image": elem["crop"]} - ), - decoder_prompt=decoder_prompt_tokens, - ) - batch_prompts.append(enc_dec_prompt) - batch_outputs = llm.generate(prompts=batch_prompts, sampling_params=sampling_params) - for i, output in enumerate(batch_outputs): - text_table_elements[i]["text"] = output.outputs[0].text.strip() - -print("------" * 8) -text_table_elements.sort(key=lambda x: x["reading_order"]) -for elem in text_table_elements: - print(elem.get("text", "")) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py deleted file mode 100644 index 957db3c23b863..0000000000000 --- a/examples/offline_inference/encoder_decoder.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART and mBART. - -This script is refactored to allow model selection via command-line arguments. - -NOTE: This example is not yet supported in V1. -""" - -import argparse -from typing import NamedTuple, Optional - -from vllm import LLM, SamplingParams -from vllm.inputs import ( - ExplicitEncoderDecoderPrompt, - TextPrompt, - TokensPrompt, - zip_enc_dec_prompts, -) - - -class ModelRequestData(NamedTuple): - """ - Holds the configuration for a specific model, including its - HuggingFace ID and the prompts to use for the demo. - """ - - model_id: str - encoder_prompts: list - decoder_prompts: list - hf_overrides: Optional[dict] = None - - -def get_bart_config() -> ModelRequestData: - """ - Returns the configuration for facebook/bart-large-cnn. - This uses the exact test cases from the original script. - """ - encoder_prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "An encoder prompt", - ] - decoder_prompts = [ - "A decoder prompt", - "Another decoder prompt", - ] - return ModelRequestData( - model_id="facebook/bart-large-cnn", - encoder_prompts=encoder_prompts, - decoder_prompts=decoder_prompts, - ) - - -def get_mbart_config() -> ModelRequestData: - """ - Returns the configuration for facebook/mbart-large-en-ro. - This uses prompts suitable for an English-to-Romanian translation task. 
- """ - encoder_prompts = [ - "The quick brown fox jumps over the lazy dog.", - "How are you today?", - ] - decoder_prompts = ["", ""] - hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} - return ModelRequestData( - model_id="facebook/mbart-large-en-ro", - encoder_prompts=encoder_prompts, - decoder_prompts=decoder_prompts, - hf_overrides=hf_overrides, - ) - - -MODEL_GETTERS = { - "bart": get_bart_config, - "mbart": get_mbart_config, -} - - -def create_all_prompt_types( - encoder_prompts_raw: list, - decoder_prompts_raw: list, - tokenizer, -) -> list: - """ - Generates a list of diverse prompt types for demonstration. - This function is generic and uses the provided raw prompts - to create various vLLM input objects. - """ - text_prompt_raw = encoder_prompts_raw[0] - text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) - tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode( - encoder_prompts_raw[2 % len(encoder_prompts_raw)] - ) - ) - - decoder_tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) - ) - single_prompt_examples = [ - text_prompt_raw, - text_prompt, - tokens_prompt, - ] - explicit_pair_examples = [ - ExplicitEncoderDecoderPrompt( - encoder_prompt=text_prompt_raw, - decoder_prompt=decoder_tokens_prompt, - ), - ExplicitEncoderDecoderPrompt( - encoder_prompt=text_prompt, - decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], - ), - ExplicitEncoderDecoderPrompt( - encoder_prompt=tokens_prompt, - decoder_prompt=text_prompt, - ), - ] - zipped_prompt_list = zip_enc_dec_prompts( - encoder_prompts_raw, - decoder_prompts_raw, - ) - return single_prompt_examples + explicit_pair_examples + zipped_prompt_list - - -def create_sampling_params() -> SamplingParams: - """Create a sampling params object.""" - return SamplingParams( - temperature=0, - top_p=1.0, - min_tokens=0, - max_tokens=30, - ) - - -def print_outputs(outputs: list): - """Formats and prints the generation outputs.""" - print("-" * 80) - for i, output in enumerate(outputs): - prompt = output.prompt - encoder_prompt = output.encoder_prompt - generated_text = output.outputs[0].text - print(f"Output {i + 1}:") - print(f"Encoder Prompt: {encoder_prompt!r}") - print(f"Decoder Prompt: {prompt!r}") - print(f"Generated Text: {generated_text!r}") - print("-" * 80) - - -def main(args): - """Main execution function.""" - model_key = args.model - if model_key not in MODEL_GETTERS: - raise ValueError( - f"Unknown model: {model_key}. " - f"Available models: {list(MODEL_GETTERS.keys())}" - ) - config_getter = MODEL_GETTERS[model_key] - model_config = config_getter() - - print(f"🚀 Running demo for model: {model_config.model_id}") - llm = LLM( - model=model_config.model_id, - dtype="float", - hf_overrides=model_config.hf_overrides, - ) - tokenizer = llm.llm_engine.get_tokenizer_group() - prompts = create_all_prompt_types( - encoder_prompts_raw=model_config.encoder_prompts, - decoder_prompts_raw=model_config.decoder_prompts, - tokenizer=tokenizer, - ) - sampling_params = create_sampling_params() - outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="A flexible demo for vLLM encoder-decoder models." 
- ) - parser.add_argument( - "--model", - "-m", - type=str, - default="bart", - choices=MODEL_GETTERS.keys(), - help="The short name of the model to run.", - ) - args = parser.parse_args() - main(args) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 35e9203d1caf0..4a1b0c40604b2 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -13,8 +13,6 @@ from typing import NamedTuple from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset -from vllm.assets.image import ImageAsset -from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser @@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple): prompts: Sequence[PromptType] -def run_donut(): - engine_args = EngineArgs( - model="naver-clova-ix/donut-base-finetuned-docvqa", - max_num_seqs=2, - limit_mm_per_prompt={"image": 1}, - dtype="float16", - hf_overrides={"architectures": ["DonutForConditionalGeneration"]}, - ) - - # The input image size for donut-base-finetuned-docvqa is 2560 x 1920, - # and the patch_size is 4 x 4. - # Therefore, the initial number of patches is: - # Height: 1920 / 4 = 480 patches - # Width: 2560 / 4 = 640 patches - # The Swin model uses a staged downsampling approach, - # defined by the "depths": [2, 2, 14, 2] configuration. - # Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed, - # which halves the feature map's dimensions (dividing both height and width by 2). - # Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320. - # Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160. - # Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80. - # Because vLLM needs to fill the image features with an encoder_prompt, - # and the encoder_prompt will have `<pad>` tokens added when tokenized, - # we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799. 
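Editor's note: the patch arithmetic in the comment block above (and in the analogous Dolphin block earlier in this patch: 896 x 896 -> 783 tokens) can be sanity-checked with a short standalone sketch. encoder_prompt_len is a hypothetical helper written for this note only; it is not part of vLLM or of either example script.

    def encoder_prompt_len(height: int, width: int,
                           patch_size: int = 4,
                           num_merging_stages: int = 3) -> int:
        # Patch grid after the initial patch embedding.
        h, w = height // patch_size, width // patch_size
        # A "Patch Merging" step halves both spatial dimensions before
        # stages 2, 3 and 4 ("depths": [2, 2, 14, 2]).
        for _ in range(num_merging_stages):
            h, w = h // 2, w // 2
        # One position is left free for the `<pad>` token the tokenizer adds.
        return h * w - 1

    assert encoder_prompt_len(896, 896) == 783      # Dolphin
    assert encoder_prompt_len(1920, 2560) == 4799   # donut-base-finetuned-docvqa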
- prompts = [ - { - "encoder_prompt": { - "prompt": "".join(["$"] * 4799), - "multi_modal_data": { - "image": fetch_image( - "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg" - ) # noqa: E501 - }, - }, - "decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>", # noqa: E501 - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -def run_florence2(): - engine_args = EngineArgs( - model="microsoft/Florence-2-large", - tokenizer="Isotr0py/Florence-2-tokenizer", - max_num_seqs=8, - trust_remote_code=True, - limit_mm_per_prompt={"image": 1}, - dtype="half", - ) - - prompts = [ - { # implicit prompt with task token - "prompt": "<DETAILED_CAPTION>", - "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image}, - }, - { # explicit encoder/decoder prompt - "encoder_prompt": { - "prompt": "Describe in detail what is shown in the image.", - "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image}, - }, - "decoder_prompt": "", - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -def run_mllama(): - engine_args = EngineArgs( - model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={"image": 1}, - dtype="half", - ) - - prompts = [ - { # Implicit prompt - "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image, - }, - }, - { # Explicit prompt - "encoder_prompt": { - "prompt": "<|image|>", - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image, - }, - }, - "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - def run_whisper(): os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -166,9 +57,6 @@ def run_whisper(): model_example_map = { - "donut": run_donut, - "florence2": run_florence2, - "mllama": run_mllama, "whisper": run_whisper, } @@ -182,7 +70,7 @@ def parse_args(): "--model-type", "-m", type=str, - default="mllama", + default="whisper", choices=model_example_map.keys(), help='Huggingface "model_type".', ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 67a978ad2aae9..929df8d8bebd9 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -204,28 +204,6 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: ) -# Florence2 -def run_florence2(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - engine_args = EngineArgs( - model="microsoft/Florence-2-large", - tokenizer="Isotr0py/Florence-2-tokenizer", - max_model_len=4096, - max_num_seqs=2, - trust_remote_code=True, - dtype="bfloat16", - limit_mm_per_prompt={modality: 1}, - ) - - prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Fuyu def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1008,44 +986,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: ) -# LLama 3.2 -def run_mllama(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - 
# Note: The default setting of max_num_seqs (256) and - # max_model_len (131072) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": question}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1665,7 +1605,6 @@ model_example_map = { "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "ernie45_vl": run_ernie45_vl, - "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, "gemma3n": run_gemma3n, @@ -1691,7 +1630,6 @@ model_example_map = { "minicpmv": run_minicpmv, "minimax_vl_01": run_minimax_vl_01, "mistral3": run_mistral3, - "mllama": run_mllama, "molmo": run_molmo, "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 01c2905cf26d8..51b41f34b2ff6 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -637,26 +637,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - img_prompt = "Given the first image <|image|> and the second image<|image|>" - prompt = f"<|begin_of_text|>{img_prompt}, {question}?" 
- return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "nvidia/NVLM-D-72B" @@ -1253,7 +1233,6 @@ model_example_map = { "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, "mistral3": load_mistral3, - "mllama": load_mllama, "NVLM_D": load_nvlm_d, "ovis": load_ovis, "ovis2_5": load_ovis2_5, diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index 9eed264fd7d43..24499b9ad4e9c 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -3,15 +3,12 @@ import pytest -from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager import SelfAttnBlockSpaceManager from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import (create_dummy_prompt, create_seq_group, - create_seq_group_encoder_decoder) +from ..utils import create_dummy_prompt, create_seq_group @pytest.mark.parametrize("block_size", [16]) @@ -58,156 +55,6 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, assert can_allocate_result == AllocStatus.LATER -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) -@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - ) - num_watermark_blocks = int(watermark * num_gpu_blocks) - - num_output_blocks_per_seq = 1 - - # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but - # the current implementation assumes all seqs are new prompts / don't have - # different output lens. - num_output_blocks = num_output_blocks_per_seq - - for bdx, num_prompt_blocks in enumerate( - range(1, num_gpu_blocks - num_output_blocks)): - num_cross_blocks_per_seq = num_prompt_blocks - - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id=str(bdx)) - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - can_allocate_result = block_manager.can_allocate(seq_group) - - num_required_blocks = num_prompt_blocks + \ - num_output_blocks + \ - num_cross_blocks_per_seq - - if num_gpu_blocks - num_required_blocks < num_watermark_blocks: - assert can_allocate_result == AllocStatus.NEVER - elif num_gpu_blocks >= num_required_blocks: - assert can_allocate_result == AllocStatus.OK - else: - assert can_allocate_result == AllocStatus.LATER - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - ''' - SWA short for Sliding Window Attention. - - At time of writing block manager does not support SWA. 
- - However even when SWA is implemented for block manager, - there will still most likely be a separate workstream required - to enable SWA for encoder/decoder models. - - Therefore this test enforces that one of the following cases - holds true: - 1. Block manager does not support SWA at all (true at time of writing) - 2. Block manager fails with NotImplementedError when SWA is enabled - AND a SequenceGroup with an encoder sequence (i.e. in support of an - encoder/decoder model) is passed into can_allocate() as an argument - - The setup for this test is a stripped-down version of - test_can_allocate_seq_group_encoder_decoder() - ''' - - with pytest.raises((NotImplementedError, AssertionError)) as exc_info: - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - sliding_window=5 # SWA - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - block_manager.can_allocate(seq_group) - - # Assert that either - # 1. Block manager constructor fails with assertion that sliding window - # is not yet supported (most likely near-term outcome at time of - # writing), or - # 2. can_allocate() fails with NotImplementedError due to the combination - # of encoder/decoder and sliding window attention - if isinstance(exc_info.value, NotImplementedError): - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA - elif isinstance(exc_info.value, AssertionError): - assert str(exc_info.value) == "Sliding window not yet supported" - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_prefix_cache( - block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, - watermark: float): - - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - enable_caching=True # Prefix cache - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - # Assert that can_allocate() fails with NotImplementedError - # due to the combination of encoder/decoder and prefix cache - with pytest.raises(NotImplementedError) as exc_info: - block_manager.can_allocate(seq_group) - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE - - @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py deleted file mode 100644 index 20cc083ec8db4..0000000000000 --- a/tests/core/test_scheduler_encoder_decoder.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest  # noqa - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup - -from .utils import (append_new_token, create_dummy_prompt_encoder_decoder, - get_sequence_groups, schedule_and_update_computed_tokens) - - -def test_scheduler_schedule_simple_encoder_decoder(): - ''' - Test basic scheduler functionality in the context - of an encoder/decoder model. Focus on testing - enc/dec-specific functionality, since tests already - exist for decoder-only functionality. - - Test behavior: - * Construct Scheduler - * Construct dummy encoder/decoder sequence groups - * Add dummy seq groups to scheduler backlog - * Schedule the next seq group & validate: - * Cross-attn block tables - * Updated states of seq groups - * Number of batched tokens - * Number of blocks to copy/swap-in/swap-out - * Number of scheduled seq groups - * Repeat for both prefill- and decode-phase - * Abort scheduled seq groups - * Assert that aborted seq groups no longer appear in - cross-attention block table - ''' - - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=num_seq_group, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group - cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - req_id_list = [] - for i in range(num_seq_group): - req_id = str(i) - req_id_list.append(req_id) - _, _, seq_group = create_dummy_prompt_encoder_decoder( - req_id, block_size, block_size, block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prefill. - num_tokens = block_size * num_seq_group - seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group cross-attention block tables are - # registered with the block manager - assert all([(req_id in scheduler.block_manager.cross_block_tables) - for req_id in req_id_list]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate number of batched tokens - assert out.num_batched_tokens == num_tokens - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Schedule seq groups decode. 
- seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group metadata includes encoder attention - # and cross-attention metadata - assert all([ - not ((seq_group_meta.encoder_seq_data is None) or - (seq_group_meta.cross_block_table is None)) - for seq_group_meta in seq_group_meta_list - ]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate there is one batched token per seq group - assert out.num_batched_tokens == num_seq_group - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate that all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Abort sequences - for req_id in req_id_list: - scheduler.abort_seq_group(req_id) - # - Verify that sequence group cross-attention block tables are - # NO LONGER registered with the block manager - assert req_id not in scheduler.block_manager.cross_block_tables diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 08702e8c061fa..9da9672d95970 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -242,9 +242,6 @@ MULTIMODAL_MODELS = { "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(), - # [Encoder-decoder] - # TODO: Implement PP - # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), } # yapf: enable diff --git a/tests/encoder_decoder/__init__.py b/tests/encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py deleted file mode 100644 index 3cf4c377fb581..0000000000000 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests to verify the correctness of the encoder-decoder framework - -Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. -""" -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs - -from ..conftest import DecoderPromptType -from ..models.utils import check_logprobs_close - -LIST_ENC_DEC_SUPPORTED_BACKENDS = [ - _Backend.XFORMERS, _Backend.FLASH_ATTN, None -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "</s>" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "<s>" + hf_output_str - - return output_ids, hf_output_str, out_logprobs - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Fixture to clear backend cache before each test.""" - _cached_get_attn_backend.cache_clear() # Clear the cache - yield # This allows the test to run - - -@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.skipif( - current_platform.is_cpu(), - reason="CPU backend is not currently supported with encoder/decoder models" -) -@pytest.mark.skip(reason="bart not supported in V1") -def test_encoder_decoder_e2e( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - decoder_prompt_type: DecoderPromptType, - enforce_eager: bool, - attn_backend: _Backend, -) -> None: - ''' - End-to-End (E2E) test for the encoder-decoder framework. - This test evaluates the encoder-decoder functionality using the BART - model. We compare the outputs of the Hugging Face and vLLM - implementations to ensure that both implementations produce consistent - and correct results. 
- ''' - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py deleted file mode 100644 index 75612962c95f7..0000000000000 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import openai -import pytest -import pytest_asyncio - -from ...utils import RemoteOpenAIServer - -MODEL_NAME = "facebook/bart-base" - - -@pytest.fixture(scope="module") -def server(): - args = [ - "--dtype", - "bfloat16", - "--enforce-eager", - ] - - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.skip(reason="bart is not yet supported in V1") -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=2, total_tokens=7) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index dd33f5c8c1d8e..84dab737ece26 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -20,7 +20,6 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, parse_chat_messages_futures, resolve_chat_template_content_format, resolve_hf_chat_template) -from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from 
vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) @@ -38,7 +37,6 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B" -MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B" MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @@ -125,27 +123,6 @@ def qwen25omni_tokenizer(): ) -@pytest.fixture(scope="module") -def mllama_model_config(): - return ModelConfig( - MLLAMA_MODEL_ID, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }, - ) - - -@pytest.fixture(scope="module") -def mllama_tokenizer(): - return TokenizerGroup( - MLLAMA_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - ) - - @pytest.fixture(scope="function") def mistral_model_config(): return ModelConfig( @@ -2249,180 +2226,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ) -### Mllama currently wraps images / texts as interleaved dictionaries -def test_mllama_single_image( - mllama_model_config, - mllama_tokenizer, - image_url, -): - """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data, mm_uuids = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of this image is:" - }, - { - "image_url": image_url - }, - ], - }], - mllama_model_config, - mllama_tokenizer, - content_format="openai", - ) - _assert_mm_data_is_image_input(mm_data, 1) - _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) - assert conversation == [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of this image is:" - }, - { - "type": "image" - }, - ], - }] - - -def test_mllama_interleaved_images( - mllama_model_config, - mllama_tokenizer, - image_url, -): - """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data, mm_uuids = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:", - }, - { - "image_url": image_url - }, - { - "type": "text", - "text": "The content of the second image is:", - }, - { - "image_url": image_url - }, - ], - }], - mllama_model_config, - mllama_tokenizer, - content_format="openai", - ) - _assert_mm_data_is_image_input(mm_data, 2) - _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) - assert conversation == [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:" - }, - { - "type": "image" - }, - { - "type": "text", - "text": "The content of the second image is:" - }, - { - "type": "image" - }, - ], - }] - - -@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID]) -def test_multimodal_image_parsing_matches_hf(model, image_url): - """Checks end to end hf alignment for multimodal [image] parsing.""" - - def get_conversation(is_hf: bool): - img_part = {"type": "image_url", "image_url": {"url": image_url}} - if is_hf: - img_part = {"type": "image"} - return [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:", - }, - img_part, - { - "type": "text", - "text": "The content of the second image is:", - }, - img_part, - { - "type": "text", - "text": "What animal is in the first image?", - }, - ], - }] - - # Build a config for the model - 
model_config = ModelConfig( - model, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }, - ) - - # Build the tokenizer group and grab the underlying tokenizer - tokenizer_group = TokenizerGroup( - model, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - trust_remote_code=model_config.trust_remote_code, - ) - tokenizer = tokenizer_group.tokenizer - - # Build and parse a conversation with {"type": "image"} using the tokenizer - hf_conversation = get_conversation(is_hf=True) - hf_result = tokenizer.apply_chat_template( - hf_conversation, - tokenize=False, - add_generation_prompt=True, - ) - - # Now parse with vLLMs chat utils & apply the template - vllm_conversation = get_conversation(is_hf=False) - conversation, _, _ = parse_chat_messages( - vllm_conversation, - model_config, - tokenizer_group, - content_format="openai", - ) - - vllm_result = apply_hf_chat_template( - tokenizer=tokenizer, - conversation=conversation, - chat_template=None, - model_config=model_config, - tools=None, - add_generation_prompt=True, - ) - - assert hf_result == vllm_result - - @pytest.mark.parametrize( "model", [ @@ -2486,7 +2289,6 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): (QWEN25VL_MODEL_ID, "openai"), (ULTRAVOX_MODEL_ID, "string"), (QWEN2AUDIO_MODEL_ID, "openai"), - (MLLAMA_MODEL_ID, "openai"), (LLAMA_GUARD_MODEL_ID, "openai")], ) # yapf: enable @@ -2545,7 +2347,6 @@ def test_resolve_content_format_hf_defined(model, expected_format): [("Salesforce/blip2-opt-2.7b", "string"), ("facebook/chameleon-7b", "string"), ("deepseek-ai/deepseek-vl2-tiny", "string"), - ("microsoft/Florence-2-base", "string"), ("adept/fuyu-8b", "string"), ("google/paligemma-3b-mix-224", "string"), ("Qwen/Qwen-VL", "string"), diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py deleted file mode 100644 index a2e6986460904..0000000000000 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ /dev/null @@ -1,1105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests: - -* E2E test of Encoder attention + Decoder self-attention + - Encoder/decoder cross-attention (collectively - "encoder/decoder attention") - -""" - -from typing import NamedTuple, Optional - -import pytest -import torch - -from tests.kernels.utils import * -from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.config import VllmConfig, set_current_vllm_config -from vllm.forward_context import set_forward_context -from vllm.platforms import current_platform - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Encoder-decoder is only supported on V0, so set - VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -# List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] -HEAD_SIZES = [64, 256] - -NUM_HEADS = [1, 16] - -BATCH_SIZES = [1, 16] -BLOCK_SIZES = [16] -CUDA_DEVICE = "cuda:0" - -MAX_DEC_SEQ_LENS = [128] -MAX_ENC_SEQ_LENS = [128] - -# Narrow test-cases for unsupported-scenario -# tests -HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]] - - -class TestPoint(NamedTuple): - """ - Encapsulates the attributes which define a single invocation - of the test_e2e_enc_dec_attn() test - - Attributes: - num_heads: The number of heads in the model. - head_size: Head dimension - backend_name: Name of the backend framework used. - batch_size: Number of samples per batch. - block_size: Size of each block of data processed. - max_dec_seq_len: Maximum sequence length for the decoder. - max_enc_seq_len: Maximum sequence length for the encoder. - num_blocks: Number of blocks in the model. - """ - - num_heads: int - head_size: int - backend_name: str - batch_size: int - block_size: int - max_dec_seq_len: int - max_enc_seq_len: int - num_blocks: int - attn_type: AttentionType - - -class TestResources(NamedTuple): - ''' - Encapsulates key components for performing an - encoder/decoder attention test - - Note that - (1) attn automatically selects an attention backend - based on platform info & a set of canned - heuristics - (2) attn_backend is thus *not the same backend - instance* used by attn, but rather it is - intended to be a - *different instance* of the *same backend class*; - it is assumed that the user of TestResources - will leverage attn_backend for the purpose of - constructing backend-compatible attention - metadata instances - - Attributes: - - * scale: 1/sqrt(d) scale factor for attn - * attn_backend: implementations of abstraction - attention interface using - a particular kernel library - i.e. XFormers - * attn: Attention layer instance - * kv_cache: shared key/value cache for all attention - ''' - - scale: float - attn: Attention - kv_cache: torch.Tensor - - -def _make_test_resources(test_pt: TestPoint, ) -> TestResources: - ''' - Build key components for performing encoder/decoder attention test. - - Note that - (1) The Attention instance constructed here, automatically selects - an attention backend class based on platform info & a set of canned - heuristics, so - (2) The attention backend instance constructed here is thus *not - the same backend instance* used by attn, but rather it is - intended to be a *different instance* of the *same backend class*; - therefore, - (3) This function requires that test_pt.backend_name matches the backend - class that Attention will automatically select when it is constructed. - - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: num_heads, head_size, num_blocks, - block_size, backend_name - - Returns: - - * TestResources data structure. 
- ''' - - scale = float(1.0 / (test_pt.head_size**0.5)) - attn = Attention( - test_pt.num_heads, - test_pt.head_size, - scale=scale, - prefix=f"{test_pt.attn_type}", - attn_type=test_pt.attn_type, - ) - if test_pt.num_blocks is None or test_pt.num_heads is None: - # Caller does not require a KV cache - return TestResources( - scale, attn, - torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) - - # Construct KV cache - if test_pt.attn_type in (AttentionType.DECODER, - AttentionType.ENCODER_DECODER): - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) - else: - kv_cache = torch.tensor([]) - - attn.kv_cache = [kv_cache] - return TestResources(scale, attn, kv_cache) - - -def _encoder_attn_setup( - test_pt: TestPoint, - test_rsrcs: TestResources, -) -> PhaseTestParameters: - ''' - Set up test vectors & data structures for encoder attention test. - - A triplet of synthetic query/key/value tensors are constructed. - Given this is an encoder attention test, the key & value - sequences will have the same length as the corresponding queries. - - The query/key/value tensors are passed to an ideal reference - self-attention implementation to generate an ideal output tensor. - - Encoder inference does not populate the KV cache, therefore - no KV cache memory mapping is constructed - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - - - Returns: - - * PhaseTestParameters data structure comprising (1) packed query/key/value - tensors, (2) the ideal output of attention computed using a naive - implementation, and (3) KVCache field set to None - ''' - - ( - num_heads, - head_size, - _, - batch_size, - _, - _, - max_q_seq_len, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - max_kv_seq_len = max_q_seq_len - - # Make test tensors - - qkv_in, _, _ = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.ENCODER, - device=CUDA_DEVICE) - - # Compute correct answer using naive non-causal attention - # implementation - - ideal_output = ref_masked_attention(qkv_in.query, - qkv_in.key, - qkv_in.value, - scale=scale, - q_seq_lens=qkv_in.q_seq_lens, - kv_seq_lens=qkv_in.kv_seq_lens) - - packed_ideal_output, _ = pack_tensor(ideal_output, - qkv_in.q_seq_lens, - device=CUDA_DEVICE) - - packed_qkv = pack_qkv(qkv_in, device=CUDA_DEVICE) - - return PhaseTestParameters( - PackedQKVO(packed_qkv, packed_ideal_output), - None # No KV cache - ) - - -def _decoder_attn_setup( - test_pt: TestPoint, - test_rsrcs: TestResources, - block_base_addr: int = 0, -) -> tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: - ''' - Set up test vectors & data structures for self-attention test. - - A triplet of synthetic query/key/value tensors are constructed ("baseline" - query/key/value). Given this is a self-attention test, the key & value - sequences will have the same length as the corresponding queries. - - "Prefill" query/key/value tensors are derived by masking out the last value - in each baseline query/key/value. These tensors are used to test prefill & - populate KV cache for a subsequent decode test. - - "Decode" query/key/value tensors are derived by extracting *only* the last - value from each baseline query/key/value (i.e. 
complement of the prefill - tensors.) These tensors are used to test decode, conditional on the kv cache - being populated during the prefill test. - - The baseline query/key/value tensors are passed to an ideal reference - self-attention implementation to generate a "Baseline" ideal output tensor. - This tensor is split into the "Prefill" ideal output tensor (all but the - last element of each output sequence) and the "Decode" ideal output tensor - (*only* the last element of each output sequence); the "Prefill" and - "Decode" ideal output tensors can be used to validate the prefill and decode - test results, respectively. - - This function also constructs the self-attention KV cache memory mapping - (slot mapping and block table), ensuring that the block table starts at - block_base_addr - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - * block_base_addr: decoder self-attention block-table base address - - Returns: - * qkv: Unpacked (batch_size x padded_seq_len x num_heads x - head_size) query/key/value tensors - * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data - structures appropriate for prefill phase. - * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data - structures appropriate for decode phase. 
- * max_block_idx: max physical address in decoder self-attention block-table - (intended to be used as the base address for the encoder/ - decoder cross-attention block-table, which is not - constructed in this function) - ''' - - ( - num_heads, - head_size, - _, - batch_size, - block_size, - max_q_seq_len, - _, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - max_kv_seq_len = max_q_seq_len - - # Build test tensors - - ( - qkv, - prefill_qkv, - decode_qkv, - ) = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.DECODER, - device=CUDA_DEVICE) - - # Compute correct answer using naive attention implementation - # with causal attention mask - - causal_mask = make_causal_mask(max_q_seq_len, - max_kv_seq_len).to(CUDA_DEVICE) - - ideal_output = ref_masked_attention(qkv.query, - qkv.key, - qkv.value, - scale=scale, - custom_mask=causal_mask, - q_seq_lens=qkv.q_seq_lens, - kv_seq_lens=qkv.kv_seq_lens) - - # Split out the prefill- & decode-phase ideal answers & pack them - - prefill_ideal_output = torch.zeros_like(ideal_output) - decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) - for bdx, prefill_q_seq_len in enumerate(prefill_qkv.q_seq_lens): - prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, - prefill_qkv.q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) - - # Build prefill- & decode-phase data structures - # for decoder self-attention. Block tables and - # slot mapping must be in a format compatible - # with KV caching & attention kernels - # - # Prefill-phase: - # - # * Empty block-tables tensor - # * Slot-mapping with entries for prompt tokens - # - # Decode-phase: - # * Block-tables tensor with minimum number of blocks - # required by total num. 
tokens in the entirety of all sequences - # (including both prefill & decode) - # * Slot-mapping with entries for tokens that will be decoded in the - # current decode iteration - # - # Note: the format described above is simply mirroring what ModelRunner - # produces - - prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE) - - ( - decode_block_tables, - slot_mapping_list, - max_block_idx, - ) = make_block_tables_slot_mapping(block_size, - qkv.q_seq_lens, - device=CUDA_DEVICE, - block_base_addr=block_base_addr) - - ( - prefill_slot_mapping, - decode_slot_mapping, - ) = split_slot_mapping(slot_mapping_list, - qkv.q_seq_lens, - device=CUDA_DEVICE) - - prefill_pckd_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE) - - decode_pckd_qkv = pack_qkv(decode_qkv, device=CUDA_DEVICE) - - return ( - qkv, - PhaseTestParameters( # Prefill test params - PackedQKVO(prefill_pckd_qkv, prefill_packed_ideal_output), - KVMemoryMap(prefill_block_tables, prefill_slot_mapping)), - PhaseTestParameters( # Decode test params - PackedQKVO(decode_pckd_qkv, decode_packed_ideal_output), - KVMemoryMap(decode_block_tables, decode_slot_mapping)), - max_block_idx) - - -def _enc_dec_cross_attn_setup_reuses_query( - decoder_qkv: QKVInputs, - encoder_test_params: PhaseTestParameters, - prefill_decoder_phase_test_params: PhaseTestParameters, - test_pt: TestPoint, - test_rsrcs: TestResources, - block_base_addr: int = 0, -) -> tuple[PhaseTestParameters, PhaseTestParameters]: - ''' - Set up test vectors & data structures for cross-attention test. - - A triplet of synthetic cross-attention key/value tensors are constructed - ("baseline" key/value). Given this is a cross-attention test, we assume - query tensors were already synthesized for a prior self-attention test and - will be reused for cross-attention. The key & value sequences generated here - may have a different length than the corresponding queries (as is often - the case for cross-attention between decoder and encoder sequences.) - - Cross attention key & value tensors do not grow during autoregressive - inference; thus this function obtains a single key/value pair suitable for - both prefill and decode. - - The "baseline" query tensor is received as an argument. The "baseline" - query/key/value tensors are passed to an ideal reference cross-attention - implementation to generate a "baseline" ideal output tensor. This tensor is - split into the "Prefill" ideal output tensor (all but the last element of - each output sequence) and the "Decode" ideal output tensor (*only* the last - element of each output sequence); the "Prefill" and "Decode" ideal output - tensors can be used to validate the prefill and decode test results, - respectively. - - This function also constructs the cross-attention KV cache memory mapping - (slot mapping and block table), ensuring that the block table starts at - block_base_addr. 
- - Arguments: - - * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; - this function relies on the query and q_seq_lens - fields - * encoder_test_params: PhaseTestParameters data structure which was - used for encoder inference; KV cache field - is not used by this function - * prefill_decoder_phase_test_params: PhaseTestParameters data structure - used for prefill-phase decoder - self-attention; all fields - including KV cache required - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - * block_base_addr: decoder self-attention block-table base address - - Returns: - - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed - (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a - naive implementation, and (3) memory-mapping data structures appropriate - for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed - (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a - naive implementation, and (3) memory-mapping data structures appropriate - for decode phase. - ''' - - assert encoder_test_params.packed_qkvo.packed_qkv is not None - assert prefill_decoder_phase_test_params.packed_qkvo.packed_qkv is not None - - ( - num_heads, - head_size, - _, - batch_size, - block_size, - max_decoder_seq_len, - max_encoder_seq_len, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - decoder_query = decoder_qkv.query - decoder_seq_lens = decoder_qkv.q_seq_lens - encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens - prefill_q_seq_lens = ( - prefill_decoder_phase_test_params.packed_qkvo.packed_qkv.q_seq_lens) - - assert prefill_q_seq_lens is not None - - ( - cross_kv, - _, - _, - ) = make_qkv(batch_size, - max_decoder_seq_len, - max_encoder_seq_len, - num_heads, - head_size, - force_kv_seq_lens=encoder_seq_lens, - attn_type=AttentionType.ENCODER_DECODER, - device=CUDA_DEVICE) - - ideal_output = ref_masked_attention(decoder_query, - cross_kv.key, - cross_kv.value, - scale=scale, - q_seq_lens=decoder_seq_lens, - kv_seq_lens=cross_kv.kv_seq_lens) - - prefill_ideal_output = torch.zeros_like(ideal_output) - decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) - for bdx, prefill_q_seq_len in enumerate(prefill_q_seq_lens): - prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, - prefill_q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) - - # Build prefill- & decode-phase data structures - # for encoder/decoder cross-attention. 
Block tables and
-    # slot mapping must be in a format compatible
-    # with KV caching & attention kernels
-    #
-    # Whereas decoder self-attention extracts relationships between
-    # equal-length Q/K/V sequences, which mutually grow in length
-    # with each decoded token, cross-attention relates the Q sequence
-    # - which grows with each new decoded token - to fixed-length
-    # K and V sequences derived from the encoder hidden states.
-    #
-    # Prefill-phase:
-    #
-    # * Empty block-tables tensor
-    # * Slot-mapping with as many entries as there are tokens in the encoder
-    #   prompt.
-    #
-    # Decode-phase:
-    # * Block-tables tensor with minimum number of blocks to
-    #   accommodate K & V tensors which are equal in length
-    #   to the encoder prompt length
-    # * Empty slot-mapping tensor (since K & V are fixed in size,
-    #   new decoded tokens are not KV-cached and require no slot-
-    #   mapping)
-    #
-    # Note: the format above is simply an extension of what ModelRunner
-    # produces for decoder-only models
-
-    prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)
-    decode_slot_mapping = make_empty_slot_mapping_tensor(device=CUDA_DEVICE)
-
-    (
-        decode_block_tables,
-        prefill_slot_mapping_list,
-        _,
-    ) = make_block_tables_slot_mapping(block_size,
-                                       cross_kv.kv_seq_lens,
-                                       block_base_addr=block_base_addr,
-                                       device=CUDA_DEVICE)
-
-    prefill_slot_mapping = maybe_make_long_tensor(prefill_slot_mapping_list,
-                                                  device=CUDA_DEVICE)
-
-    # Packed key/value (query is already provided)
-    packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE)
-
-    return (
-        PhaseTestParameters(  # Prefill-phase test params
-            PackedQKVO(packed_cross_kv, prefill_packed_ideal_output),
-            KVMemoryMap(prefill_block_tables, prefill_slot_mapping)),
-        PhaseTestParameters(  # Decode-phase test params
-            PackedQKVO(None, decode_packed_ideal_output),
-            KVMemoryMap(decode_block_tables, decode_slot_mapping)))
-
-
-def _run_encoder_attention_test(
-    attn: Attention,
-    encoder_test_params: PhaseTestParameters,
-    attn_metadata: AttentionMetadata,
-    test_pt: TestPoint,
-    vllm_config: VllmConfig,
-) -> torch.Tensor:
-    '''
-    Run encoder attention.
-
-    attn.forward() is passed attn_type=AttentionType.ENCODER in order
-    to configure the kernel invocation for encoder attention
-
-    Requires attn_metadata.num_decode_tokens == 0
-    (There is no encoder execution in the decode-phase)
-
-    Arguments:
-
-    * attn: Attention wrapper instance
-    * encoder_test_params: encoder PhaseTestParameters data structure;
-                           this function relies on the packed
-                           (number_of_tokens x num_heads x head_size)
-                           query/key/value fields
-    * attn_metadata: attention metadata for encoder/decoder-self attention
-    * test_pt: The TestPoint object containing test details like number of
-               model heads, head size, name of the backend being used etc.
-
-    Returns:
-    * Attention.forward() applied to packed {query,key,value}
-      & attn_metadata
-    '''
-    assert attn_metadata.num_decode_tokens == 0
-    packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
-    assert packed_qkv is not None
-    with set_forward_context(attn_metadata, vllm_config):
-        # In the test setup the shape of the query is
-        # [batch_size, seq_len, num_heads, head_size]. However
-        # the attention backend expects the shape to be
-        # [num_tokens, hidden_size]. Hence reshape the query before
-        # invoking the forward method.
-        # TODO - Update the way we construct the query so that it
-        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
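-        # As a concrete illustration (hypothetical sizes): with num_heads=4
-        # and head_size=8, the view below collapses the trailing dimensions
-        # into hidden_size = 4 * 8 = 32, yielding a [num_tokens, 32] query.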
-        reshaped_query = packed_qkv.query.view(
-            -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value)
-
-
-def _run_decoder_self_attention_test(
-    test_rsrcs: TestResources,
-    decoder_test_params: PhaseTestParameters,
-    attn_metadata: AttentionMetadata,
-    test_pt: TestPoint,
-    vllm_config: VllmConfig,
-) -> torch.Tensor:
-    '''
-    Run decoder self-attention test.
-
-    attn.forward() is passed attn_type=AttentionType.DECODER
-    in order to configure the kernel invocation for decoder self-attention.
-
-    Arguments:
-
-    * test_rsrcs: TestResources instance; this function relies on the kv_cache
-                  and attn (Attention wrapper instance) fields
-    * decoder_test_params: decoder PhaseTestParameters data structure;
-                           this function relies on the packed
-                           (number_of_tokens x num_heads x head_size)
-                           query/key/value fields
-    * attn_metadata: attention metadata for decoder-self attention
-                     (contains KV cache memory-mapping)
-    * test_pt: The TestPoint object containing test details like number of
-               model heads, head size, name of the backend being used etc.
-
-    Returns:
-    * Attention.forward() applied to packed_{query,key,value}, kv_cache
-      & attn_metadata
-    '''
-    attn = test_rsrcs.attn
-    packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
-    assert packed_qkv is not None
-    with set_forward_context(attn_metadata, vllm_config):
-        # In the test setup the shape of the query is
-        # [batch_size, seq_len, num_heads, head_size]. However
-        # the attention backend expects the shape to be
-        # [num_tokens, hidden_size]. Hence reshape the query before
-        # invoking the forward method.
-        # TODO - Update the way we construct the query so that it
-        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
-        reshaped_query = packed_qkv.query.view(
-            -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value)
-
-
-def _run_encoder_decoder_cross_attention_test(
-    test_rsrcs: TestResources,
-    decoder_test_params: PhaseTestParameters,
-    cross_test_params: Optional[PhaseTestParameters],
-    attn_metadata: AttentionMetadata,
-    test_pt: TestPoint,
-    vllm_config: VllmConfig,
-) -> torch.Tensor:
-    '''
-    Run encoder/decoder cross-attention test.
-
-    Via PhaseTestParameters data structures, consumes the same query utilized
-    for decoder self-attention, plus a key/value specific to cross-attention.
-
-    If cross_test_params is None or cross_test_params.packed_qkvo.packed_qkv
-    is None, this reflects that in decode-phase cross attention there
-    is no growth in the key and value tensors.
-
-    attn.forward() is passed attn_type=AttentionType.ENCODER_DECODER
-    in order to configure the kernel invocation for encoder/decoder cross-
-    attention.
-
-    Arguments:
-
-    * test_rsrcs: TestResources instance; this function relies on the kv_cache
-                  and attn (Attention wrapper instance) fields
-    * decoder_test_params: decoder PhaseTestParameters data structure;
-                           this function relies on the packed
-                           (number_of_tokens x num_heads x head_size)
-                           query field
-    * cross_test_params: encoder/decoder PhaseTestParameters data structure;
-                         this function relies on the packed
-                         (number_of_tokens x num_heads x head_size)
-                         key/value fields
-    * attn_metadata: attention metadata for encoder/decoder-self attention
-    * test_pt: The TestPoint object containing test details like number of
-               model heads, head size, name of the backend being used etc.
-
-    Returns:
-    * Attention.forward() applied to packed_{query,key,value}, kv_cache
-      & attn_metadata
-    '''
-    assert decoder_test_params.packed_qkvo.packed_qkv is not None
-
-    attn = test_rsrcs.attn
-    if cross_test_params is None:
-        key = None
-        value = None
-    else:
-        cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv
-        key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key)
-        value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value)
-    with set_forward_context(attn_metadata, vllm_config):
-        # In the test setup the shape of the query is
-        # [batch_size, seq_len, num_heads, head_size]. However
-        # the attention backend expects the shape to be
-        # [num_tokens, hidden_size]. Hence reshape the query before
-        # invoking the forward method.
-        # TODO - Update the way we construct the query so that it
-        # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
-        reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
-            -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query, key, value)
-
-
-@pytest.fixture(autouse=True)
-def set_reset_environment(attn_backend):
-    # Set the default torch datatype to bfloat16 to enable
-    # testing of the Flash Attention backend. Also clear the
-    # cached value of the backend.
-    default_dtype = torch.get_default_dtype()
-    if attn_backend.name == 'FLASH_ATTN':
-        torch.set_default_dtype(torch.bfloat16)
-    _cached_get_attn_backend.cache_clear()
-    yield
-    # Reset the torch datatype to what it was before the test
-    # so as not to impact the remaining tests.
-    torch.set_default_dtype(default_dtype)
-
-
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
-@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
-def test_encoder_only(
-    num_heads: int,
-    head_size: int,
-    attn_backend: _Backend,
-    batch_size: int,
-    block_size: int,
-    max_dec_seq_len: int,
-    max_enc_seq_len: int,
-):
-    '''
-    End-to-end encoder-only attention test:
-
-    * Construct fake test vectors for (1) encoder attention
-    * Construct (1) attention metadata structure with prefill-phase
-      encoder attention, and (2) an analogous attention metadata
-      structure but for decode-phase
-    * Test & validate encoder attention against ideal output
-
-    No KV cache is required for encoder-only attention.
-
-    Note on ROCm/HIP: currently encoder/decoder models are not supported on
-    AMD GPUs, therefore this test is simply skipped if
-    current_platform.is_rocm().
-
-    This test globally forces an override of the usual backend
-    auto-selection process, forcing the specific backend-under-test
-    to be utilized.
-
-    Arguments:
-
-    * num_heads
-    * head_size
-    * attn_backend: The attention backend to employ for testing
-    * batch_size
-    * block_size: KV cache block size
-    * max_dec_seq_len: max length of decoder input sequences
-    * max_enc_seq_len: max length of encoder input sequences
-    '''
-    # Force Attention wrapper backend
-    with global_force_attn_backend_context_manager(attn_backend):
-        # Note: KV cache size of 4096 is arbitrary & chosen intentionally
-        # to be more than necessary, since exceeding the kv cache size
-        # is not part of this test
-        test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                            batch_size, block_size, max_dec_seq_len,
-                            max_enc_seq_len, 4096, AttentionType.ENCODER)
-
-        # Attention scale factor, attention backend instance, attention wrapper
-        # instance, KV cache init
-        vllm_config = VllmConfig()
-        with set_current_vllm_config(vllm_config):
-            test_rsrcs = _make_test_resources(test_pt)
-
-        # Construct encoder attention test params (only used
-        # during prefill)
-
-        enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs)
-
-        # Shared prefill metadata structure
-
-        prephase_attn_metadata: AttentionMetadata = make_test_metadata(
-            attn_backend,
-            True,
-            None,
-            decoder_test_params=None,
-            encoder_test_params=enc_test_params,
-            cross_test_params=None,
-            device=CUDA_DEVICE)
-
-        # PREFILL: encoder attention
-
-        enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test(
-            test_rsrcs.attn,
-            enc_test_params,
-            prephase_attn_metadata,
-            test_pt=test_pt,
-            vllm_config=vllm_config))
-
-        # - Is encoder attention result correct?
-        assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out,
-                                    attn_backend.name)
-
-
-@pytest.mark.skipif(current_platform.is_rocm(),
-                    reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
-@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
-def test_e2e_enc_dec_attn(
-    num_heads: int,
-    head_size: int,
-    attn_backend: _Backend,
-    batch_size: int,
-    block_size: int,
-    max_dec_seq_len: int,
-    max_enc_seq_len: int,
-) -> None:
-    '''
-    End-to-end encoder/decoder test:
-
-    * Construct fake test vectors for (1) encoder attention,
-      (2) decoder self-attention, and (3) encoder/decoder cross-attention
-    * Construct (1) attention metadata structure with self- and cross-attention
-      attributes for prefill-phase, and (2) an analogous attention metadata
-      structure but for decode-phase
-    * Test attention steps in the following order
-
-        * Encoder attention
-        * Prefill self-attention
-        * Prefill cross-attention
-        * Decode self-attention
-        * Decode cross-attention
-    * Besides being reflective of realistic use-cases, this order would
-      exacerbate any accidental overlap in the self-/cross-attention
-      block tables, which one hopes to avoid
-
-    * Validate output correctness against ideal reference attention
-      implementation
-
-    Block tables are constructed such that cross-attention KV cache is in a
-    higher, non-intersecting address-space than self-attention KV cache.
-
-    Self- and cross-attention share the same query tensor but not the K/V
-    tensors. Self-attention K/Vs must have the same seq len as Q while
-    cross-attention K/Vs are allowed to differ in seq len, as is often the case
-    for cross-attention.
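-
-    For instance (hypothetical lengths): a decoder query sequence of length 7
-    must pair with self-attention K/V sequences of length 7, but may attend to
-    encoder-derived cross-attention K/V sequences of length 12.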
-
-    This test globally forces an override of the usual backend
-    auto-selection process, forcing the specific backend-under-test
-    to be utilized.
-
-    Note on ROCm/HIP: currently encoder/decoder models are not supported on
-    AMD GPUs, therefore this test is simply skipped if
-    current_platform.is_rocm().
-
-    Note on metadata: there is a single attention metadata structure shared by
-    all prefill-phase attention operations (encoder, decoder, enc/dec cross),
-    and a single one shared by all decode-phase attention operations
-    (decoder & enc/dec cross). This is intended to reflect the behavior
-    of EncoderDecoderModelRunner, which constructs a single attention metadata
-    structure for each prefill or decode run. A realistic scenario would rely
-    on the attention backend to utilize the appropriate attention metadata
-    fields according to the value of attn_metadata.attention_type. Thus,
-    this test is organized so as to confirm that the backend-under-test can
-    handle a shared prefill attention metadata structure & a shared decode
-    attention metadata structure.
-
-    Arguments:
-
-    * num_heads
-    * head_size
-    * attn_backend: The attention backend to employ for testing
-    * batch_size
-    * block_size: KV cache block size
-    * max_dec_seq_len: max length of decoder input sequences
-    * max_enc_seq_len: max length of encoder input sequences
-    '''
-    # Force Attention wrapper backend
-    with global_force_attn_backend_context_manager(attn_backend):
-        # Note: KV cache size of 4096 is arbitrary & chosen intentionally
-        # to be more than necessary, since exceeding the kv cache size
-        # is not part of this test
-        enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                                batch_size, block_size, max_dec_seq_len,
-                                max_enc_seq_len, 4096, AttentionType.ENCODER)
-        enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                                    batch_size, block_size, max_dec_seq_len,
-                                    max_enc_seq_len, 4096,
-                                    AttentionType.ENCODER_DECODER)
-        dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                                batch_size, block_size, max_dec_seq_len,
-                                max_enc_seq_len, 4096, AttentionType.DECODER)
-
-        # Attention scale factor, attention backend instance, attention wrapper
-        # instance, KV cache init
-        vllm_config = VllmConfig()
-        with set_current_vllm_config(vllm_config):
-            enc_test_rsrcs = _make_test_resources(enc_test_pt)
-            enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt)
-            dec_test_rsrcs = _make_test_resources(dec_test_pt)
-
-        # Construct encoder attention test params (only used
-        # during prefill)
-
-        enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs)
-
-        # Construct decoder self-attention prefill-phase & decode-phase
-        # test params, including query/key/value tensors, decoder self-attention
-        # memory-mapping. cross_block_base_addr is the uppermost address in the
-        # decoder self-attention block-table, i.e. a base address which the
-        # encoder/decoder cross-attention block-table may build downward toward.
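-        # As an illustration (hypothetical numbers): if decoder self-attention
-        # occupies block ids 0..9, cross_block_base_addr would be 10 and the
-        # cross-attention block table is confined to block ids >= 10, keeping
-        # the two address spaces disjoint.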
- - ( - dec_qkv, - prephase_dec_test_params, - decphase_dec_test_params, - cross_block_base_addr, - ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) - - # Construct encoder/decoder cross-attention prefill-phase - # & decode-phase test params, including key/value tensors, - # cross-attention memory-mapping - - ( - prephase_cross_test_params, - decphase_cross_test_params, - ) = _enc_dec_cross_attn_setup_reuses_query( - dec_qkv, - enc_test_params, - prephase_dec_test_params, - enc_dec_test_pt, - enc_dec_test_rsrcs, - block_base_addr=cross_block_base_addr) - - # Shared prefill metadata structure - assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None - prephase_attn_metadata: AttentionMetadata = make_test_metadata( - attn_backend, - True, - prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, - decoder_test_params=prephase_dec_test_params, - encoder_test_params=enc_test_params, - cross_test_params=prephase_cross_test_params, - device=CUDA_DEVICE) - - # PREFILL: encoder attention - - enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, - enc_test_params, - prephase_attn_metadata, - test_pt=enc_test_pt, - vllm_config=vllm_config) - - # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, - attn_backend.name) - - # PREFILL: decoder self-attention test - - prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - dec_test_rsrcs, - prephase_dec_test_params, - prephase_attn_metadata, - test_pt=dec_test_pt, - vllm_config=vllm_config) - - # - Is prefill decoder self-attention correct? - assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out, - attn_backend.name) - - # PREFILL: encoder/decoder cross-attention test - - prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - enc_dec_test_rsrcs, - prephase_dec_test_params, - prephase_cross_test_params, - prephase_attn_metadata, - test_pt=enc_dec_test_pt, - vllm_config=vllm_config) - - # - Is prefill encoder/decoder cross-attention correct? - assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out, - attn_backend.name) - - # DECODE: build decode-phase attention metadata - - decphase_attn_metadata: AttentionMetadata = make_test_metadata( - attn_backend, - False, - dec_qkv.q_seq_lens, - decoder_test_params=decphase_dec_test_params, - encoder_test_params=enc_test_params, - cross_test_params=decphase_cross_test_params, - device=CUDA_DEVICE) - - # DECODE: decoder self-attention test - - decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - dec_test_rsrcs, - decphase_dec_test_params, - decphase_attn_metadata, - test_pt=dec_test_pt, - vllm_config=vllm_config) - - # - Is decode-phase decoder self-attention correct? - assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out, - attn_backend.name) - - # DECODE: encoder/decoder cross-attention test - - decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - enc_dec_test_rsrcs, - decphase_dec_test_params, - None, - decphase_attn_metadata, - test_pt=enc_dec_test_pt, - vllm_config=vllm_config) - - # - Is decode-phase encoder/decoder cross-attention correct? 
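-        #   (cross_test_params is None here because decode-phase cross-
-        #   attention reuses the K/V written to the cache during prefill;
-        #   only the ideal output from decphase_cross_test_params is needed
-        #   for validation below.)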
- assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out, - attn_backend.name) diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py deleted file mode 100644 index 22ceb27869ac4..0000000000000 --- a/tests/models/language/generation/test_bart.py +++ /dev/null @@ -1,222 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.sequence import SampleLogprobs - -from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, - HfRunner, VllmRunner) -from ....utils import multi_gpu_test -from ...utils import check_logprobs_close - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "</s>" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "<s>" + hf_output_str - - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - ''' - Test the vLLM BART model for a variety of encoder/decoder input prompts, - by validating it against HuggingFace (HF) BART. - - Arguments: - - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - * model: the HF ID of the specific BART variant under test - * dtype: the tensor datatype to employ - * max_tokens - * num_logprobs - * decoder_prompt_type: key into the example_encoder_decoder_prompts - dictionary; selects specific encoder/decoder - prompt scenarios to test - - A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - - The HF GenerationMixin's default behavior is to force the first - decoded token to be <BOS> if the prompt does not already contain - <BOS> (this is accomplished using a logit - processor setting.) - - So when we use HF BART as our baseline for comparison, note that - when the user provides a request with a None decoder prompt - (i.e. a singleton encoder prompt, or else an explicit encoder/ - decoder prompt with the decoder sub-prompt set to None), HF and - vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, - (2) append <decoder-start-token> to the beginning, yielding - [<decoder-start-token>], (3) pass this token list to the model, and - then (4) after computing logits during prefill, override the model - logits & force <BOS> to be the first generated token. - - * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder- - start-token to the beginning, yielding [<decoder-start-token><BOS>], - (3) pass these tokens to the model & proceed with generation. 
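-
-    For example (illustrative ids, assuming decoder-start-token id 2 and
-    <BOS> id 0): HF feeds [2] to the model and then forces 0 as the first
-    generated token, whereas vLLM feeds [2, 0] and generates freely from
-    there.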
- - The net effect is that compared to vLLM, the list of HF *decoded* tokens - will contain one more initial <BOS> than the vLLM generated tokens, - because vLLM's <BOS> token is injected into the prompt rather than into - the generated output. This is in spite of the fact that overall, the - complete sequences (prompt + decoded tokens) produced by vLLM will match - HF. - - So when we use HF decoded token output to validate vLLM's decoded token - output, the testing process must account for the difference in decoded - token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. - - One option is to disable the logit processor feature that forces the - <BOS> token to be decoded (forced_bos_token_id = None), eliminating - the problem entirely. However this is not "normal" BART usage. - - The other option is - only in the decoder-prompt-is-None case - to - discard the first decoded token from the HF output before comparing it - to vLLM. - - To that end, when testing the scenario where the decoder prompt is None - (and only in that one scenario), this test skips the first HF decoded - token during the process of validating the vLLM decoded output. - ''' - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default). - - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-existing - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. 
- with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs) - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - -@pytest.mark.parametrize( - "model", - [ - pytest.param("facebook/bart-base", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("facebook/bart-large-cnn"), - ], -) -@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -@pytest.mark.skip(reason="bart not supported in V1") -def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, - dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) -@pytest.mark.skip(reason="bart not supported in V1") -def test_models_distributed(hf_runner, vllm_runner, - example_encoder_decoder_prompts, - distributed_executor_backend, model, dtype, - max_tokens, num_logprobs, - decoder_prompt_type) -> None: - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py deleted file mode 100644 index 854a72713943b..0000000000000 --- a/tests/models/language/generation/test_mbart.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.sequence import SampleLogprobs - -from ....conftest import DecoderPromptType, HfRunner, VllmRunner -from ...utils import check_logprobs_close - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, 
Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - hf_output_str = output_str + "</s>" - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - prompts: list[dict[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - ''' - Test the vLLM mBART model by validating it against HuggingFace (HF). - (Docstring content is omitted for brevity) - ''' - - vllm_prompts = prompts - if decoder_prompt_type == DecoderPromptType.NONE: - vllm_prompts = [{ - "encoder_prompt": p['encoder_prompt'], - "decoder_prompt": "" - } for p in prompts] - - vllm_kwargs = { - "hf_overrides": { - "architectures": ["MBartForConditionalGeneration"] - } - } - - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vllm_kwargs) as vllm_model: # type: ignore - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - vllm_prompts, max_tokens, num_logprobs) - - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_kwargs["decoder_start_token_id"] = ( - hf_model.tokenizer.lang_code_to_id["ro_RO"]) - - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, # HF runner still uses the original prompts - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = 0 - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - -@pytest.mark.parametrize( - "model", - [pytest.param("facebook/mbart-large-en-ro")], -) -@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, - dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py deleted file mode 100644 index a622957f96f69..0000000000000 --- a/tests/models/multimodal/generation/test_florence2.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest -from PIL import Image - -from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt -from vllm.multimodal.image import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, 
ImageTestAssets, VllmRunner -from ...utils import check_logprobs_close - -MODELS = ["microsoft/Florence-2-base"] -# Florence-2 model repo's tokenizer config is missing some special tokens. -# Therefore, we use a converted tokenizer from a forked repo -TOKENIZER = "Isotr0py/Florence-2-tokenizer" -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<OD>", # special task token which will output special tokens - "cherry_blossom": - "Describe in detail what is shown in the image.", -}) - - -def get_hf_images_prompts( - prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]], -) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]: - prompts, images = [], [] - for prompt in prompts_: - encoder_prompt = prompt["encoder_prompt"] - prompts.append( - ExplicitEncoderDecoderPrompt( - encoder_prompt=encoder_prompt["prompt"], - decoder_prompt=None, - )) - images.append(encoder_prompt["multi_modal_data"]["image"]) - return prompts, images - - -def hf_to_vllm_output(hf_output: tuple[list[int], str, - Optional[SampleLogprobs]]): - """Sanitize hf output to be comparable with vllm output.""" - output_ids, output_str, out_logprobs = hf_output - - output_str = output_str.replace("</s>", "").replace("<s>", "") - - return output_ids, output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - inputs: list[list[ExplicitEncoderDecoderPrompt]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - with vllm_runner(model, - max_num_seqs=8, - tokenizer_name=TOKENIZER, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_case = [ - vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - skip_special_tokens=False, - ) for prompts in inputs - ] - - hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs] - - with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model: - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.lm_head - hf_outputs_per_case = [ - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in hf_inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, - vllm_outputs_per_case): - check_logprobs_close( - outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs], - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=1, - ) - - -# FIXME: https://github.com/huggingface/transformers/issues/38358 -@pytest.mark.skip("Model initialization fails") -@pytest.mark.core_model -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, model: str, - size_factors: list[int], dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [[ - ExplicitEncoderDecoderPrompt( 
- encoder_prompt=TextPrompt( - prompt=prompt, - multi_modal_data={"image": rescale_image_size(image, factor)}), - decoder_prompt=None, - ) for factor in size_factors - ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py deleted file mode 100644 index 1c32cc6d71c04..0000000000000 --- a/tests/models/multimodal/generation/test_mllama.py +++ /dev/null @@ -1,768 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional, overload - -import pytest -import torch -from packaging.version import Version -from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer -from transformers import __version__ as TRANSFORMERS_VERSION - -from vllm import LLM, SamplingParams -from vllm.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.model_executor.models.mllama import MllamaForConditionalGeneration -from vllm.multimodal.image import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, - PromptImageInput, VllmRunner) -from ....quantization.utils import is_quant_method_supported -from ....utils import (create_new_process_for_each_test, large_gpu_test, - multi_gpu_test) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 3 -MLLAMA_IMAGE_TOKEN_ID = 128256 - -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|image|><|begin_of_text|>The meaning of the image is", - "cherry_blossom": - "<|image|><|begin_of_text|>The city is", -}) - -text_only_prompts = [ - "The color of the sky is blue but sometimes it can also be", -] - -models = [ - "meta-llama/Llama-3.2-11B-Vision-Instruct", -] - -# Indices for inputs -TEXT_ONLY = '0' -IMAGE_AT_BEG = '1' -IMAGE_AT_MIDDLE = '2' -TWO_IMAGES = '3' - -# Input tokenized -prompt_data = { - # Tell me a story - TEXT_ONLY: [41551, 757, 264, 3446], - # <|image|> What's the content of this image - IMAGE_AT_BEG: - [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], - # Hello <|image|>What' the content of this image - IMAGE_AT_MIDDLE: - [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], - #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? 
# noqa: E501 - TWO_IMAGES: [ - MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, - MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 - ] -} - - -def vllm_to_hf_output(vllm_output: tuple[list[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - hf_output_str = output_str - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def _get_inputs( - image_assets: ImageTestAssets, - *, - size_factors: Optional[list[float]] = None, - sizes: Optional[list[tuple[int, int]]] = None, -) -> list[tuple[list[str], PromptImageInput]]: - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [ - prompt if size is not None else text_only_prompts[0] - for size in sizes - ], - [ - image.resize(size) if size is not None else None - for size in sizes - ], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - if len(sizes) == 0: - inputs_per_image.append( - (text_only_prompts, [None] * len(text_only_prompts))) - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - return inputs_per_image - - -@overload -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - size_factors: list[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - sizes: list[tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - size_factors: Optional[list[float]] = None, - sizes: Optional[list[tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - _run_test( - hf_runner, - vllm_runner, - _get_inputs(image_assets, size_factors=size_factors, sizes=sizes), - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) - - -def _run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - inputs: list[tuple[list[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. 
- - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner( - model, - dtype=dtype, - max_model_len=19212, # 3 max size images - max_num_seqs=3, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, - dtype=dtype, - model_kwargs={"device_map": "auto"}, - auto_cls=AutoModelForImageTextToText) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Fixture to clear backend cache before each test.""" - _cached_get_attn_backend.cache_clear() # Clear the cache - yield # This allows the test to run - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "sizes", - [ - # Text only - [], - # Single-size - [(512, 512)], - # Single-size, batched - [(512, 512), (512, 512), (512, 512)], - # Multi-size, batched - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028)], - # Multi-size, batched, including text only - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028), None], - # mllama has 8 possible aspect ratios, carefully set the sizes - # to cover all of them - ]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, - model, sizes, dtype, max_tokens, - num_logprobs, - attn_backend: _Backend) -> None: - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - 
-@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501 - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes. - [ - stop_sign.resize((512, 512)), - stop_sign, - ], - [ - stop_sign, - stop_sign.resize((512, 1536)), - cherry_blossom.resize((512, 1024)), - ], - ])] - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 - "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501 - "which is a stop sign and which is a cherry blossom?", # noqa: E501 - ], - [ - [stop_sign], - [stop_sign, cherry_blossom], - ])] - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@create_new_process_for_each_test() -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def 
test_models_distributed( - hf_runner, - vllm_runner, - image_assets, - distributed_executor_backend, - model, - dtype, - max_tokens, - num_logprobs, -) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model=model, - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), - reason='bitsandbytes is not supported on this GPU type.') -def test_bnb_regression( - image_assets: ImageTestAssets, - model: str, - dtype: str, - max_tokens: int, -): - stop_sign = image_assets[0].pil_image - prompts = [ - { - "prompt": "<|begin_of_text|>The content of the image <|image|> is", - "multi_modal_data": { - "image": stop_sign - }, - }, - { - "prompt": - "The color of the sky is blue but sometimes it can also be", - }, - ] - # Test regression about QKVCrossParallelLinear - llm = LLM( - model=model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=2, - quantization="bitsandbytes", - ) - sampling_params = SamplingParams( - temperature=0, - max_tokens=max_tokens, - ) - outputs = llm.generate(prompts, sampling_params) - assert outputs - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_explicit_implicit_prompt( - image_assets: ImageTestAssets, - model: str, - dtype: str, - max_tokens: int, -): - stop_sign = image_assets[0].pil_image - # yapf: disable - prompts = [ - # explicit prompt - { - "encoder_prompt": { - "prompt": "<|image|>", - "multi_modal_data": {"image": stop_sign}, - }, - "decoder_prompt": { - "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374], # noqa: E501 - } - }, - { - "encoder_prompt": "Not <|image|>", - "decoder_prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 - }, - # implicit prompt - { - "prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 - "multi_modal_data": {"image": stop_sign}, - }, - { - "prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 - }, - ] - # yapf: enable - llm = LLM( - model=model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=2, - tensor_parallel_size=1, - ) - sampling_params = SamplingParams( - temperature=0, - max_tokens=max_tokens, - ) - outputs = llm.generate(prompts, sampling_params) - n_prompts = len(prompts) - explicit_outputs = outputs[:n_prompts // 2] - implicit_outputs = outputs[n_prompts // 2:] - for exp_output, imp_output in zip(explicit_outputs, implicit_outputs): - assert exp_output.outputs[0].text == imp_output.outputs[0].text - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, - num_logprobs, attn_backend: _Backend) -> None: - - stop_sign = image_assets[0].pil_image - - with global_force_attn_backend_context_manager(attn_backend), vllm_runner( - 
model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=1, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: - - # Regression tests for https://github.com/vllm-project/vllm/issues/10648 - - # Number of groups of image tokens is greater than the number of images - # provided (the whitespace between the tags is necessary) - prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501 - image = stop_sign - with pytest.raises(ValueError): - vllm_model.generate_greedy_logprobs([prompt], - max_tokens, - num_logprobs, - images=[image]) - - # Batch of a text-only and image request that requires cross-attention - prompts = [ - "What is the capital of spain?", - "Text before the image...<|image|>What is in the image?", # noqa: E501 - ] - images = [ - None, - [stop_sign], - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - # Test the reverse order too for good measure - prompts = [ - "<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501 - "<|begin_of_text|>Hello!", - ] - images = [ - [stop_sign], - None, - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - # Mixed batch with text and images with different numbers of tiles - prompts = [ - "<|begin_of_text|>Hello!", - "<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501 - "<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501 - ] - images = [ - None, - [stop_sign], - # smaller image must be 2nd for the repro - [stop_sign.resize((448, 448))], - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - -class DummyModel: - image_token_id = MLLAMA_IMAGE_TOKEN_ID - - -@pytest.mark.core_model -@pytest.mark.parametrize( - "input_indices_and_output", - # inputs, (cross_attention_mask, kv_range_for_decode) - [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), - ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), - ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - ((23, 24), [[0, 6], [6, 12]])), - ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), - ([TWO_IMAGES], ((18, 12), [[6, 12]])), - ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) -def test_get_cross_attention_mask(input_indices_and_output) -> None: - - input_indices, expected_output = input_indices_and_output - - sequences = [torch.tensor(prompt_data[i]) for i in input_indices] - num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices - if i != TEXT_ONLY] - input = torch.cat(sequences) - - seq_lens = [len(s) for s in sequences] - - attn_data = FlashAttentionMetadata( - seq_lens=seq_lens, - # Dummy values - enable_kv_scales_calculation=False, - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=0, - slot_mapping=0, - multi_modal_placeholder_index_maps=None, - seq_lens_tensor=0, - max_prefill_seq_len=0, - max_decode_seq_len=0, - context_lens_tensor=None, - block_tables=None, - use_cuda_graph=False, - ) - - dummy = DummyModel() - - cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ - .get_cross_attention_mask(dummy, - input, - attn_data, - num_tiles=num_tiles, - num_tokens_per_tile=3, - dtype=torch.bfloat16) - - expected_cross_attention_mask, expected_kv_range_for_decode = \ - expected_output - - assert kv_range_for_decode == expected_kv_range_for_decode - 
if expected_cross_attention_mask is not None: - assert cross_attention_mask is not None - assert cross_attention_mask.shape == expected_cross_attention_mask - else: - assert cross_attention_mask is None - - -@pytest.mark.core_model -@pytest.mark.parametrize( - "input_indices", - [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], - [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) -def test_get_full_text_row_masked_out_mask(input_indices) -> None: - - sequences = [torch.tensor(prompt_data[i]) for i in input_indices] - - seq_lens = [len(s) for s in sequences] - - num_prefill_tokens = sum(seq_lens) - - # TEXT_ONLY is zero, so it will be masked out, - # other instances should not be. - encoder_seq_lens = [int(i) for i in input_indices] - - attn_data = FlashAttentionMetadata( - seq_lens=seq_lens, - encoder_seq_lens=encoder_seq_lens, - num_prefill_tokens=num_prefill_tokens, - # Dummy values - enable_kv_scales_calculation=False, - num_prefills=0, - num_decode_tokens=0, - slot_mapping=0, - multi_modal_placeholder_index_maps=None, - seq_lens_tensor=0, - max_prefill_seq_len=0, - max_decode_seq_len=0, - context_lens_tensor=None, - block_tables=None, - use_cuda_graph=False, - ) - - dummy = DummyModel() - - full_text_row_masked_out_mask = MllamaForConditionalGeneration\ - .get_full_text_row_masked_out_mask(dummy, - attn_data, - torch.get_default_device()) - - full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() - full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() - - idx = 0 - assert len(full_text_row_masked_out_mask) == num_prefill_tokens - for i, seq_len in enumerate(seq_lens): - must_be_masked = input_indices[i] != TEXT_ONLY - for _ in range(seq_len): - assert full_text_row_masked_out_mask[idx] == must_be_masked, \ - f"full_text_row_masked_out_mask[{idx}] must be " \ - f"'{must_be_masked}' " - idx += 1 - - -@pytest.mark.core_model -@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [ - ([6404], [[4]], [6404]), - ([0, 6404], [[4]], [6404]), - ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]), - ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]), -]) -def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles, - expected) -> None: - - dummy = DummyModel() - num_tokens_per_tile = 1601 - actual_encoder_seq_lens = MllamaForConditionalGeneration \ - ._get_and_validate_encoder_lens( - dummy, - encoder_seq_lens, - num_tiles, - num_tokens_per_tile, - ) - assert actual_encoder_seq_lens == expected, \ - f"Expected {expected} but got {actual_encoder_seq_lens}" diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8bd93bd838fed..a272c840f8dac 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -167,8 +167,6 @@ def _test_processing_correctness( # incorrect token ids. So we need use `add_special_tokens=False` here # to leave bos_token to be added by the processor. 
_ADD_SPECIAL_TOKENS_OVERRIDES = { - "donut": False, - "mllama": False, "ovis": False, "ovis2_5": False, "paligemma": False, @@ -278,9 +276,7 @@ def _test_processing_correctness_one( "facebook/chameleon-7b", "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", - "naver-clova-ix/donut-base-finetuned-docvqa", "baidu/ERNIE-4.5-VL-28B-A3B-PT", - "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", "google/gemma-3n-E2B-it", @@ -305,7 +301,6 @@ def _test_processing_correctness_one( "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", "mispeech/midashenglm-7b", "openbmb/MiniCPM-Llama3-V-2_5", diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py deleted file mode 100644 index b42d3f89f3cbf..0000000000000 --- a/tests/models/multimodal/processing/test_mllama.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for mllama's multimodal preprocessing and profiling.""" -import pytest -from transformers import MllamaConfig - -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.profiling import MultiModalProfiler - -from ...utils import build_model_context - - -@pytest.mark.parametrize("model_id", - ["meta-llama/Llama-3.2-11B-Vision-Instruct"]) -@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072]) -@pytest.mark.parametrize("max_num_seqs", [1, 2, 8]) -def test_profiling( - model_id: str, - max_model_len: int, - max_num_seqs: int, -): - # regression test for https://github.com/vllm-project/vllm/issues/13929 - from vllm.model_executor.models.mllama import calc_token_per_chunk - - model_config_kwargs = { - "max_model_len": max_model_len, - } - ctx = build_model_context( - model_id, - model_config_kwargs=model_config_kwargs, - limit_mm_per_prompt={"image": 1}, - ) - - mm_config = ctx.get_mm_config() - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) - profiler = MultiModalProfiler(processor) - - dummy_encoder_data = profiler.get_encoder_dummy_data( - max_model_len, - mm_counts=mm_config.limit_per_prompt, - ) - dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs( - max_model_len, - mm_counts=mm_config.limit_per_prompt, - ) - - hf_config = ctx.get_hf_config(MllamaConfig) - image_size = hf_config.vision_config.image_size - encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) - ] * max_num_seqs - - mm_data = processor.apply( - prompt=dummy_mm_data.prompt, - mm_data=dummy_mm_data.mm_data, - hf_processor_mm_kwargs=dict(), - )["mm_kwargs"].get_data() - - # Get the actual number of encoder tokens for each sample. - # Because attn_metadata.encoder_seq_lens only counts the last - # group of images for each sample, which is used to cheat the - # block manager to allocate blocks for those images only. - # See MllamaMultiModalProcessor for more details. - num_tiles = [[t] for t in mm_data.pop("num_tiles")] - num_tokens_per_tile = calc_token_per_chunk(image_size) - actual_encoder_seq_lens = [ - sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles - ] - - # simulate mllama image-present prefill. 
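# Each sample's actual encoder length covers every image group, while
# attn_metadata.encoder_seq_lens only tracks the last group (see the
# comment above), so the assertion below is a lower-bound check.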
- for actual_len, last_group_len in zip(actual_encoder_seq_lens, - encoder_seq_lens): - assert actual_len >= last_group_len diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 3b87b669dbbe3..b678313752d65 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", - "Florence2ForConditionalGeneration": "not supported in V1", } ARCH_NEEDS_EXTRAS = [ "InternVLChatModel", diff --git a/tests/models/registry.py b/tests/models/registry.py index 3424cb6e7e7d4..9aef08769fb22 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -354,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), - # [Encoder-decoder] - "BartModel": _HfExamplesInfo("facebook/bart-base"), - "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), - "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 - hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { @@ -496,7 +491,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 max_model_len=10240, - extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501 + extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501 ), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 @@ -583,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False, ), # [Encoder-decoder] - "DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa", # noqa: E501 - hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"}, # noqa: E501 - extras={"dolphin": "ByteDance/Dolphin"}), # noqa: E501 - # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer - # Therefore, we borrow the BartTokenizer from the original Bart model - "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 - tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 - trust_remote_code=True), # noqa: E501 - "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 # [Cross-encoder] "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 0e18c45a21eeb..56b5d32d16536 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when # L4 supports FA3. m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") - if model_arch == "Florence2ForConditionalGeneration": - # An encoder-decoder model that's V0-only. Just skip it - # since V0 is about to be removed. 
- pytest.skip("Skipping Florence2ForConditionalGeneration") if model_arch == "WhisperForConditionalGeneration": m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") LLM( diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 36882aba5e941..f67d4017eeeec 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -50,7 +50,6 @@ def test_registry_imports(model_arch): @create_new_process_for_each_test() @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [ ("LlamaForCausalLM", False, False, False), - ("MllamaForConditionalGeneration", True, False, False), ("LlavaForConditionalGeneration", True, True, False), ("BertForSequenceClassification", False, False, True), ("RobertaForSequenceClassification", False, False, True), diff --git a/tests/test_config.py b/tests/test_config.py index 8db04de5469ad..6e37bdbee59eb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -299,9 +299,8 @@ def test_rope_customization(): reason="Encoder Decoder models not supported on ROCm.") @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ ("facebook/opt-125m", False), - ("facebook/bart-base", True), + ("openai/whisper-tiny", True), ("meta-llama/Llama-3.2-1B-Instruct", False), - ("meta-llama/Llama-3.2-11B-Vision", True), ]) def test_is_encoder_decoder(model_id, is_encoder_decoder): config = ModelConfig(model_id) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 6dbba18b4dcfa..608f517f69145 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): - # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. 
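# Binding encoder/decoder KV caches is a V0-only code path, hence the
# test below pins VLLM_USE_V1=0 before constructing the BART-style
# attention layers.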
- with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - - from vllm.attention import Attention, AttentionType - - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } - - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] - - def test_bind_kv_cache_pp(): with patch("vllm.utils.cuda_device_count_stateless", lambda: 2): # this test runs with 1 GPU, but we simulate 2 GPUs diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index efa604dd6b5a8..794c1f68f1471 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -9,24 +9,9 @@ from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -UNSUPPORTED_MODELS_V1 = [ - "facebook/bart-large-cnn", # encoder decoder -] - MODEL = "meta-llama/Llama-3.2-1B-Instruct" -@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) -def test_reject_unsupported_models(monkeypatch, model): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - args = AsyncEngineArgs(model=model) - - with pytest.raises(NotImplementedError): - _ = args.create_engine_config() - m.delenv("VLLM_USE_V1") - - def test_reject_bad_config(monkeypatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") @@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch): assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - # Should fall back to V0 for supported model. 
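# "supported" here means supported by V0: the model below comes from
# UNSUPPORTED_MODELS_V1, so the oracle is expected to fall back to V0
# rather than reject the request.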
- _ = AsyncEngineArgs( - model=UNSUPPORTED_MODELS_V1[0]).create_engine_config() - assert not envs.VLLM_USE_V1 - m.delenv("VLLM_USE_V1") - def test_v1_llm_by_default(monkeypatch): with monkeypatch.context() as m: diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py deleted file mode 100644 index 35ac90b38e840..0000000000000 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ /dev/null @@ -1,648 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools - -import pytest -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.platforms import current_platform -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner - -BATCH_SIZES = [1, 4, 16, 64, 256] - - -def _create_model_runner(model: str, *args, - **kwargs) -> EncoderDecoderModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = EncoderDecoderModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output - for empty seq group list""" - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - ( - input_tokens, - input_positions, - encoder_input_tokens, - encoder_input_positions, - attn_metadata, - return_seq_lens, - ) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.encoder_input_tokens, - model_input.encoder_input_positions, - model_input.attn_metadata, - model_input.seq_lens, - ) - assert input_tokens is None - assert input_positions is None - assert encoder_input_tokens is None - assert encoder_input_positions is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -def test_prepare_prompt(batch_size): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce prefill-phase model inputs & attention metadata. - - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. 
no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify input metadata is correct for prompts. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. 
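# query_start_loc is the cumulative sum of seq_lens with a leading zero,
# e.g. seq_lens == [3, 5] yields start_loc == [0, 3, 8].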
- start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs & context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device), - ) - - # Verify block tables are correct for prompts - # - Decoder self-attention - expected = torch.tensor( - [[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Cuda graph should not be used for prefill. - assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == sum(encoder_seq_lens) - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. the end of - # each sequence) in the prefill phase - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - # Compute the index offset of the final token in each - # prompt (recall that the prompts are concatenated) - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce decode-phase model inputs & attention metadata. 
- - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * multiple_seqs_per_seq_group - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify input metadata is correct for decode phase. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. 
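# During decode each sequence contributes exactly one query token, so
# query_start_loc advances by 1 per sequence regardless of seq_lens,
# e.g. three sequences yield start_loc == [0, 1, 2, 3].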
- start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += 1 - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - # Test seq_start_loc and context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.tensor([seq_len - 1 for seq_len in seq_lens], - dtype=torch.int, - device=device)) - - # Verify block tables are correct for prompts - # - Decoder self-attention - flattened_block_tables = [ - block_table for block_table in block_tables.values() - ] - expected = torch.tensor(flattened_block_tables * - len(seq_group_metadata_list), - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - expected = torch.tensor([ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ], - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. - assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(seq_lens) - assert len(input_positions) == len(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. the end of - # each sequence) in the decode phase - - expected_selected_token_indices = [] - for selected_token_start_idx, seq_len in enumerate(seq_lens): - # Compute the index offset of the final token in each - # sequence's decoded outputs; since a single token is - # decoded per iteration per sequence, then the length - # of the decoded tokens for a given sequence is 1 and - # the final index offset into a given sequence's - # generated tokens is 0 (i.e. 
the expected sampling index - # for a given sequence is just `selected_token_start_idx`) - expected_selected_token_indices.append(selected_token_start_idx) - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257))) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): - """ - Tests that for encoder-decoder models with CUDA Graph capture and replay - enabled, the tensors used during the decode phase are correctly padded - for varying input batch sizes. - """ - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=False, - ) - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - - cross_block_table = [2] - expanded_batch_size = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - expanded_batch_size = expanded_batch_size + len( - seq_group_metadata.seq_data) - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - - # With CUDA Graph capture and replay enabled, the decoder and encoder - # input sequences will be padded. Create the expected padded tensors - # accordingly. 
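# A minimal sketch of the padding arithmetic, with hypothetical capture
# sizes (vLLM derives the real ones from its CUDA graph configuration):
#
#     def pad_for_cudagraph(batch_size, capture_sizes=(1, 2, 4, 8, 16)):
#         # Round the batch up to the smallest captured graph size.
#         return min(s for s in capture_sizes if s >= batch_size)
#
#     pad_for_cudagraph(5)  # -> 8, i.e. 3 dummy sequences of length 1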
- graph_batch_size = model_runner.vllm_config.pad_for_cudagraph( - expanded_batch_size) - cuda_graph_pad_size = graph_batch_size - expanded_batch_size - padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) - padded_encoder_seq_lens = encoder_seq_lens + list( - itertools.repeat(1, cuda_graph_pad_size)) - - assert return_seq_lens == padded_seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal( - attn_metadata.seq_lens_tensor, - torch.tensor(padded_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == padded_seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens) - - # Verify block tables are correct for prompts - # - Decoder self-attention. Pad the block tables as expected. - flattened_block_tables = [ - block_table for _ in range(len(seq_group_metadata_list)) - for block_table in block_tables.values() - ] - flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - flattened_block_tables, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention. Pad the cross-attention block tables - # as expected. - expected = [ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ] - expected.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - expected, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. 
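# This test builds the model runner with enforce_eager=False, so unlike
# the prefill tests above (where CUDA graphs are never used), the decode
# metadata is expected to report use_cuda_graph=True.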
- assert attn_metadata.use_cuda_graph is True - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(padded_seq_lens) - assert len(input_positions) == len(padded_seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 599eeb63bf804..0847fba878aa2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1201,11 +1201,8 @@ class ModelConfig: getattr(self.hf_config, "max_source_positions", 0)) self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, effective_max_seq_len) - # CUDAGraph capture not supported for enc-dec models and mllama on ROCm - ROCM_UNSUPPORTED_MODELS = ['mllama'] - unsupported_rocm = (self.hf_config.model_type - in ROCM_UNSUPPORTED_MODELS - or self.is_encoder_decoder) + # CUDAGraph capture not supported for encoder-decoder models on ROCm + unsupported_rocm = self.is_encoder_decoder if (unsupported_rocm and not self.enforce_eager and current_platform.is_rocm()): @@ -1671,10 +1668,6 @@ class ModelConfig: @property def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" - """ - For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to - True to enable cross-attention - """ return is_encoder_decoder(self.hf_config) @property diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f25530fc9dac8..0fdd651425b90 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1789,7 +1789,7 @@ class LLMEngine: assert isinstance(mm_processor, EncDecMultiModalProcessor) if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper and Donut + return # Skip encoder length check for Whisper if model_config.is_multimodal_model: suggestion = ( diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py deleted file mode 100644 index 242530817c642..0000000000000 --- a/vllm/model_executor/models/bart.py +++ /dev/null @@ -1,1319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Derived from BART implementation posted on HuggingFace; license below: -# -# coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BART model.""" -import math -from collections.abc import Iterable -from typing import Optional - -import torch -from torch import nn -from transformers import BartConfig -from transformers.utils import logging - -from vllm.attention import Attention, AttentionType -from vllm.config import CacheConfig, VllmConfig -from vllm.config.lora import LoRAConfig -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - QKVCrossParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsQuant, SupportsV0Only -from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, - maybe_prefix) - -logger = logging.get_logger(__name__) - - -def get_bsz_seq_len(input_ids): - shp = input_ids.shape - ndim = len(shp) - if ndim == 1: - return 1, input_ids.numel() - else: - return shp[:2] - - -class BartLearnedPositionalEmbedding(VocabParallelEmbedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int): - # Bart is set up so that if padding_idx is - # specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. - # Other models don't have this hack - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward( - self, - positions: torch.Tensor, - ) -> torch.Tensor: - """`input_ids' shape is expected to be [bsz x seqlen].""" - return super().forward(positions + self.offset) - - -class BartScaledWordEmbedding(VocabParallelEmbedding): - """ - This module overrides VocabParallelEmbedding's - forward by multiplying with embeddings scale. 
- """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - embed_scale: float = 1.0): - super().__init__(num_embeddings, embedding_dim) - self.embed_scale = embed_scale - - def forward(self, input_ids: torch.Tensor) -> torch.Tensor: - return super().forward(input_ids) * self.embed_scale - - -class BartParallelLMHead(ParallelLMHead): - """ - This module overrides ParallelLMHead's - forward by dividing by embeddings scale, - yielding effectively the inverse of - BartScaledWordEmbedding - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - embed_scale: float = 1.0): - super().__init__(num_embeddings, embedding_dim) - self.embed_scale = embed_scale - - def forward(self, input_ids: torch.Tensor) -> torch.Tensor: - return super().forward(input_ids) / self.embed_scale - - -class BartEncoderAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - self.scaling = self.head_dim**-0.5 - - self.qkv_proj = QKVParallelLinear( - self.d_model, - self.d_model // self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartDecoderSelfAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - self.scaling = self.head_dim**-0.5 - - self.qkv_proj = QKVParallelLinear( - self.d_model, - self.d_model // self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
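# e.g. with 16 total KV heads, TP=8 gives 16 / 8 = 2 KV heads per rank,
# while TP=32 would replicate each KV head across 32 / 16 = 2 ranks.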
- assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.DECODER) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartCrossAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - self.scaling = self.head_dim**-0.5 - - # TP sharding sizes is accounted for within "*Parallel" layers. - self.qkv_proj = QKVCrossParallelLinear(self.d_model, - self.d_model // - self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias, - quant_config=quant_config) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
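# Unlike the self-attention modules above, cross-attention projects K/V
# from the encoder output: QKVCrossParallelLinear routes decoder hidden
# states to Q and encoder hidden states to K/V, and the Attention below
# is built with attn_type=ENCODER_DECODER.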
- assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads # No GQA in bart - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER_DECODER) - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - q, k, v = self.qkv_proj(decoder_hidden_states, encoder_hidden_states) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartEncoderLayer(nn.Module): - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = BartEncoderAttention( - embed_dim=self.embed_dim, - num_heads=config.encoder_attention_heads, - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.activation_fn = get_act_fn(config.activation_function) - - ffn_hidden_size = self.embed_dim - ffn_intermediate_size = config.encoder_ffn_dim - ffn_has_bias = True - self.fc1 = ColumnParallelLinear( - ffn_hidden_size, - ffn_intermediate_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - self.act = get_act_fn("gelu") - self.fc2 = RowParallelLinear( - ffn_intermediate_size, - ffn_hidden_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - r""" - Args: - hidden_states: torch.Tensor of *encoder* input embeddings. - Returns: - Encoder layer output torch.Tensor - """ - residual = hidden_states - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() - or torch.isnan(hidden_states).any()): - hidden_states = cast_overflow_tensors(hidden_states) - - return hidden_states - - -class BartDecoderLayer(nn.Module): - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = BartDecoderSelfAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - self.activation_fn = get_act_fn(config.activation_function) - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - ''' - afeldman-nm: personally I would call this "cross-attention", - however I left the name as "encoder_attn" to maintain consistency - with the name of the pretrained weights. 
- ''' - self.encoder_attn = BartCrossAttention( - self.embed_dim, - config.decoder_attention_heads, - config=config, - prefix=f"{prefix}.encoder_attn", - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - - ffn_hidden_size = self.embed_dim - ffn_intermediate_size = config.encoder_ffn_dim - ffn_has_bias = True - self.fc1 = ColumnParallelLinear( - ffn_hidden_size, - ffn_intermediate_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - self.fc2 = RowParallelLinear( - ffn_intermediate_size, - ffn_hidden_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - decoder_hidden_states: torch.Tensor of *decoder* input embeddings. - encoder_hidden_states: torch.Tensor of *encoder* input embeddings. - Returns: - Decoder layer output torch.Tensor - """ - residual = decoder_hidden_states - - # Self Attention - hidden_states = self.self_attn(hidden_states=decoder_hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - - residual = hidden_states - - hidden_states = self.encoder_attn( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # Fully Connected - residual = hidden_states - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - return hidden_states - - -class BartEncoder(nn.Module): - """ - Transformer encoder consisting of *config.encoder_layers* - self attention layers. Each layer is a [`BartEncoderLayer`]. - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = ""): - super().__init__() - - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - embed_dim = config.d_model - self.max_source_positions = config.max_position_embeddings - embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - embed_dim, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) - self.layers = nn.ModuleList([ - BartEncoderLayer(config, - cache_config, - quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.encoder_layers) - ]) - - self.layernorm_embedding = nn.LayerNorm(embed_dim) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *encoder* input sequence tokens in the - vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *encoder* input sequence tokens. 
Returns: - Encoder output torch.Tensor - """ - # retrieve input_ids and inputs_embeds - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - embed_pos = self.embed_positions(positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - for encoder_layer in self.layers: - hidden_states = encoder_layer(hidden_states=hidden_states) - - return hidden_states - - -class BartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers. - Each layer is a [`BartDecoderLayer`]. - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = "", - ): - super().__init__() - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - self.max_target_positions = config.max_position_embeddings - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) - - self.layers = nn.ModuleList( - [BartDecoderLayer(config, cache_config, quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.decoder_layers)]) - - self.layernorm_embedding = nn.LayerNorm(config.d_model) - - def forward( - self, - decoder_input_ids: torch.Tensor, - decoder_positions: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - decoder_input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - decoder_positions: Positions of *decoder* input sequence tokens. - encoder_hidden_states: Tensor of encoder output embeddings.
- Returns: - Decoder output torch.Tensor - """ - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(decoder_input_ids) - else: - decoder_positions = inputs_embeds[:, -1] - - # embed positions - embed_pos = self.embed_positions(decoder_positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - # decoder layers - - for decoder_layer in self.layers: - hidden_states = decoder_layer( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - return hidden_states - - -class BartModel(nn.Module, SupportsQuant): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - - self.encoder = BartEncoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = BartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. 
- Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if encoder_input_ids.numel() > 0: - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - other_weights = [] - loaded_stacked_params = [] - model_params_dict = dict(self.named_parameters()) - - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if name not in model_params_dict: - continue - param = model_params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - loaded_stacked_params.append(name) - break - else: - if name in model_params_dict: - other_weights.append((name, loaded_weight)) - - loader = AutoWeightsLoader(self) - loaded_params = loader.load_weights(other_weights) - loaded_params.update(loaded_stacked_params) - return loaded_params - - -class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "decoder.": "model.decoder.", - "encoder.": "model.encoder.", - "shared.": "model.shared." - }, - orig_to_new_substr={ - "beta": "bias", - "gamma": "weight", - "LayerNorm": "layernorm", - }, - ) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - super().__init__() - config = vllm_config.model_config.hf_config - lora_config = vllm_config.lora_config - # currently all existing BART models have `tie_word_embeddings` enabled - assert config.tie_word_embeddings - self.config = config - self.model = BartModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.lm_head = BartParallelLMHead(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. - encoder_positions: torch.Tensor of *encoder* position indices. 
- Returns: - Output torch.Tensor - """ - return self.model(input_ids, positions, encoder_input_ids, - encoder_positions) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - weights_tuple_list = list(weights) - - shared_embedding_weight = None - for name, loaded_weight in weights_tuple_list: - if ('shared.weight' in name - or 'encoder.embed_tokens.weight' in name - or 'decoder.embed_tokens.weight' in name - or 'lm_head.weight' in name): - assert shared_embedding_weight is None, ( - "Conflicting embedding weights.") - shared_embedding_weight = loaded_weight - - loader = AutoWeightsLoader( - self, - skip_prefixes=(["cls.", "pooler."]), - ) - loaded_params = loader.load_weights(weights_tuple_list, - mapper=self.hf_to_vllm_mapper) - - if shared_embedding_weight is not None: - weight_loader = getattr(self.lm_head.weight, "weight_loader", - default_weight_loader) - weight_loader(self.lm_head.weight, shared_embedding_weight) - - self.model.encoder.embed_tokens.weight = self.lm_head.weight - self.model.decoder.embed_tokens.weight = self.lm_head.weight - loaded_params.update({ - 'model.encoder.embed_tokens.weight', 'lm_head.weight', - 'model.decoder.embed_tokens.weight' - }) - - return loaded_params - - -class MBartEncoderLayer(BartEncoderLayer): - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - r""" - Args: - hidden_states: torch.Tensor of *encoder* input embeddings. - Returns: - Encoder layer output torch.Tensor - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() - or torch.isnan(hidden_states).any()): - hidden_states = cast_overflow_tensors(hidden_states) - - return hidden_states - - -class MBartDecoderLayer(BartDecoderLayer): - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - residual = decoder_hidden_states - hidden_states = self.self_attn_layer_norm(decoder_hidden_states) - - # Self Attention - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - - # Cross-Attention Block - - residual = hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - hidden_states = self.encoder_attn( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - - return hidden_states - - -class MBartEncoder(nn.Module): - """ - Transformer encoder consisting of *config.encoder_layers* - self attention layers. 
Each layer is a [`MBartEncoderLayer`]. - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = ""): - super().__init__() - - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - embed_dim = config.d_model - self.max_source_positions = config.max_position_embeddings - embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - embed_dim, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) - self.layers = nn.ModuleList([ - MBartEncoderLayer(config, - cache_config, - quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.encoder_layers) - ]) - - self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.layer_norm = nn.LayerNorm(config.d_model)  # MBart adds a final encoder LayerNorm (absent in BART) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *encoder* input sequence tokens in the - vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *encoder* input sequence tokens. - Returns: - Encoder output torch.Tensor - """ - # retrieve input_ids and inputs_embeds - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - embed_pos = self.embed_positions(positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - for encoder_layer in self.layers: - hidden_states = encoder_layer(hidden_states=hidden_states) - - hidden_states = self.layer_norm(hidden_states) - return hidden_states - - -class MBartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers.
- Each layer is a [`MBartDecoderLayer`]. - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = "", - ): - super().__init__() - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - self.max_target_positions = config.max_position_embeddings - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) - - self.layers = nn.ModuleList( - [MBartDecoderLayer(config, cache_config, quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.decoder_layers)]) - - self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.layer_norm = nn.LayerNorm(config.d_model) - - def forward( - self, - decoder_input_ids: torch.Tensor, - decoder_positions: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - decoder_input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - decoder_positions: Positions of *decoder* input sequence tokens. - encoder_hidden_states: Tensor of encoder output embeddings. - Returns: - Decoder output torch.Tensor - """ - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(decoder_input_ids) - else: - decoder_positions = inputs_embeds[:, -1] - - # embed positions - embed_pos = self.embed_positions(decoder_positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - # decoder layers - - for decoder_layer in self.layers: - hidden_states = decoder_layer( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = self.layer_norm(hidden_states) - return hidden_states - - -class MBartModel(nn.Module, SupportsQuant): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - - self.encoder = MBartEncoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = MBartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input sequence tokens - in the vocabulary.
- Padding will be ignored by default should you provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. - Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if encoder_input_ids.numel() > 0: - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - -class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): - base_model_prefix = "model" - - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "decoder.": "model.decoder.", - "encoder.": "model.encoder.", - "shared.": "model.shared." - }, - orig_to_new_substr={ - "beta": "bias", - "gamma": "weight", - "LayerNorm": "layernorm", - }, - ) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - lora_config = vllm_config.lora_config - assert config.tie_word_embeddings - self.config = config - self.model = MBartModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.lm_head = BartParallelLMHead(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - return self.model(input_ids, positions, encoder_input_ids, - encoder_positions) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - model_params_dict = dict(self.named_parameters()) - loaded_params = set() - remaining_weights = [] - shared_embedding_weight = None - - for name, loaded_weight in weights: - if any(skip in name - for skip in ["cls.", "pooler.", "final_logits_bias"]): - continue - if any(embed_name in name for embed_name in [ - 'shared.weight', 'encoder.embed_tokens.weight', - 'decoder.embed_tokens.weight' - ]): - if shared_embedding_weight is None: - shared_embedding_weight = loaded_weight - continue - is_stacked = False - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - vllm_name = name - for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( - ): - vllm_name = vllm_name.replace(src, dst) - for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( - ): - if 
vllm_name.startswith(src): - vllm_name = dst + vllm_name[len(src):] - break - vllm_name = vllm_name.replace(weight_name, param_name) - if vllm_name in model_params_dict: - param = model_params_dict[vllm_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, shard_id) - loaded_params.add(vllm_name) - is_stacked = True - break - if not is_stacked: - remaining_weights.append((name, loaded_weight)) - loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) - auto_loaded_params = loader.load_weights(remaining_weights, - mapper=self.hf_to_vllm_mapper) - loaded_params.update(auto_loaded_params) - if shared_embedding_weight is not None: - lm_head_param = self.lm_head.weight - weight_loader = getattr(lm_head_param, "weight_loader", - default_weight_loader) - weight_loader(lm_head_param, shared_embedding_weight) - self.model.encoder.embed_tokens.weight = self.lm_head.weight - self.model.decoder.embed_tokens.weight = self.lm_head.weight - loaded_params.update({ - 'model.encoder.embed_tokens.weight', 'lm_head.weight', - 'model.decoder.embed_tokens.weight' - }) - return loaded_params diff --git a/vllm/model_executor/models/donut.py b/vllm/model_executor/models/donut.py deleted file mode 100644 index 23f4c6a4f93fc..0000000000000 --- a/vllm/model_executor/models/donut.py +++ /dev/null @@ -1,381 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import torch -import torch.nn as nn -from transformers import BatchFeature, NougatProcessor - -from vllm.config import VllmConfig -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.bart import BartParallelLMHead, MBartDecoder -from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, - SupportsMultiModal, - SupportsV0Only) -from vllm.model_executor.models.swin import SwinModel -from vllm.model_executor.models.utils import (AutoWeightsLoader, - _flatten_embeddings, flatten_bn) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptIndexTargets, PromptInsertion, - PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.utils.tensor_schema import TensorSchema, TensorShape - - -class MBartDecoderWrapper(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.decoder = MBartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, *args, **kwargs): - return self.decoder(*args, **kwargs) - - -class DonutLanguageForConditionalGeneration(nn.Module, SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - - self.config = config - self.model = 
MBartDecoderWrapper(vllm_config=vllm_config, - prefix=f"{prefix}.model") - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.vocab_size = config.vocab_size - self.lm_head = BartParallelLMHead(self.vocab_size, - config.d_model, - embed_scale=embed_scale) - - self.logits_processor = LogitsProcessor(self.vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - Returns: - Output torch.Tensor - """ - - return self.model(decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=inputs_embeds) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "final_logits_bias" in name: - continue - # if self.config.tie_word_embeddings and "embed_tokens" in name: - # continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class DonutImagePixelInputs(TensorSchema): - """ - Dimensions: - - b: Batch size - - c: Number of channels (3) - - h: Height - - w: Width - """ - type: Literal["pixel_values"] - data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")] - - -class DonutProcessingInfo(BaseProcessingInfo): - - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_hf_processor(self): - return self.ctx.get_hf_processor() - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_num_image_tokens(self) -> int: - return 1 - - -class DonutDummyInputsBuilder(BaseDummyInputsBuilder[DonutProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - return "" - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = self.info.get_hf_config( - ).encoder.image_size - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class DonutMultiModalProcessor(EncDecMultiModalProcessor[DonutProcessingInfo]): - - def _hf_processor_applies_updates( - self, - prompt_text: str, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> bool: - return False - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt 
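A note on the `embed_scale` idiom that recurs in these BART-family modules (here in DonutLanguageForConditionalGeneration, and earlier in the MBart encoder and decoder): when `scale_embedding` is set, token embeddings are multiplied by sqrt(d_model) before positional embeddings are added, following the original Transformer. Below is a minimal, self-contained sketch of that behavior; the sizes are illustrative assumptions, not values taken from this patch.

import math

import torch
import torch.nn as nn

d_model, vocab_size = 1024, 50265  # assumed BART-large-like sizes, illustration only
scale_embedding = True
embed_scale = math.sqrt(d_model) if scale_embedding else 1.0  # 32.0 here

embed_tokens = nn.Embedding(vocab_size, d_model)
token_ids = torch.tensor([[0, 31414, 2]])
# A scaled word embedding returns the raw lookup times embed_scale,
# which is the behavior BartScaledWordEmbedding provides in this file.
inputs_embeds = embed_tokens(token_ids) * embed_scale
print(inputs_embeds.shape)  # torch.Size([1, 3, 1024])

This also suggests why `BartParallelLMHead` above accepts the same `embed_scale`: with tied input/output embeddings, the logit projection presumably has to apply the matching scale.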
- - def create_decoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt - - @property - def pad_dummy_encoder_prompt(self) -> bool: - return True - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - hf_processor = self.info.get_hf_processor() - if mm_data: - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs, tok_kwargs) - if isinstance(hf_processor, NougatProcessor): - processed_outputs["input_ids"] = processed_outputs["labels"] - else: - tokenizer = hf_processor.tokenizer - processed_outputs = tokenizer(prompt, - add_special_tokens=False, - return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - hf_processor = self.info.get_hf_processor() - tokenizer = hf_processor.tokenizer - pad_token_id = tokenizer.pad_token_id - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [pad_token_id] * num_image_tokens - - return [ - PromptInsertion( - modality="image", - target=PromptIndexTargets.start(), - insertion=image_tokens, - ) - ] - - -@MULTIMODAL_REGISTRY.register_processor(DonutMultiModalProcessor, - info=DonutProcessingInfo, - dummy_inputs=DonutDummyInputsBuilder) -class DonutForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - processor_config = vllm_config.model_config.hf_image_processor_config - - self.config = config - self.vision_config = config.encoder - self.processor_config = processor_config - self.encoder = SwinModel(config=config.encoder) - - self.decoder = DonutLanguageForConditionalGeneration( - vllm_config=vllm_config.with_hf_config(config.decoder), - prefix=f"{prefix}.decoder", - ) - self.pad_token_id = config.pad_token_id - - def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - h, w = self.config.encoder.image_size - return DonutImagePixelInputs(type="pixel_values", - data=flatten_bn(pixel_values, - concat=True), - resolve_bindings={ - "h": h, - "w": w, - }) - - if image_embeds is not None: - raise NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _process_image_input( - self, image_input: DonutImagePixelInputs) -> torch.Tensor: - assert image_input["type"] == "pixel_values" - pixel_values = image_input["data"] - dtype = next(self.encoder.parameters()).dtype - pixel_values = pixel_values.to(dtype) - return self.encoder(pixel_values) - - 
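`_process_image_input` above is the encoder half of Donut's encoder-decoder design: pixels go through the Swin encoder once, and `forward()` below hands the resulting features to the MBart decoder as `encoder_hidden_states` for cross-attention. A toy, self-contained sketch of that handoff follows; the stub modules and every shape in it are assumptions for illustration, not code from this patch.

import torch
import torch.nn as nn


class ToyVisionEncoder(nn.Module):
    """Stand-in for SwinModel: pixels -> a sequence of feature vectors."""

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch = pixel_values.shape[0]
        # [batch, num_patches, d_model] with made-up sizes
        return torch.randn(batch, 16, 8)


class ToyDecoder(nn.Module):
    """Stand-in for the MBart decoder, which cross-attends over encoder states."""

    def forward(self, input_ids: torch.Tensor,
                encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # Real cross-attention is replaced by mean pooling for brevity.
        return encoder_hidden_states.mean(dim=1)


encoder, decoder = ToyVisionEncoder(), ToyDecoder()
pixels = torch.randn(2, 3, 224, 224)  # [batch, channels, height, width]
hidden = decoder(torch.tensor([[0], [0]]), encoder(pixels))
print(hidden.shape)  # torch.Size([2, 8])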
def get_language_model(self) -> torch.nn.Module: - return self.decoder - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings, - ) -> torch.Tensor: - return _flatten_embeddings(multimodal_embeddings) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. - encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - - inputs_embeds = None - if encoder_input_ids.numel() > 0: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(encoder_input_ids, - vision_embeddings) - - hidden_states = self.decoder(input_ids, - positions, - inputs_embeds=inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.decoder.compute_logits(hidden_states, sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py deleted file mode 100644 index 5e05e0c60f41c..0000000000000 --- a/vllm/model_executor/models/florence2.py +++ /dev/null @@ -1,1097 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from collections import OrderedDict -from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers import BartTokenizer, BatchFeature, PretrainedConfig - -from vllm.config import VllmConfig -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, - BartParallelLMHead, - BartScaledWordEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptIndexTargets, PromptInsertion, - PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.sequence import IntermediateTensors -from vllm.utils.tensor_schema import TensorSchema, TensorShape - -from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, - SupportsV0Only) -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings - - -class Florence2ImagePixelInputs(TensorSchema): - """ - Dimensions: - - b: Batch size 
- - c: Number of channels (3) - - h: Height of the image - - w: Width of the image - """ - - type: Literal["pixel_values"] - - data: Annotated[ - torch.Tensor, - TensorShape("b", 3, "h", "w"), - ] - - -# The ViT implementation is copied from -# https://huggingface.co/microsoft/Florence-2-base/blob/main/modeling_florence2.py -class LearnedAbsolutePositionEmbedding2D(nn.Module): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, embedding_dim=256, num_pos=50): - super().__init__() - self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2) - self.column_embeddings = nn.Embedding( - num_pos, embedding_dim - (embedding_dim // 2)) - - def forward(self, pixel_values): - """ - pixel_values: (batch_size, height, width, num_channels) - returns: (batch_size, height, width, embedding_dim * 2) - """ - if len(pixel_values.shape) != 4: - raise ValueError('pixel_values must be a 4D tensor') - height, width = pixel_values.shape[1:3] - width_values = torch.arange(width, device=pixel_values.device) - height_values = torch.arange(height, device=pixel_values.device) - x_emb = self.column_embeddings(width_values) - y_emb = self.row_embeddings(height_values) - # (height, width, embedding_dim * 2) - pos = torch.cat([ - x_emb.unsqueeze(0).repeat(height, 1, 1), - y_emb.unsqueeze(1).repeat(1, width, 1) - ], - dim=-1) - # (embedding_dim * 2, height, width) - pos = pos.permute(2, 0, 1) - pos = pos.unsqueeze(0) - # (batch_size, embedding_dim * 2, height, width) - pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) - # (batch_size, height, width, embedding_dim * 2) - pos = pos.permute(0, 2, 3, 1) - return pos - - -class PositionalEmbeddingCosine1D(nn.Module): - """ - This class implements a very simple positional encoding. It follows closely - the encoder from the link below: - https://pytorch.org/tutorials/beginner/translation_transformer.html - Args: - embed_dim: The dimension of the embeddings. - max_seq_len: The maximum length to precompute the positional encodings. - """ - - def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None: - super().__init__() - self.embed_dim = embed_dim - self.max_seq_len = max_seq_len - # Generate the sinusoidal arrays. - factor = math.log(10000) - denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) / - self.embed_dim) - # Matrix where rows correspond to a positional embedding as a function - # of the position index (i.e., the row index). - frequencies = \ - torch.arange(0, self.max_seq_len) \ - .reshape(self.max_seq_len, 1) * denominator - pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim)) - # Populate even entries with sine and odd entries with cosine. - pos_idx_to_embed[:, 0::2] = torch.sin(frequencies) - pos_idx_to_embed[:, 1::2] = torch.cos(frequencies) - # Save the positional embeddings in a constant buffer. - # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed) - self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed, - requires_grad=False) - - def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor: - """ - Args: - seq_embeds: The sequence embeddings in order. Allowed size: - 1. [T, D], where T is the length of the sequence, and D is the - frame embedding dimension. - 2. [B, T, D], where B is the batch size and T and D are the - same as above. - Returns a tensor with the same dimensions as the input: i.e., - [1, T, D] or [T, D].
- """ - shape_len = len(seq_embeds.shape) - assert 2 <= shape_len <= 3 - len_seq = seq_embeds.size(-2) - assert len_seq <= self.max_seq_len - pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :] - # Adapt pre-computed positional embeddings to the input. - if shape_len == 3: - pos_embeds = pos_embeds.view( - (1, pos_embeds.size(0), pos_embeds.size(1))) - return pos_embeds - - -class MySequential(nn.Sequential): - - def forward(self, *inputs): - for module in self._modules.values(): - if isinstance(inputs, tuple): - inputs = module(*inputs) - else: - inputs = module(inputs) - return inputs - - -class PreNorm(nn.Module): - - def __init__(self, norm, fn): - super().__init__() - self.norm = norm - self.fn = fn - - def forward(self, x, *args, **kwargs): - shortcut = x - if self.norm is not None: - x, size = self.fn(self.norm(x), *args, **kwargs) - else: - x, size = self.fn(x, *args, **kwargs) - - x = shortcut + x - - return x, size - - -class Mlp(nn.Module): - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.net = nn.Sequential( - OrderedDict([("fc1", nn.Linear(in_features, hidden_features)), - ("act", act_layer()), - ("fc2", nn.Linear(hidden_features, out_features))])) - - def forward(self, x, size): - return self.net(x), size - - -class DepthWiseConv2d(nn.Module): - - def __init__( - self, - dim_in, - kernel_size, - padding, - stride, - bias=True, - ): - super().__init__() - self.dw = nn.Conv2d(dim_in, - dim_in, - kernel_size=kernel_size, - padding=padding, - groups=dim_in, - stride=stride, - bias=bias) - - def forward(self, x, size): - B, N, C = x.shape - H, W = size - assert N == H * W - - x = self.dw(x.transpose(1, 2).view(B, C, H, W)) - size = (x.size(-2), x.size(-1)) - x = x.flatten(2).transpose(1, 2) - return x, size - - -class ConvEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, - patch_size=7, - in_chans=3, - embed_dim=64, - stride=4, - padding=2, - norm_layer=None, - pre_norm=True): - super().__init__() - self.patch_size = patch_size - - self.proj = nn.Conv2d(in_chans, - embed_dim, - kernel_size=patch_size, - stride=stride, - padding=padding) - - dim_norm = in_chans if pre_norm else embed_dim - self.norm = norm_layer(dim_norm) if norm_layer else None - - self.pre_norm = pre_norm - - def forward(self, x, size): - H, W = size - if len(x.size()) == 3: - if self.norm and self.pre_norm: - x = self.norm(x) - x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) - - x = self.proj(x) - - _, _, H, W = x.shape - x = rearrange(x, 'b c h w -> b (h w) c') - if self.norm and not self.pre_norm: - x = self.norm(x) - - return x, (H, W) - - -class ChannelAttention(nn.Module): - - def __init__(self, dim, groups=8, qkv_bias=True): - super().__init__() - - self.groups = groups - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) - - def forward(self, x, size): - B, N, C = x.shape - - qkv = self.qkv(x).reshape(B, N, 3, self.groups, - C // self.groups).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * (float(N)**-0.5) - attention = q.transpose(-1, -2) @ k - attention = attention.softmax(dim=-1) - x = (attention @ v.transpose(-1, -2)).transpose(-1, -2) - x = x.transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - return x, size - - -class ChannelBlock(nn.Module): - - def __init__(self, - dim, - groups, - mlp_ratio=4., - qkv_bias=True, - 
drop_path_rate=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - conv_at_attn=True, - conv_at_ffn=True): - super().__init__() - - self.conv1 = PreNorm(None, DepthWiseConv2d( - dim, 3, 1, 1)) if conv_at_attn else None - self.channel_attn = PreNorm( - norm_layer(dim), - ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), - ) - self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, - 1)) if conv_at_ffn else None - self.ffn = PreNorm( - norm_layer(dim), - Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer), - ) - - def forward(self, x, size): - if self.conv1: - x, size = self.conv1(x, size) - x, size = self.channel_attn(x, size) - - if self.conv2: - x, size = self.conv2(x, size) - x, size = self.ffn(x, size) - - return x, size - - -def window_partition(x, window_size: int): - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, - C) - windows = x.permute(0, 1, 3, 2, 4, - 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int): - B = batch_size - - x = windows.view(B, H // window_size, W // window_size, window_size, - window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class WindowAttention(nn.Module): - - def __init__(self, dim, num_heads, window_size, qkv_bias=True): - - super().__init__() - self.dim = dim - self.window_size = window_size - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = float(head_dim)**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) - - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, size): - - H, W = size - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - x = x.view(B, H, W, C) - - pad_l = pad_t = 0 - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) - _, Hp, Wp, _ = x.shape - - x = window_partition(x, self.window_size) - x = x.view(-1, self.window_size * self.window_size, C) - - # W-MSA/SW-MSA - # attn_windows = self.attn(x_windows) - - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - attn = self.softmax(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - - # merge windows - x = x.view(-1, self.window_size, self.window_size, C) - x = window_reverse(x, B, self.window_size, Hp, Wp) - - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :].contiguous() - - x = x.view(B, H * W, C) - - return x, size - - -class SpatialBlock(nn.Module): - - def __init__(self, - dim, - num_heads, - window_size, - mlp_ratio=4., - qkv_bias=True, - drop_path_rate=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - conv_at_attn=True, - conv_at_ffn=True): - super().__init__() - - self.conv1 = PreNorm(None, DepthWiseConv2d( - dim, 3, 1, 1)) if conv_at_attn else None - self.window_attn = PreNorm( - norm_layer(dim), - WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), - ) - self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, - 1)) if conv_at_ffn else None - self.ffn = PreNorm( - norm_layer(dim), - Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer), - ) - - def forward(self, x, size): - if self.conv1: 
- x, size = self.conv1(x, size) - x, size = self.window_attn(x, size) - - if self.conv2: - x, size = self.conv2(x, size) - x, size = self.ffn(x, size) - return x, size - - -class DaViT(nn.Module): - - def __init__( - self, - in_chans=3, - num_classes=1000, - depths=(1, 1, 3, 1), - patch_size=(7, 2, 2, 2), - patch_stride=(4, 2, 2, 2), - patch_padding=(3, 0, 0, 0), - patch_prenorm=(False, False, False, False), - embed_dims=(64, 128, 192, 256), - num_heads=(3, 6, 12, 24), - num_groups=(3, 6, 12, 24), - window_size=7, - mlp_ratio=4., - qkv_bias=True, - drop_path_rate=0.1, - norm_layer=nn.LayerNorm, - enable_checkpoint=False, - conv_at_attn=True, - conv_at_ffn=True, - ): - super().__init__() - - self.num_classes = num_classes - self.embed_dims = embed_dims - self.num_heads = num_heads - self.num_groups = num_groups - self.num_stages = len(self.embed_dims) - self.enable_checkpoint = enable_checkpoint - assert self.num_stages == len(self.num_heads) == len(self.num_groups) - - num_stages = len(embed_dims) - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, - sum(depths) * 2) - ] - - depth_offset = 0 - convs = [] - blocks = [] - for i in range(num_stages): - conv_embed = ConvEmbed( - patch_size=patch_size[i], - stride=patch_stride[i], - padding=patch_padding[i], - in_chans=in_chans if i == 0 else self.embed_dims[i - 1], - embed_dim=self.embed_dims[i], - norm_layer=norm_layer, - pre_norm=patch_prenorm[i]) - convs.append(conv_embed) - - block = MySequential(*[ - MySequential( - OrderedDict([('spatial_block', - SpatialBlock( - embed_dims[i], - num_heads[i], - window_size, - drop_path_rate=dpr[depth_offset + j * 2], - qkv_bias=qkv_bias, - mlp_ratio=mlp_ratio, - conv_at_attn=conv_at_attn, - conv_at_ffn=conv_at_ffn, - )), - ('channel_block', - ChannelBlock( - embed_dims[i], - num_groups[i], - drop_path_rate=dpr[depth_offset + j * 2 + - 1], - qkv_bias=qkv_bias, - mlp_ratio=mlp_ratio, - conv_at_attn=conv_at_attn, - conv_at_ffn=conv_at_ffn, - ))])) for j in range(depths[i]) - ]) - blocks.append(block) - depth_offset += depths[i] * 2 - - self.convs = nn.ModuleList(convs) - self.blocks = nn.ModuleList(blocks) - - self.avgpool = nn.AdaptiveAvgPool1d(1) - - @property - def dim_out(self): - return self.embed_dims[-1] - - def forward_features_unpool(self, x): - """ - forward until avg pooling - Args: - x (_type_): input image tensor - """ - input_size = (x.size(2), x.size(3)) - for conv, block in zip(self.convs, self.blocks): - x, input_size = conv(x, input_size) - x, input_size = block(x, input_size) - return x - - def forward_features(self, x): - x = self.forward_features_unpool(x) - - # (batch_size, num_tokens, token_dim) - x = self.avgpool(x.transpose(1, 2)) - # (batch_size, 1, num_tokens) - x = torch.flatten(x, 1) - x = self.norms(x) - - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - @classmethod - def from_config(cls, config): - return cls( - depths=config.depths, - embed_dims=config.dim_embed, - num_heads=config.num_heads, - num_groups=config.num_groups, - patch_size=config.patch_size, - patch_stride=config.patch_stride, - patch_padding=config.patch_padding, - patch_prenorm=config.patch_prenorm, - drop_path_rate=config.drop_path_rate, - window_size=config.window_size, - ) - - -# Language backbone and processor implementation -class Florence2LanguageModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = 
vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.config = config - - self.vocab_size = config.vocab_size - - self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model) - self.encoder = BartEncoder(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = BartDecoder(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - if self.config.tie_word_embeddings: - self.encoder.embed_tokens.weight = self.shared.weight - self.decoder.embed_tokens.weight = self.shared.weight - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you - provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. - Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if ((inputs_embeds is not None and inputs_embeds.numel() > 0) - or encoder_input_ids.numel() > 0): - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions, - inputs_embeds=inputs_embeds) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - -class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - - self.config = config - self.model = Florence2LanguageModel(vllm_config=vllm_config, - prefix=f"{prefix}.model") - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.vocab_size = config.vocab_size - self.lm_head = BartParallelLMHead(self.vocab_size, - config.d_model, - embed_scale=embed_scale) - if self.config.tie_word_embeddings: - self.lm_head.tie_weights(self.model.shared) - - self.logits_processor = LogitsProcessor(self.vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. 
- encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - - return self.model(input_ids, - positions, - encoder_input_ids, - encoder_positions, - inputs_embeds=inputs_embeds) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.encoder.embed_tokens(input_ids) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "final_logits_bias" in name: - continue - if self.config.tie_word_embeddings and ("embed_tokens" in name - or "lm_head" in name): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class Florence2ProcessingInfo(BaseProcessingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_num_image_tokens(self) -> int: - processor_config = self.ctx.get_hf_image_processor_config() - return processor_config["image_seq_length"] - - -class Florence2DummyInputsBuilder( - BaseDummyInputsBuilder[Florence2ProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - return "" - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width = target_height = self.info.get_hf_config().projection_dim - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class Florence2MultiModalProcessor( - EncDecMultiModalProcessor[Florence2ProcessingInfo]): - - def _hf_processor_applies_updates( - self, - prompt_text: str, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> bool: - return False - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt - - def create_decoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return [self.info.get_hf_config().eos_token_id] - - def _apply_hf_processor_tokens_only( - self, - prompt_tokens: list[int], - ) -> list[int]: - hf_processor = self.info.get_hf_processor() - tokenizer: BartTokenizer = hf_processor.tokenizer - prompt_text = tokenizer.decode(prompt_tokens) - # convert task tokens to prompt - prompt_text = hf_processor._construct_prompts([prompt_text])[0] - prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False) - return prompt_tokens - - def _call_hf_processor( - self, - prompt: str, - mm_data: 
Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - if mm_data: - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs, tok_kwargs) - else: - hf_processor = self.info.get_hf_processor() - tokenizer = hf_processor.tokenizer - prompt = hf_processor._construct_prompts([prompt])[0] - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - pad_token_id = hf_config.pad_token_id - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [pad_token_id] * num_image_tokens - - return [ - PromptInsertion( - modality="image", - target=PromptIndexTargets.start(), - insertion=image_tokens, - ) - ] - - -@MULTIMODAL_REGISTRY.register_processor( - Florence2MultiModalProcessor, - info=Florence2ProcessingInfo, - dummy_inputs=Florence2DummyInputsBuilder) -class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - - @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - if modality.startswith("image"): - return None - - raise ValueError("Only image modality is supported") - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - processor_config = vllm_config.model_config.hf_image_processor_config - - self.config = config - self.vision_config = config.vision_config - self.processor_config = processor_config - assert config.vision_config.model_type == 'davit', ( - 'only DaViT is supported for now') - self.vision_tower = DaViT.from_config(config=config.vision_config) - self._build_image_projection_layers(config) - self.language_model = Florence2LanguageForConditionalGeneration( - vllm_config=vllm_config.with_hf_config(config.text_config), - prefix=f"{prefix}.language_model", - ) - self.pad_token_id = config.pad_token_id - - def _build_image_projection_layers(self, config: PretrainedConfig): - image_dim_out = config.vision_config.dim_embed[-1] - dim_projection = config.vision_config.projection_dim - self.image_projection = nn.Parameter( - torch.empty(image_dim_out, dim_projection)) - self.image_proj_norm = nn.LayerNorm(dim_projection) - image_pos_embed_config = config.vision_config.image_pos_embed - if image_pos_embed_config['type'] == 'learned_abs_2d': - self.image_pos_embed = LearnedAbsolutePositionEmbedding2D( - embedding_dim=image_dim_out, - num_pos=image_pos_embed_config['max_pos_embeddings']) - else: - raise NotImplementedError("Florence2 only supports learned_abs_2d " - "as image position embedding.") - - self.image_feature_source = config.vision_config.image_feature_source - - # temporal embedding - visual_temporal_embedding_config = ( - self.vision_config.visual_temporal_embedding) - if visual_temporal_embedding_config['type'] == 'COSINE': - self.visual_temporal_embed = PositionalEmbeddingCosine1D( - embed_dim=image_dim_out, - max_seq_len=visual_temporal_embedding_config[ - 'max_temporal_embeddings']) - else: - raise NotImplementedError( - 'Florence2 only 
supports COSINE as temporal embedding.') - - def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - size = self.processor_config["size"] - expected_h, expected_w = size["height"], size["width"] - - return Florence2ImagePixelInputs( - type="pixel_values", - data=flatten_bn(pixel_values, concat=True), - resolve_bindings={ - "h": expected_h, - "w": expected_w - }, - ) - - if image_embeds is not None: - raise NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor: - dtype = next(self.vision_tower.parameters()).dtype - pixel_values = pixel_values.to(dtype) - - batch_size, T = pixel_values.size(0), 1 - x = self.vision_tower.forward_features_unpool(pixel_values) - if self.image_pos_embed is not None: - x = x.view(batch_size * T, -1, x.shape[-1]) - num_tokens = x.shape[-2] - h, w = int(num_tokens**0.5), int(num_tokens**0.5) - assert h * w == num_tokens, ( - 'only support square feature maps for now') - x = x.view(batch_size * T, h, w, x.shape[-1]) - pos_embed = self.image_pos_embed(x) - x = x + pos_embed - x = x.view(batch_size, T * h * w, x.shape[-1]) - - if self.visual_temporal_embed is not None: - visual_temporal_embed = self.visual_temporal_embed( - x.view(batch_size, T, -1, x.shape[-1])[:, :, 0]) - x = x.view(batch_size, T, -1, - x.shape[-1]) + visual_temporal_embed.view( - 1, T, 1, x.shape[-1]) - - x_feat_dict = {} - - spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2) - x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x - - temporal_avg_pool_x = x.view(batch_size, T, -1, - x.shape[-1]).mean(dim=1) - x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x - - x = x.view(batch_size, T, -1, x.shape[-1])[:, -1] - x_feat_dict['last_frame'] = x - - new_x = [] - for _image_feature_source in self.image_feature_source: - if _image_feature_source not in x_feat_dict: - raise ValueError('invalid image feature source: {}'.format( - _image_feature_source)) - new_x.append(x_feat_dict[_image_feature_source]) - - x = torch.cat(new_x, dim=1) - - x = x @ self.image_projection - x = self.image_proj_norm(x) - - return x - - def _process_image_input( - self, image_input: Florence2ImagePixelInputs) -> torch.Tensor: - assert image_input["type"] == "pixel_values" - pixel_values = image_input["data"] - return self._encode_image(pixel_values) - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_multimodal_embeddings(self, - **kwargs: object) -> MultiModalEmbeddings: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return [] - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - 
inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.pad_token_id) - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. - encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - if encoder_input_ids.numel() > 0 or vision_embeddings is not None: - inputs_embeds = self.get_input_embeddings(encoder_input_ids, - vision_embeddings) - else: - inputs_embeds = None - - hidden_states = self.language_model(input_ids, - positions, - encoder_input_ids, - encoder_positions, - inputs_embeds=inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py deleted file mode 100644 index 048894085b360..0000000000000 --- a/vllm/model_executor/models/mllama.py +++ /dev/null @@ -1,1697 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2024 the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch Mllama model.""" -import math -from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import numpy as np -import torch -import torch.nn.functional as F -import transformers.models.mllama.configuration_mllama as config_mllama -from PIL.Image import Image -from torch import nn -from transformers import BatchFeature, MllamaConfig -from transformers.modeling_outputs import (BaseModelOutput, - CausalLMOutputWithPast) -from transformers.models.mllama.image_processing_mllama import ( - get_optimal_tiled_canvas) -from transformers.models.mllama.processing_mllama import ( - MllamaProcessor, get_cross_attention_token_mask) - -import vllm.distributed.parallel_state as ps -from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.layer import MultiHeadAttention -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.selector import _Backend -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group, get_tp_group -from vllm.forward_context import get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - QKVCrossParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalFieldConfig, - MultiModalKwargsItems, MultiModalUUIDDict) -from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, - MultiModalDataItems) -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptReplacement, PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.utils.tensor_schema import TensorSchema, TensorShape - -from .clip import CLIPMLP -from .interfaces import SupportsMultiModal, SupportsV0Only -from .llama import LlamaDecoderLayer, LlamaMLP -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix - -logger = init_logger(__name__) - - -class MllamaImagePixelInputs(TensorSchema): - """ - Dimensions: - - batch_size: Batch size - - max_num_image: Max number of images - - max_num_chunk: Max number of chunks - - max_num_tiles: Max number of tiles per image - - num_channel: Number of channels - - height: Height - - width: Width - """ - - type: Literal["pixel_values"] = "pixel_values" - - data: Annotated[torch.Tensor, - TensorShape("batch_size", "max_num_image", "max_num_chunk", - "num_channel", "height", "width")] - - aspect_ratio_ids: Annotated[torch.Tensor, - TensorShape("batch_size", "max_num_image")] - - aspect_ratio_mask: Annotated[ - torch.Tensor, - TensorShape("batch_size", "max_num_image", "max_num_tiles")] - - -# TODO: support LlamaImageEmbeddingInputs - - -def calc_token_per_chunk(image_size: int) -> int: - assert image_size % 14 == 0, "chunk size should be multiple of 14" - token_per_chunk = 
(image_size // 14)**2 + 1 - return token_per_chunk - - -class MllamaProcessingInfo(BaseProcessingInfo): - - def get_hf_config(self) -> MllamaConfig: - return self.ctx.get_hf_config(MllamaConfig) - - def get_hf_processor(self, **kwargs: object) -> MllamaProcessor: - return self.ctx.get_hf_processor(MllamaProcessor, **kwargs) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_token_per_chunk_from_config(self) -> int: - image_size = self.get_hf_config().vision_config.image_size - return calc_token_per_chunk(image_size) - - def get_num_tiles_per_image(self, image_height: int, - image_width: int) -> int: - vision_config = self.get_hf_config().vision_config - max_num_tiles = vision_config.max_num_tiles - image_size = vision_config.image_size - tiled_height, tiled_width = get_optimal_tiled_canvas( - image_height, - image_width, - max_num_tiles, - tile_size=image_size, - ) - num_tiles_height = tiled_height // image_size - num_tiles_width = tiled_width // image_size - return num_tiles_height * num_tiles_width - - def get_image_size_with_most_features(self) -> ImageSize: - vision_config = self.get_hf_config().vision_config - image_size = vision_config.image_size - max_num_tiles = vision_config.max_num_tiles - # Result in the max possible feature size (h:w = 16:1) - return ImageSize(height=max_num_tiles * image_size, width=image_size) - - -class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - - processor = self.info.get_hf_processor() - image_token = processor.image_token - - return image_token * num_images - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = \ - self.info.get_image_size_with_most_features() - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] - ): - - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_uuids: Optional[MultiModalUUIDDict] = None, - ) -> MultiModalEncDecInputs: - mm_inputs = super().apply(prompt, - mm_data, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids) - - image_token_id = self.info.get_hf_config().image_token_index - # Check that the number of image tokens in the decoder prompt matches - # the number of images provided in mm_data - num_image_tokens = mm_inputs['prompt_token_ids'].count(image_token_id) - image_data = mm_data.get("image", []) - num_images = 1 if isinstance(image_data, Image) else len(image_data) - if num_image_tokens != num_images: - raise ValueError( - f"The number of image tokens ({num_image_tokens}) must be" - f" the same as the number of images ({num_images})") - - # Given prompt: <IMG0> P0 P1 <IMG1> <IMG2> P3 P4 D5 D6...., (P-prefill, D-decode) # noqa: E501 - # P0 & P1 do cross attention with placeholder of <IMG0> - # P3 P4 D5 D6 do cross attention with placeholder of <IMG1> and <IMG2> - # Example input to encoder and decoder: - # { - # 'encoder': { - # 'type': 'token', - # 'prompt_token_ids': [128256, 128256, ..., 128256], - # 'prompt': '<|image|><|image|>...<|image|>', - # 'multi_modal_data': {'image': 
<PIL.Image.Image image mode=RGB size=1770x1180 at 0x7FDE2C624880>}, # noqa: E501
-        #     },
-        #     'decoder': {
-        #         'type': 'token',
-        #         'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501
-        #         'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501
-        #         'multi_modal_data': {'image': <PIL.Image.Image image mode=RGB size=1770x1180 at 0x7FDE2C624880>}, # noqa: E501
-        #     },
-        # }
-
-        if mm_data:
-            hf_processor = self.info.get_hf_processor()
-            image_token: str = hf_processor.image_token
-
-            # Since only the last group of consecutive images
-            # is attended to by the decoded tokens, we only need
-            # the token count for those images.
-            token_per_chunk = self.info.get_token_per_chunk_from_config()
-            num_decode_images = self._get_num_image_in_last_group(
-                mm_inputs["prompt_token_ids"])
-            num_encode_images = num_images - num_decode_images
-
-            # Set the encoder prompt length based on the number of tiles.
-            # This tells the block manager to allocate the correct number
-            # of slots for encoder tokens.
-            num_tiles = mm_inputs["mm_kwargs"].get_data()["num_tiles"]
-            decode_tiles = num_tiles[num_encode_images:num_images].sum().item()
-            num_tokens = decode_tiles * token_per_chunk
-            mm_inputs["encoder_prompt_token_ids"] = [image_token_id
-                                                     ] * num_tokens
-            mm_inputs["encoder_prompt"] = image_token * num_tokens
-
-        return mm_inputs
-
-    def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int:
-        num_images = 0
-        for token_id in prompt_token_ids[::-1]:
-            if token_id == self.info.get_hf_config().image_token_index:
-                num_images += 1
-            elif num_images > 0:
-                break
-        return num_images
-
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        tokenizer = self.info.get_tokenizer()
-        if mm_data:
-            num_tiles = [
-                self.info.get_num_tiles_per_image(img.height, img.width)
-                for img in mm_data["images"]
-            ]
-            processed_outputs = super()._call_hf_processor(
-                prompt, mm_data, mm_kwargs, tok_kwargs)
-            processed_outputs["num_tiles"] = torch.tensor(num_tiles)
-            for k in ('pixel_values', 'aspect_ratio_ids', "aspect_ratio_mask"):
-                processed_outputs[k] = processed_outputs[k].squeeze(0)
-
-            processed_token_ids = processed_outputs.pop("input_ids")
-            start_idx, end_idx = 0, processed_token_ids.size(1)
-            processed_prompt_text = tokenizer.decode(processed_token_ids[0])
-
-            hf_processor = self.info.get_hf_processor()
-            bos_token = hf_processor.bos_token
-            # Remove the bos_token from the start of the prompt,
-            # since the prompt is expected to begin with an image token.
-            if processed_prompt_text.startswith(bos_token):
-                start_idx += 1
-            # Remove the bos_token from the end of the prompt,
-            # since the text portion is empty in this case.
- if processed_prompt_text.endswith(bos_token): - end_idx -= 1 - processed_outputs[ - "input_ids"] = processed_token_ids[:, start_idx:end_idx] - else: - processed_outputs = tokenizer(prompt, - add_special_tokens=False, - return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - aspect_ratio_ids=MultiModalFieldConfig.batched("image"), - aspect_ratio_mask=MultiModalFieldConfig.batched("image"), - num_tiles=MultiModalFieldConfig.batched("image"), - ) - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - data = mm_data.get("image", []) - num_images = 1 if isinstance(data, Image) else len(data) - image_token_id = self.info.get_hf_config().image_token_index - return [image_token_id] * num_images - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - token_per_chunk = self.info.get_token_per_chunk_from_config() - image_token_id = self.info.get_hf_config().image_token_index - - def get_replacement_mllama(item_idx): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - num_tile = self.info.get_num_tiles_per_image( - image_height=image_size.height, - image_width=image_size.width, - ) - num_tokens = num_tile * token_per_chunk - return [image_token_id] * num_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement_mllama, - ) - ] - - -def _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask: torch.Tensor, - num_patches: int, - target_length: int, - dtype: torch.dtype, -) -> torch.Tensor: - # Expand aspect ratio mask to target_length - batch_size, max_num_tiles = aspect_ratio_mask.shape - attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, - 1).to(dtype) - attention_mask = attention_mask.repeat(1, 1, target_length, 1) - - # Mask padding patches - pad_patches = target_length - num_patches - attention_mask[:, :, -pad_patches:] = 0 - - # Invert the mask (0 -> 1, 1 -> 0) - attention_mask = 1 - attention_mask - - # Reshape to 2D and create 4D attention mask - # (batch_size, 1, max_num_tiles*target_length, max_num_tiles*target_length) - attention_mask = attention_mask.reshape(batch_size, - max_num_tiles * target_length, 1) - attention_mask = attention_mask @ attention_mask.transpose( - -1, -2) * torch.finfo(dtype).min - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - -class ColumnParallelConv2dPatch(torch.nn.Module): - """Conv2D Patching layer with model parallelism. - Column parallel over unfolded input. - Arguments: - in_channels: Input channels. - out_channels: Output channels. - kernel_size: Size of convolution kernel. - stride (default 1): Stride for convolution. - bias (default False): Use bias in Conv2d. 
- Input: (bsz, in_channels, width, height) - Output: (bsz, num_tokens, out_channels) - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, tuple[int, int]], - stride: Union[int, tuple[int, int]], - bias: bool = False, - ) -> None: - super().__init__() - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride) - self._linear = ColumnParallelLinear( - in_channels * kernel_size[0] * kernel_size[1], - out_channels, - bias=bias, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self._unfold(x) - x = x.permute(0, 2, 1) - x, _ = self._linear(x) - return x - - -class MllamaPrecomputedAspectRatioEmbedding(nn.Module): - - def __init__(self, - config: config_mllama.MllamaVisionConfig, - is_gated: bool = True): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.is_gated = is_gated - - self.embedding = nn.Embedding(self.max_aspect_ratio_id + 1, - self.max_num_tiles * self.hidden_size) - if is_gated: - self.gate = nn.Parameter(torch.zeros(1)) - - def forward(self, hidden_state: torch.Tensor, - aspect_ratio_ids: torch.Tensor) -> torch.Tensor: - embeddings = self.embedding(aspect_ratio_ids) - embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, - self.hidden_size) - - if self.is_gated: - embeddings = embeddings * self.gate.tanh() - - hidden_state = hidden_state + embeddings - return hidden_state - - -class MllamaPrecomputedPositionEmbedding(nn.Module): - - def __init__(self, config: config_mllama.MllamaVisionConfig): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.num_patches = (config.image_size // config.patch_size)**2 + 1 - self.hidden_size = config.hidden_size - self.scale = config.hidden_size**-0.5 - - self.gate = nn.Parameter(torch.zeros(1)) - - # position embedding - position_embedding = torch.randn(self.num_patches, self.hidden_size) - self.embedding = nn.Parameter(self.scale * position_embedding) - - # tile position embedding - self.tile_embedding = nn.Embedding( - self.max_aspect_ratio_id + 1, - self.max_num_tiles * self.num_patches * self.hidden_size) - - def forward(self, hidden_state: torch.Tensor, - aspect_ratio_ids: torch.Tensor) -> torch.Tensor: - # position embeddings - gated_position_embedding = (1 - self.gate.tanh()) * self.embedding - hidden_state = hidden_state + gated_position_embedding.view( - 1, 1, self.num_patches, self.hidden_size) - - # precomputed tile position embeddings - tile_position_embedding = self.tile_embedding(aspect_ratio_ids) - batch_size = hidden_state.shape[0] - tile_position_embedding = tile_position_embedding.reshape( - batch_size, self.max_num_tiles, self.num_patches, self.hidden_size) - gated_tile_position_embedding = self.gate.tanh( - ) * tile_position_embedding - hidden_state = hidden_state + gated_tile_position_embedding - - return hidden_state - - -# TODO: support other attention backends for attention in vision model -class MllamaVisionSdpaAttention(nn.Module): - - def __init__(self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): - super().__init__() - - tensor_parallel_size = get_tp_group().world_size - self.embed_dim = config.hidden_size - self.num_heads = config.attention_heads - self.head_dim = config.hidden_size // 
config.attention_heads - self.num_local_heads = self.num_heads // tensor_parallel_size - self.q_size = self.num_local_heads * self.head_dim - self.kv_size = self.num_local_heads * self.head_dim - - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.num_heads, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.embed_dim, - bias=False, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim, - 1.0 / math.sqrt(self.head_dim)) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_state) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - # Use unified MultiHeadAttention with automatic backend selection - attn_output = self.attn(q, k, v) - - attn_output = attn_output.reshape(attn_output.shape[0], - attn_output.shape[1], -1) - output, _ = self.o_proj(attn_output) - return output - - -class MllamaVisionEncoderLayer(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - is_gated: bool = False, - ) -> None: - super().__init__() - - self.hidden_size = config.hidden_size - self.num_attention_heads = config.attention_heads - self.is_gated = is_gated - self.intermediate_size = config.intermediate_size - - self.self_attn = MllamaVisionSdpaAttention( - config, quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.mlp = CLIPMLP(config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - - self.input_layernorm = nn.LayerNorm(self.hidden_size, - eps=config.norm_eps) - self.post_attention_layernorm = nn.LayerNorm(self.hidden_size, - eps=config.norm_eps) - - # there used to be an if else here, no code path - if is_gated: - self.gate_attn = nn.Parameter(torch.ones(1) * math.pi / 4) - self.gate_ffn = nn.Parameter(torch.ones(1) * math.pi / 4) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - # Self Attention - residual = hidden_state - hidden_state = self.input_layernorm(hidden_state) - hidden_state = self.self_attn(hidden_state, - attention_mask=attention_mask) - gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() - hidden_state = residual + gate_attn * hidden_state - - # Feed forward - residual = hidden_state - hidden_state = self.post_attention_layernorm(hidden_state) - hidden_state = self.mlp(hidden_state) - gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() - hidden_state = residual + gate_ffn * hidden_state - - return hidden_state - - -class MllamaVisionEncoder(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - num_layers: int = 32, - is_gated: bool = False, - output_hidden_states=None, - prefix: str = "", - ) -> None: - super().__init__() - self.config = config - self.layers = nn.ModuleList([ - MllamaVisionEncoderLayer(config, - quant_config=quant_config, - is_gated=is_gated, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(num_layers) - ]) - self.output_hidden_states = output_hidden_states or [] - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ) 
-> Union[BaseModelOutput]: - encoder_states = () - - for i, encoder_layer in enumerate(self.layers): - if i in self.output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - hidden_states = encoder_layer( - hidden_states, - attention_mask, - ) - - if len(self.layers) - 1 in self.output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - - return hidden_states, encoder_states - - -class MllamaVisionModel(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: - super().__init__() - - self.image_size = config.image_size - self.patch_size = config.patch_size - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.in_channels = config.num_channels - self.intermediate_layers_indices = config.intermediate_layers_indices - - self.num_patches = (self.image_size // self.patch_size)**2 + 1 - self.scale = config.hidden_size**-0.5 - - self.patch_embedding = ColumnParallelConv2dPatch( - in_channels=config.num_channels, - out_channels=self.hidden_size, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=False, - ) - - self.class_embedding = nn.Parameter(self.scale * - torch.randn(self.hidden_size)) - self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( - config) - - self.pre_tile_positional_embedding = \ - MllamaPrecomputedAspectRatioEmbedding(config, is_gated=True) - self.post_tile_positional_embedding = \ - MllamaPrecomputedAspectRatioEmbedding(config, is_gated=True) - - # layer norms - self.layernorm_pre = nn.LayerNorm(self.hidden_size) - self.layernorm_post = nn.LayerNorm(self.hidden_size) - - # encoders - self.transformer = MllamaVisionEncoder( - config, - quant_config, - config.num_hidden_layers, - is_gated=False, - output_hidden_states=config.intermediate_layers_indices, - prefix=f"{prefix}.transformer", - ) - self.global_transformer = MllamaVisionEncoder( - config, - quant_config, - config.num_global_layers, - is_gated=True, - prefix=f"{prefix}.global_transformer", - ) - - def apply_class_embedding(self, - hidden_state: torch.Tensor) -> torch.Tensor: - batch_size, _, hidden_size = hidden_state.shape - class_embedding = self.class_embedding.expand(batch_size, 1, - hidden_size) - hidden_state = torch.cat([class_embedding, hidden_state], dim=1) - return hidden_state - - def forward(self, pixel_values: torch.Tensor, - aspect_ratio_ids: torch.Tensor, - aspect_ratio_mask: torch.Tensor) -> torch.Tensor: - batch_size, num_concurrent_media, num_tiles, num_channels, \ - height, width = pixel_values.shape - - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, - height, width) - aspect_ratio_ids = aspect_ratio_ids.reshape( - batch_size * num_concurrent_media, -1) - - # patch embedding - patch_embeds = self.patch_embedding( - pixel_values.to(self.layernorm_pre.weight.dtype)) - hidden_state = patch_embeds - hidden_state = ps.get_tp_group().all_gather(hidden_state) - - # tile embeddings - _, num_patches, dim = hidden_state.shape - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, -1, dim) - hidden_state = self.pre_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - - # apply cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim) - hidden_state = self.apply_class_embedding(hidden_state) - num_patches += 1 - - # apply position embeddings - hidden_state = 
hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, num_patches, dim) - hidden_state = self.gated_positional_embedding(hidden_state, - aspect_ratio_ids) - - # apply encoder - hidden_state = self.layernorm_pre(hidden_state) - - # Compute the number of tokens to pad - num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 - # Compute padding tuple for pad function - padding = ( - 0, 0, 0, num_padding_patches - ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) - # Pad the tensor - hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) - slice_index = -num_padding_patches if num_padding_patches > 0 else None - - attention_mask = aspect_ratio_mask.reshape( - batch_size * num_concurrent_media, -1) - attention_mask = _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask=attention_mask, - num_patches=self.num_patches, - target_length=hidden_state.shape[2], - dtype=self.layernorm_pre.weight.dtype, - ) - - hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, - dim) - output = self.transformer( - hidden_state, - attention_mask=attention_mask, - ) - hidden_state, intermediate_hidden_states = output[0], output[1] - intermediate_hidden_states = torch.stack(intermediate_hidden_states, - dim=-1) - - # apply global encoder - hidden_state = self.layernorm_post(hidden_state) - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = self.post_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles * (num_patches + num_padding_patches), dim) - hidden_state = self.global_transformer( - hidden_state, attention_mask=attention_mask)[0] - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = hidden_state[:, :, :slice_index] - - # adding intermediate layer outputs - hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, - num_tiles, num_patches, dim) - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size * num_concurrent_media, num_tiles, - num_patches + num_padding_patches, -1) - intermediate_hidden_states = intermediate_hidden_states[:, :, : - slice_index] - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, -1) - hidden_state = torch.cat([hidden_state, intermediate_hidden_states], - dim=-1) - return hidden_state - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() - for name, loaded_weight in weights: - if 'patch_embedding._linear.weight' in name: - loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - updated_params.add(name) - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - param = params_dict.pop(name) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - 
updated_params.add(name) - return updated_params - - -class MllamaTextRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """ - MllamaTextRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -class MllamaTextCrossAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - config: Optional[config_mllama.MllamaTextConfig] = None, - layer_idx: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.config = config - self.pipeline_parallel_rank = get_pp_group().rank_in_group - self.tensor_parallel_size = get_tp_group().world_size - self.num_heads = config.num_attention_heads - self.num_key_value_heads = config.num_key_value_heads - - self.num_local_heads = self.num_heads // self.tensor_parallel_size - self.num_local_key_value_heads = \ - self.num_key_value_heads // self.tensor_parallel_size - self.hidden_size = config.hidden_size - self.head_dim = config.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - - self.layer_idx = layer_idx - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.q_local_size = self.num_local_heads * self.head_dim - self.kv_local_size = self.num_local_key_value_heads * self.head_dim - - self.qkv_proj = QKVCrossParallelLinear( - self.hidden_size, - self.head_dim, - self.num_heads, - self.num_key_value_heads, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, - # use huggingface's instead - self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.scaling = self.head_dim**-0.5 - - self.attn = Attention( - self.num_local_heads, - self.head_dim, - self.scaling, - self.num_local_key_value_heads, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER_DECODER, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - cross_attention_states: Optional[torch.Tensor], - ) -> torch.Tensor: - q, k, v = self.qkv_proj(hidden_states, cross_attention_states) - if cross_attention_states is not None: - k = k.view(-1, self.num_local_key_value_heads, self.head_dim) - v = v.view(-1, self.num_local_key_value_heads, self.head_dim) - k = self.k_norm(k) - - q = q.view(-1, self.num_local_heads, self.head_dim) - q = self.q_norm(q) - - if attention_mask is not None: - output = self._attention_with_mask(q, k, v, attention_mask, - kv_range_for_decode) - else: - output = self.attn( - q.view(-1, self.num_local_heads * self.head_dim), k, v) - out, _ = self.o_proj(output) - return out - - def _attention_with_mask( 
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        attention_mask: torch.Tensor,
-        kv_range_for_decode: list[tuple[int, int]],
-    ) -> torch.Tensor:
-        kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank]
-        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
-        # Skip writing kv-cache for the initial profiling run.
-        # TODO (NickLucche) replace with custom attn bias and use standard attn
-        if len(kv_cache.shape) > 1:
-            i = torch.ones(1, dtype=torch.float32)
-            if self.attn.backend in (_Backend.FLASH_ATTN,
-                                     _Backend.FLASH_ATTN_VLLM_V1):
-                cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
-                cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
-                torch.ops._C_cache_ops.reshape_and_cache_flash(
-                    cached_k,
-                    cached_v,
-                    kv_cache[0],
-                    kv_cache[1],
-                    attn_metadata.
-                    cross_slot_mapping,  # type: ignore[union-attr]
-                    "auto",
-                    i,
-                    i,
-                )
-            elif self.attn.backend in (_Backend.XFORMERS, _Backend.ROCM_FLASH,
-                                       _Backend.TORCH_SDPA):
-                key_cache, value_cache = PagedAttention.split_kv_cache(
-                    kv_cache, self.num_local_key_value_heads, self.head_dim)
-                cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
-                cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
-                PagedAttention.write_to_paged_cache(
-                    cached_k, cached_v, key_cache, value_cache,
-                    attn_metadata.cross_slot_mapping, "auto", i, i)
-            else:
-                raise ValueError(
-                    f"Unsupported Attention backend {self.attn.backend} "
-                    "enum found. Expected the Attention backend to be "
-                    "FLASH_ATTN, FLASH_ATTN_VLLM_V1, "
-                    "XFORMERS or TORCH_SDPA.")
-
-        # We have to call torch's SDPA for prefill when using a custom
-        # cross-attention mask, because the mask is neither a standard
-        # causal mask nor a block-diagonal mask that could be optimized
-        # with xformers.BlockDiagonalMask.
-        # The mask is computed specially to support multiple and
-        # interleaved images.
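-        # Shape walkthrough (illustrative): q arrives as
-        # (q_len, num_local_heads, head_dim) and is regrouped to
-        # (num_local_kv_heads, num_kv_groups, q_len, head_dim), while k/v
-        # are broadcast across the group dim, so the single SDPA call below
-        # computes grouped-query attention under the dense custom mask.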
- q_len = q.shape[0] - kv_len = k.shape[0] - q = q.transpose(0, 1).view(self.num_local_key_value_heads, - self.num_key_value_groups, q_len, - self.head_dim).contiguous() - k = k.transpose(0, - 1)[:, - None, :, :].expand(self.num_local_key_value_heads, - self.num_key_value_groups, - kv_len, - self.head_dim).contiguous() - v = v.transpose(0, - 1)[:, - None, :, :].expand(self.num_local_key_value_heads, - self.num_key_value_groups, - kv_len, - self.head_dim).contiguous() - attention_mask = attention_mask.view(1, 1, q_len, kv_len) - output = F.scaled_dot_product_attention(q, - k, - v, - attn_mask=attention_mask, - is_causal=False) - output = output.permute(2, 0, 1, 3).reshape( - q_len, self.num_local_heads * self.head_dim) - return output - - -class MllamaCrossAttentionDecoderLayer(torch.nn.Module): - """Cross-attention transformer block with tanh-gated attention - and feedforward.""" - - def __init__( - self, - config: config_mllama.MllamaTextConfig, - layer_idx: int, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: - super().__init__() - - self.layer_idx = layer_idx - self.cross_attn = MllamaTextCrossAttention( - config=config, - layer_idx=layer_idx, - quant_config=quant_config, - prefix=f"{prefix}.cross_attn", - ) - - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.cross_attn_attn_gate = torch.nn.Parameter(torch.zeros(1)) - - self.mlp = LlamaMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.cross_attn_mlp_gate = torch.nn.Parameter(torch.zeros(1)) - - def forward( - self, - hidden_states: torch.Tensor, - cross_attention_states: torch.Tensor, - cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: torch.Tensor, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - hidden_states = self.cross_attn( - hidden_states=hidden_states, - attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - cross_attention_states=cross_attention_states, - ) - hidden_states = full_text_row_masked_out_mask * hidden_states - hidden_states = residual + self.cross_attn_attn_gate.tanh( - ) * hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = full_text_row_masked_out_mask * hidden_states - hidden_states = residual + self.cross_attn_mlp_gate.tanh( - ) * hidden_states - return hidden_states - - -class MllamaTextModel(nn.Module): - config_class = config_mllama.MllamaTextConfig - base_model_prefix = "model" - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config.text_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, - config.hidden_size) - self.cross_attention_layers = config.cross_attention_layers - - layers = [] - for layer_idx in range(config.num_hidden_layers): - if layer_idx in self.cross_attention_layers: - layers.append( - MllamaCrossAttentionDecoderLayer( - config, - layer_idx, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}", - 
)) - else: - # TODO: force LlamaDecoderLayer to config.attention_bias=False - layers.append( - LlamaDecoderLayer( - config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}", - )) - - self.layers = nn.ModuleList(layers) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - cross_attention_states: Optional[torch.LongTensor], - cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, - torch.Tensor]], - skip_cross_attention: bool, - ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) - hidden_states = inputs_embeds - - for idx, decoder_layer in enumerate(self.layers): - if idx in self.cross_attention_layers: - if not skip_cross_attention: - hidden_states = decoder_layer( - hidden_states=hidden_states, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask= - full_text_row_masked_out_mask, - ) - else: - hidden_states, residual = decoder_layer( - positions=positions, - hidden_states=hidden_states, - residual=None, - ) - hidden_states = hidden_states + residual - hidden_states = self.norm(hidden_states) - return hidden_states - - -class MllamaForCausalLM(nn.Module): - config_class = config_mllama.MllamaTextConfig - base_model_prefix = "language_model" - _no_split_modules = [ - "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config.text_config - quant_config = vllm_config.quant_config - self.quant_config = quant_config - - self.vocab_size = config.vocab_size - self.model = MllamaTextModel(vllm_config=vllm_config, - prefix=f"{prefix}.model") - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - quant_config=quant_config, - prefix=f"{prefix}.lm_head", - ) - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - cross_attention_states: Optional[torch.LongTensor], - cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, - torch.Tensor]], - skip_cross_attention: bool, - ) -> torch.Tensor: - hidden_states = self.model( - input_ids=input_ids, - positions=positions, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - skip_cross_attention=skip_cross_attention, - ) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() - for name, loaded_weight in weights: - if 'patch_embedding.weight' in name: - name = name.replace('patch_embedding.weight', - 'patch_embedding._linear.weight') - 
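-                # ColumnParallelConv2dPatch implements the patch embedding
-                # as Unfold + Linear, so the (out, in, kh, kw) conv kernel
-                # is flattened into an (out, in * kh * kw) linear weight.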
loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) - if (self.quant_config is not None and - (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else - loaded_weight[0]) - weight_loader(param, loaded_weight) - updated_params.add(scale_name) - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - updated_params.add(name) - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - orig_name = name - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - logger.debug("Missing name %s, orig name %s", name, - orig_name) - continue - - param = params_dict.pop(name) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - updated_params.add(name) - return updated_params - - -@MULTIMODAL_REGISTRY.register_processor(MllamaMultiModalProcessor, - info=MllamaProcessingInfo, - dummy_inputs=MllamaDummyInputsBuilder) -class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] - } - - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - # mapping for new names in checkpoint saved after transformers v4.52 - "model.vision_model.": "vision_model.", - "model.multi_modal_projector.": "multi_modal_projector.", - "model.language_model.": "language_model.model.", - "lm_head.": "language_model.lm_head.", - }, - orig_to_new_suffix={ - "patch_embedding.weight": "patch_embedding._linear.weight", - }, - ) - - @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - if modality.startswith("image"): - return "<|image|>" - - raise ValueError("Only image modality is supported") - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config: MllamaConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.vocab_size = config.text_config.vocab_size - self.hidden_size = config.text_config.hidden_size - self.max_num_tiles = config.vision_config.max_num_tiles - self.vision_output_dim = config.vision_config.vision_output_dim - self.pad_token_id = \ - config.pad_token_id if config.pad_token_id is not None else -1 - self.image_size = config.vision_config.image_size - self.image_token_id = config.image_token_index - - self.vision_model = MllamaVisionModel(config.vision_config, - quant_config, - prefix=maybe_prefix( - prefix, "vision_model")) - self.language_model = MllamaForCausalLM( - vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model"), - ) - self.multi_modal_projector = ColumnParallelLinear( - config.vision_config.vision_output_dim, - config.text_config.hidden_size, - bias=True, - quant_config=quant_config, - gather_output=True, - prefix=maybe_prefix(prefix, "multi_modal_projector"), - ) - self.logits_processor = LogitsProcessor(config.output_hidden_states, - config.text_config.vocab_size) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: 
- logits = self.logits_processor(self.language_model.lm_head, - hidden_states, sampling_metadata) - return logits - - def unpack_data(self, - image_data: Union[list[torch.Tensor], torch.Tensor], - padding_value=0) -> torch.Tensor: - if isinstance(image_data, torch.Tensor): - # torch.Tensor - return image_data - else: - assert isinstance( - image_data[0], - torch.Tensor), "Image data is not properly batched." - # list[torch.Tensor] - bsz = len(image_data) - max_length = max(t.size(0) for t in image_data) - trailing_dims = image_data[0].shape[1:] - for data in image_data: - cur_trailing_dims = data.shape[1:] - assert cur_trailing_dims == trailing_dims - output_tensor = torch.full((bsz, max_length, *trailing_dims), - padding_value, - dtype=image_data[0].dtype, - device=image_data[0].device) - for i, t in enumerate(image_data): - output_tensor[i, :t.size(0)] = t - return output_tensor - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[MllamaImagePixelInputs]: - # tensor with the same shape will be batched together by - # MultiModalKwargs.batch, so pixel_values here can be: - # - list[torch.Tensor]: - # with shape (num_image, num_tiles, 3, image_res, image_res) - # - torch.Tensor: - # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "aspect_ratio_mask", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - assert aspect_ratio_ids is not None - assert aspect_ratio_mask is not None - - return MllamaImagePixelInputs( - type="pixel_values", - data=self.unpack_data(pixel_values), - aspect_ratio_ids=self.unpack_data(aspect_ratio_ids), - aspect_ratio_mask=self.unpack_data(aspect_ratio_mask)) - - if image_embeds is not None: - raise NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _get_and_validate_encoder_lens( - self, - encoder_seq_lens: list[int], - num_tiles: list[list[int]], - num_tokens_per_tile: int, - ) -> list[int]: - # Get the actual number of encoder tokens for each sample. - # Because attn_metadata.encoder_seq_lens only counts the last - # group of images for each sample, which is used to cheat the - # block manager to allocate blocks for those images only. - # See MllamaMultiModalProcessor for more details. 
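-        # Illustration (assuming image_size=560, i.e. 1601 tokens per tile):
-        # num_tiles=[[4, 2]] yields an actual encoder length of
-        # (4 + 2) * 1601 = 9606 tokens, while attn_metadata.encoder_seq_lens
-        # only accounts for the last image group (2 * 1601 = 3202).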
- actual_encoder_seq_lens = [ - sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles - ] - - # remove 0 encoder len entries for text-only requests for these - # assertions - attn_metadata_lens = [x for x in encoder_seq_lens if x > 0] - assert len(actual_encoder_seq_lens) == len(attn_metadata_lens) - for actual_len, last_group_len in zip(actual_encoder_seq_lens, - attn_metadata_lens): - assert actual_len >= last_group_len - - return actual_encoder_seq_lens - - def flat_encoder_result(self, cross_attention_states: torch.Tensor, - attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int]): - - cross_attention_states_flat = torch.zeros( - sum(actual_encoder_seq_lens), - cross_attention_states.shape[-1], - device=cross_attention_states.device, - dtype=cross_attention_states.dtype) - start_pos = 0 - for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, - cross_attention_states): - end_pos = start_pos + seq_len - cross_attention_states_flat[ - start_pos:end_pos] = vision_token_in_batch[:seq_len] - start_pos = end_pos - cross_attention_states = cross_attention_states_flat - return cross_attention_states - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_cross_attention_states( - self, - image_inputs: MllamaImagePixelInputs, - attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int], - ) -> tuple[torch.Tensor]: - # NOTE: llama's reference implementation runs vision model on CPU - pixel_values = image_inputs['data'] - aspect_ratio_ids = image_inputs['aspect_ratio_ids'] - aspect_ratio_mask = image_inputs['aspect_ratio_mask'] - cross_attention_states = self.vision_model(pixel_values, - aspect_ratio_ids, - aspect_ratio_mask) - cross_attention_states, _ = self.multi_modal_projector( - cross_attention_states) - - bsz, _, _, _, image_token_dim = tuple(cross_attention_states.shape) - cross_attention_states = cross_attention_states.view( - bsz, -1, image_token_dim) - - cross_attention_states = self.flat_encoder_result( - cross_attention_states, attn_metadata, actual_encoder_seq_lens) - - return cross_attention_states - - def get_cross_attention_mask( - self, - input_ids: torch.Tensor, - attn_metadata: AttentionMetadata, - num_tiles: list[list[int]], - num_tokens_per_tile: int, - dtype: torch.dtype, - ) -> tuple[torch.Tensor, torch.Tensor]: - token_ids = input_ids.tolist() - start = 0 - batch_token_ids = [] - for seq_len in attn_metadata.seq_lens: - batch_token_ids.append(token_ids[start:start + seq_len]) - start += seq_len - sparse_mask = [ - get_cross_attention_token_mask(t, self.image_token_id) - for t in batch_token_ids - ] - - # Skip generating cross-attention mask if all samples - # are text-only or have only 1 leading image. 
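-        # In those cases every decoder token may attend to the full encoder
-        # sequence, so the dense mask would be all-ones and can be elided.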
- if skip_attention_mask(sparse_mask): - return None, None - - dense_mask, tile_range_for_decode = \ - convert_sparse_cross_attention_mask_to_dense( - sparse_mask, num_tiles, attn_metadata.seq_lens) - cross_attention_mask = \ - convert_dense_cross_attention_mask_to_tensor( - dense_mask, num_tokens_per_tile, input_ids.device, dtype) - kv_range_for_decode = [[ - t[0] * num_tokens_per_tile, t[1] * num_tokens_per_tile - ] for t in tile_range_for_decode] - - return cross_attention_mask, kv_range_for_decode - - def get_full_text_row_masked_out_mask( - self, - attn_metadata: AttentionMetadata, - device: torch.device, - ) -> torch.Tensor: - full_text_row_masked_out_mask = torch.ones( - (attn_metadata.num_prefill_tokens, 1), dtype=torch.bool) - start_pos = 0 - for seq_len, encoder_seq_len in zip(attn_metadata.seq_lens, - attn_metadata.encoder_seq_lens): - if encoder_seq_len == 0: - full_text_row_masked_out_mask[start_pos:start_pos + - seq_len] = False - start_pos += seq_len - full_text_row_masked_out_mask = full_text_row_masked_out_mask.to( - device) - return full_text_row_masked_out_mask - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - **kwargs: object, - ) -> Union[CausalLMOutputWithPast]: - attn_metadata = get_forward_context().attn_metadata - if attn_metadata.num_prefill_tokens > 0 and \ - attn_metadata.num_decode_tokens > 0: - raise ValueError("Chunk prefill not supported") - image_inputs = self._parse_and_validate_image_input(**kwargs) - cross_attention_states = None - cross_attention_mask = None - kv_range_for_decode = None - - # For 1) text-only prefill and decode, 2) image-present decode. - if image_inputs is None: - full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor - != 0).reshape(-1, 1).to(input_ids.device) - skip_cross_attention = attn_metadata.max_encoder_seq_len == 0 - - # For image-present prefill. - else: - skip_cross_attention = False - - num_tiles = [t.tolist() for t in kwargs.pop("num_tiles")] - num_tokens_per_tile = calc_token_per_chunk(self.image_size) - - actual_encoder_seq_lens = self._get_and_validate_encoder_lens( - attn_metadata.encoder_seq_lens, - num_tiles, - num_tokens_per_tile, - ) - - cross_attention_states = self.get_cross_attention_states( - image_inputs, attn_metadata, actual_encoder_seq_lens) - - full_text_row_masked_out_mask = \ - self.get_full_text_row_masked_out_mask( - attn_metadata, input_ids.device) - - cross_attention_mask, kv_range_for_decode = \ - self.get_cross_attention_mask( - input_ids, attn_metadata, num_tiles, - num_tokens_per_tile, cross_attention_states.dtype) - - outputs = self.language_model( - input_ids=input_ids, - positions=positions, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - skip_cross_attention=skip_cross_attention, - ) - - return outputs - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="language_model", - connector="multi_modal_projector", - tower_model="vision_model") - - -def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: - for mask in sparse_mask: - # Skip text-only samples. 
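-        # (An entry of [0, -1] below means the image is attended from
-        # token 0 through the end of the sequence.)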
-        if len(mask) == 0:
-            continue
-        # If the sample contains more than one image,
-        # we can't skip the mask.
-        if len(mask) != 1:
-            return False
-        # If the sample contains only one image
-        # but it is not the leading one,
-        # we can't skip the mask.
-        if mask[0][0] != 0 or mask[0][1] != -1:
-            return False
-    return True
-
-
-def convert_sparse_cross_attention_mask_to_dense(
-    sparse_mask: list[list[list[int]]],
-    num_tiles: list[list[int]],
-    lengths: list[int],
-) -> tuple[np.ndarray, list[tuple[int, int]]]:
-    total_length = sum(lengths)
-    total_tiles = sum([sum(tiles) for tiles in num_tiles])
-    dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64)
-    # A list of ranges, range[i] = [start, end] means that the i-th image will
-    # use tiles[start, end] for cross-attention decoding.
-    tile_range_for_decode = []
-
-    seq_start = 0
-    tile_start = 0
-
-    # sparse_mask has an [] entry for each sequence without images,
-    # but num_tiles does not include such entries...
-    num_tiles_idx = 0
-    for masks, length in zip(sparse_mask, lengths):
-        if len(masks) == 0:
-            # Text-only sample.
-            continue
-
-        tiles = num_tiles[num_tiles_idx]
-        num_tiles_idx += 1
-        ts, td = -1, 0
-        for mask, tile in zip(masks, tiles):
-            if len(mask) != 2:
-                continue
-            start, end = mask
-            end = min(end, length)
-            if end == -1:
-                end = length
-            if end == length:
-                if ts == -1:
-                    ts = tile_start
-                td += tile
-            dense_mask[seq_start + start:seq_start + end,
-                       tile_start:tile_start + tile] = 1
-            tile_start += tile
-        assert ts != -1
-        assert td != 0
-        tile_range_for_decode.append((ts, ts + td))
-        seq_start += length
-    assert num_tiles_idx == len(num_tiles)
-
-    return dense_mask, tile_range_for_decode
-
-
-def convert_dense_cross_attention_mask_to_tensor(
-    cross_attention_token_mask: np.ndarray,
-    num_tokens_per_tile: int,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    mask = torch.tensor(cross_attention_token_mask, dtype=dtype, device=device)
-    mask = mask.repeat_interleave(num_tokens_per_tile, dim=1)
-
-    mask = 1.0 - mask
-    mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(dtype).min)
-
-    ninf = torch.finfo(dtype).min
-    full_text_mask = ((mask != ninf).any(dim=-1).type_as(mask)[..., None])
-    mask *= full_text_mask
-    # (num_prompt_tokens, num_encoder_tokens)
-    return mask
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c6ea4c2050577..6bb65ed6debc6 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -147,10 +147,6 @@ _TEXT_GENERATION_MODELS = {
     "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
     "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
     "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
-    # [Encoder-decoder]
-    "BartModel": ("bart", "BartForConditionalGeneration"),
-    "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
-    "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"),
 }
 
 _EMBEDDING_MODELS = {
@@ -237,6 +233,7 @@ _MULTIMODAL_MODELS = {
     "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
     "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
+    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
     "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 @@ -263,16 +260,12 @@ _MULTIMODAL_MODELS = { "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), + "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 # [Encoder-decoder] - "DonutForConditionalGeneration": ("donut", "DonutForConditionalGeneration"), - "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 - "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 - "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501 - "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ffc69a2db60a4..bad6c0c3d9db2 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -209,7 +209,7 @@ class MultiModalProfiler(Generic[_I]): if processor.pad_dummy_encoder_prompt: num_tokens_to_pad = max(total_len, seq_len) - total_len encoder_prompt_token_ids.extend([0] * num_tokens_to_pad) - # NOTE: Whisper and Donut allows total_len > seq_len. + # NOTE: Whisper allows total_len > seq_len. 
elif total_len > seq_len and not envs.VLLM_USE_V1: # `max_num_batched_tokens` is defined by `SchedulerConfig` logger.warning_once( diff --git a/vllm/test_utils.py b/vllm/test_utils.py index 23679b8228d6f..91dcc2fd84e17 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -36,7 +36,6 @@ MODELS_ON_S3 = [ "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", # "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-1B", "meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Meta-Llama-3-8B", diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index d09c5fa924fb0..3a97f2c056181 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -35,7 +35,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja", "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", - "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", "minicpmv": _get_minicpmv_chat_template_fallback, "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fd19d33ca0c89..cafc43f6b7673 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -90,11 +90,6 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { "internvl_chat": { "has_no_defaults_at_init": True }, - # transformers regards mllama as is_encoder_decoder=False - # vllm needs is_encoder_decoder=True to enable cross-attention - "mllama": { - "is_encoder_decoder": True - }, "NVLM_D": { "has_no_defaults_at_init": True }, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index f3fad15b750ad..327b4e2705485 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -498,7 +498,7 @@ class Processor: assert isinstance(mm_processor, EncDecMultiModalProcessor) if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper and Donut + return # Skip encoder length check for Whisper if model_config.is_multimodal_model: suggestion = ( diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py deleted file mode 100644 index 12fd25f4de2ad..0000000000000 --- a/vllm/worker/enc_dec_model_runner.py +++ /dev/null @@ -1,553 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import itertools -from typing import Any, Dict, List, Optional, Tuple, Type, cast - -import torch -import torch.distributed - -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadata) -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.attention.selector import (get_env_variable_attn_backend, - get_global_forced_attn_backend) -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, - MultiModalRegistry) -from vllm.platforms import _Backend -from 
vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) -from vllm.worker.utils import assert_enc_dec_mr_supported_scenario - -logger = init_logger(__name__) -LORA_WARMUP_RANK = 8 - - -@dataclasses.dataclass(frozen=True) -class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): - """ - Used by the EncoderDecoderModelRunner. - """ - encoder_input_tokens: Optional[torch.Tensor] = None - encoder_input_positions: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "encoder_input_tokens": self.encoder_input_tokens, - "encoder_input_positions": self.encoder_input_positions, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "EncoderDecoderModelInput": - return cast( - EncoderDecoderModelInput, - super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) - - -class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - _model_input_cls: Type[EncoderDecoderModelInput] = ( - EncoderDecoderModelInput) - _builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - ''' - EncoderDecoderModelRunner constructor. - - `lora_config` is unused (since these features are not yet supported - for encoder/decoder models) but these arguments are present here for - compatibility with the base-class constructor. - ''' - self._maybe_force_supported_attention_backend() - - super().__init__( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - input_registry=input_registry, - mm_registry=mm_registry, - ) - - # Crash for unsupported encoder/scenarios - assert_enc_dec_mr_supported_scenario(self) - - def _maybe_force_supported_attention_backend(self): - ''' - Force vLLM to use the XFormers attention backend, - which is currently the only supported option. 
- ''' - - def raise_backend_err(): - # The user has specified an attention backend override - # which is invalid for encoder/decoder models - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_BACKEND) - - maybe_env_var_forced_backend = get_env_variable_attn_backend() - maybe_global_forced_backend = get_global_forced_attn_backend() - is_forced_by_global = maybe_global_forced_backend is not None - is_forced_by_env_var = maybe_env_var_forced_backend is not None - if is_forced_by_global: # noqa: SIM102 - # Backend override enforced by global variable takes - # precedence over vLLM backend environment variable. - if maybe_global_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - elif is_forced_by_env_var: # noqa: SIM102 - # Backend override enforced by vLLM backend - # environment variable - if maybe_env_var_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - - def _list_to_int32_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.int32, device=self.device) - - def _list_to_long_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.long, device=self.device) - - def _empty_int32_tensor(self) -> torch.Tensor: - return self._list_to_int32_tensor([]) - - def _empty_long_tensor(self) -> torch.Tensor: - return self._list_to_long_tensor([]) - - @torch.inference_mode() - def execute_model( - self, - model_input: EncoderDecoderModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in " - "EncoderDecoderModelRunner") - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - if (model_input.attn_metadata is not None - and model_input.attn_metadata.prefill_metadata is None - and model_input.attn_metadata.decode_metadata.use_cuda_graph): - if model_input.inputs_embeds is None: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - else: - model_executable = self.model - - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - encoder_input_ids=model_input.encoder_input_tokens, - encoder_positions=model_input.encoder_input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - ) - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if not self.is_driver_worker: - return [] - - if 
model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput: - return EncoderDecoderModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> EncoderDecoderModelInput: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - Since chunked prefill is not supported for encoder/decoder models, - `input_tokens` is assumed to be either entirely prefill tokens or - entirely decode tokens. - - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - ( - attn_metadata, - encoder_input_tokens_tensor, - encoder_input_positions_tensor, - ) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list, - model_input)) - # Inject attn_metadata encoder/cross-attention fields & - # encoder input tokens/positions into model_input. - # Frozen dataclass fields cannot be modified, so use - # dataclasses.replace to construct a new model input - # instance. - model_input = dataclasses.replace( - model_input, - attn_metadata=attn_metadata, - encoder_input_tokens=encoder_input_tokens_tensor, - encoder_input_positions=encoder_input_positions_tensor, - ) - - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - self.pin_memory, - generators=generators) - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. 
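# A minimal standalone sketch of the profiling split described in the comment
# above (illustrative only, toy numbers assumed): the token budget is divided
# as evenly as possible across sequences, and the first
# (max_num_batched_tokens % max_num_seqs) groups receive one extra token, so
# the lengths sum exactly to the budget.
max_num_batched_tokens, max_num_seqs = 10, 4
seq_lens = [
    max_num_batched_tokens // max_num_seqs
    + (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]
assert seq_lens == [3, 3, 2, 2]
assert sum(seq_lens) == max_num_batched_tokens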
- seqs: List[SequenceGroupMetadata] = [] - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - logger.info("Starting profile run for multi-modal models.") - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - decoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=False) - encoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=True) - - # Having more tokens is over-conservative but otherwise fine - assert len( - decoder_dummy_data.seq_data.prompt_token_ids - ) >= seq_len, ( - f"Expected at least {seq_len} dummy tokens for profiling, " - f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}" - ) - - assert decoder_dummy_data.multi_modal_data is None or \ - encoder_dummy_data.multi_modal_data is None, ( - "Multi-modal data can't be provided in both encoder and decoder" - ) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: decoder_dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - encoder_seq_data=encoder_dummy_data.seq_data, - cross_block_table=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=decoder_dummy_data.multi_modal_data - or encoder_dummy_data.multi_modal_data, - multi_modal_placeholders=decoder_dummy_data. - multi_modal_placeholders - or encoder_dummy_data.multi_modal_placeholders) - seqs.append(seq) - - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - self.execute_model(model_input, None, intermediate_tensors) - torch.cuda.synchronize() - return - - def _prepare_encoder_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - model_input: EncoderDecoderModelInput, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """Helper method to prepare the encoder- and cross-attn-related - model inputs based on a given sequence group. These additional inputs - are used to augment an already-computed `EncoderDecoderModelInput` - data structure which already has decoder-related model inputs - populated. - - Sets the following attn_metadata fields: - * `num_encoder_tokens` - * `encoder_seq_lens` - * `encoder_seq_lens_tensor` - * `max_encoder_seq_len` - * `cross_slot_mapping` - * `cross_block_tables` - - Constructs a new model inputs data structure, based on - (1) the existing fields in the `model_inputs` argument, - and (2) the following additional fields which are - computed (or in the case of `attn_metadata`, updated) - by this function: - * attn_metadata - * encoder_input_tokens - * encoder_input_positions - - Arguments: - - * seq_group_metadata_list: list of sequence groups for which to - compute inputs - * model_inputs: model inputs data structure with decoder-oriented - fields already computed. 
- - Return: - - * Updated model inputs data structure - """ - - if len(seq_group_metadata_list) == 0: - return (model_input.attn_metadata, None, None) - - # Since we are not supporting chunked prefill either the entire - # batch is prefill or it is decode - is_prompt = seq_group_metadata_list[0].is_prompt - - # Build encoder inputs - encoder_seq_lens: List[int] = [] - if is_prompt: - # Prefill phase. - cross_block_tables = self._empty_int32_tensor().view( - len(seq_group_metadata_list), -1) - - # Extract input tokens/positions, cross-attention slot-mapping, - # & seq len from each sequence group metadata - ( - encoder_input_tokens, - encoder_input_positions, - cross_slot_mapping, - ) = ( - [], - [], - [], - ) - for seq_group_metadata in seq_group_metadata_list: - # Build seq lens - seq_len = seq_group_metadata.encoder_seq_data.get_len() - token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() - encoder_seq_lens.append(seq_len) - - # Build slot mapping - is_profile_run = (seq_group_metadata.block_tables is None) - if is_profile_run: - # During memory profiling, the block tables are not - # initialized yet. In this case, we just use a dummy - # slot mapping. - # In embeddings, the block tables are {seq_id: None}. - cross_slot_mapping.extend([PAD_SLOT_ID] * seq_len) - else: - for i in range(0, seq_len): - block_number = seq_group_metadata.cross_block_table[ - i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - cross_slot_mapping.append(slot) - - # Build encoder input tokens - encoder_input_tokens.extend(token_ids) - encoder_input_positions.extend(list(range(0, seq_len))) - - # Convert tokens/positions & cross-attention - # slot-mapping to encoder input tensors - encoder_input_tokens_tensor = self._list_to_long_tensor( - encoder_input_tokens) - encoder_input_positions_tensor = self._list_to_long_tensor( - encoder_input_positions) - cross_slot_mapping_tensor = self._list_to_long_tensor( - cross_slot_mapping) - - else: - # Decode phase. - encoder_input_tokens_tensor = self._empty_long_tensor() - encoder_input_positions_tensor = self._empty_long_tensor() - cross_slot_mapping_tensor = self._empty_long_tensor() - # Extract cross-attention block tables & - # seq len from each sequence group metadata. - # Cross-attention block tables are empty - # during vLLM memory profiling. - cross_block_tables = [] - for seq_group_metadata in seq_group_metadata_list: - for _ in range(len(seq_group_metadata.seq_data)): - encoder_seq_lens.append( - seq_group_metadata.encoder_seq_data.get_len()) - cross_block_table = seq_group_metadata.cross_block_table - cross_block_tables.append([] if ( - cross_block_table is None) else cross_block_table) - - if (model_input.attn_metadata is not None - and model_input.attn_metadata.use_cuda_graph): - # We will be using CUDA graph replay for this decode. - max_len_of_block_table = self.get_max_block_per_batch() - batch_size = len(encoder_seq_lens) - graph_batch_size = self.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - cuda_graph_pad_size = graph_batch_size - batch_size - # extend the cross_block_tables and encoder_seq_lens to match - # the graph_batch_size. 
- cross_block_tables.extend([[] - for _ in range(cuda_graph_pad_size) - ]) - encoder_seq_lens.extend( - itertools.repeat(1, cuda_graph_pad_size)) - - else: - max_len_of_block_table = max( - len(block_table) for block_table in cross_block_tables) - - cross_block_tables = make_tensor_with_pad( - cross_block_tables, - max_len=max_len_of_block_table, - pad=0, - dtype=torch.int32, - device=self.device, - ) - - # Compute encoder sequence lengths & encoder - # sequence starting offset tensors - max_encoder_seq_len = max(encoder_seq_lens, default=0) - encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + - 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - # Update attention metadata with encoder-oriented attributes - attn_metadata = model_input.attn_metadata - assert attn_metadata is not None - ( - attn_metadata.num_encoder_tokens, - attn_metadata.encoder_seq_lens, - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.cross_slot_mapping, - attn_metadata.cross_block_tables, - ) = ( - sum(encoder_seq_lens), - encoder_seq_lens, - encoder_seq_lens_tensor, - max_encoder_seq_len, - encoder_seq_start_loc, - cross_slot_mapping_tensor, - cross_block_tables, - ) - - return (attn_metadata, encoder_input_tokens_tensor, - encoder_input_positions_tensor) diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py deleted file mode 100644 index 512a1dca73701..0000000000000 --- a/vllm/worker/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -''' -Worker-related helper functions. -''' - -from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS -from vllm.worker.model_runner import GPUModelRunnerBase - - -def assert_enc_dec_mr_supported_scenario( - enc_dec_mr: GPUModelRunnerBase) -> None: - ''' - Asserted that the provided encoder/decoder model runner instance reflects - a supported scenario. 
- ''' - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - - if enc_dec_mr.cache_config.enable_prefix_caching: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE']) - - if enc_dec_mr.sliding_window is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SWA']) - - if enc_dec_mr.scheduler_config.chunked_prefill_enabled: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ - 'STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL']) - - if getattr(enc_dec_mr.model_config.hf_config, 'attn_logit_softcapping', - None) is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP'] - ) - - if enc_dec_mr.lora_config is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LORA']) - - if enc_dec_mr.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP']) - - if enc_dec_mr.scheduler_config.num_lookahead_slots > 0: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 670f256c0bf65..12047bc390737 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -28,7 +28,6 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, memory_profiling) from vllm.worker.cache_engine import CacheEngine -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -82,10 +81,7 @@ class Worker(LocalOrDistributedWorkerBase): "qwen3_next_mtp")) \ else {"return_hidden_states": True} - ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if self.model_config.is_encoder_decoder: - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: GPUModelRunnerBase = ModelRunnerClass( + self.model_runner: GPUModelRunnerBase = ModelRunner( vllm_config=self.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, From 17871983a2efde49d93ee8692f17da490da47e95 Mon Sep 17 00:00:00 2001 From: cascade <cascade812@outlook.com> Date: Mon, 15 Sep 2025 21:32:32 -0700 Subject: [PATCH 307/584] [Bugfix] Fix sequence parallelism bug when enable pipeline parallelism (#24021) Signed-off-by: cascade812 <cascade812@outlook.com> --- tests/distributed/test_sequence_parallel.py | 3 +- vllm/distributed/parallel_state.py | 51 +++++++++++++++-- vllm/v1/worker/cpu_worker.py | 18 +++++- vllm/v1/worker/gpu_model_runner.py | 63 +++++++++++---------- vllm/v1/worker/gpu_worker.py | 15 ++++- vllm/v1/worker/utils.py | 27 ++++++++- 6 files changed, 135 insertions(+), 42 deletions(-) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 65c5e68968440..ded3d834faf00 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -235,7 +235,6 @@ def _compare_sp( 'level': 3, 'custom_ops': ["+rms_norm"], 'compile_sizes': [4, 8], - 'splitting_ops': [], 'pass_config': { 'enable_sequence_parallelism': True, 'enable_fusion': enable_fusion, @@ -251,6 +250,8 @@ def _compare_sp( *common_args, "--tensor-parallel-size", str(tp_size), + "--pipeline-parallel-size", + str(pp_size), 
"--distributed-executor-backend", distributed_backend, "--compilation_config", diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ef229299b6848..12571afaa4c13 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -663,14 +663,29 @@ class GroupCoordinator: tensor_dict: dict[str, Union[torch.Tensor, Any]], dst: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: Optional[dict[str, bool]] = None, ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Send the input tensor dictionary. NOTE: `dst` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. """ # Bypass the function if we are using only 1 GPU. if not torch.distributed.is_initialized() or self.world_size == 1: return tensor_dict - all_gather_size = (1 if all_gather_group is None else all_gather_group.world_size) all_gather_rank = (0 if all_gather_group is None else @@ -699,14 +714,23 @@ class GroupCoordinator: # `send_object_list` has serialization & deserialization, # all happening on CPU. Therefore, we can use the CPU group. self.send_object(metadata_list, dst=dst) - for tensor in tensor_list: + + tensor_keys = [ + k for k, v in tensor_dict.items() if isinstance(v, torch.Tensor) + ] + assert len(tensor_keys) == len(tensor_list) + + for key, tensor in zip(tensor_keys, tensor_list): if tensor.numel() == 0: # Skip sending empty tensors. continue # send-allgather: send only a slice, then do allgather. - if (all_gather_group is not None - and tensor.numel() % all_gather_size == 0): + use_all_gather = (all_gather_group is not None + and tensor.numel() % all_gather_size == 0) + use_all_gather = all_gather_tensors.get(key, use_all_gather) \ + if all_gather_tensors else use_all_gather + if use_all_gather: tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank] if tensor.is_cpu: @@ -725,14 +749,29 @@ class GroupCoordinator: self, src: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: Optional[dict[str, bool]] = None, ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Recv the input tensor dictionary. NOTE: `src` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. 
By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. """ # Bypass the function if we are using only 1 GPU. if not torch.distributed.is_initialized() or self.world_size == 1: return None - all_gather_size = (1 if all_gather_group is None else all_gather_group.world_size) all_gather_rank = (0 if all_gather_group is None else @@ -766,6 +805,8 @@ class GroupCoordinator: # send-allgather: send only a slice, then do allgather. use_all_gather = (all_gather_group is not None and tensor.numel() % all_gather_size == 0) + use_all_gather = all_gather_tensors.get(key, use_all_gather) \ + if all_gather_tensors else use_all_gather if use_all_gather: orig_shape = tensor.shape diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index b87c4fe09bb90..daee91ec404fe 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -19,6 +19,7 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import (Worker, init_worker_distributed_environment) +from vllm.v1.worker.utils import is_residual_scattered_for_sp logger = init_logger(__name__) @@ -107,18 +108,29 @@ class CPUWorker(Worker): scheduler_output: "SchedulerOutput", ) -> Optional[ModelRunnerOutput]: intermediate_tensors = None + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + num_input_tokens = self.model_runner._get_num_input_tokens( + num_scheduled_tokens) + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp(self.vllm_config, + num_input_tokens) + } if not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors)) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) if not get_pp_group().is_last_rank: assert isinstance(output, IntermediateTensors) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) + get_pp_group().send_tensor_dict( + output.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) return None assert isinstance(output, ModelRunnerOutput) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d4afaf51e6e86..d4d1f814afc0c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -88,6 +88,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.utils import is_residual_scattered_for_sp from .utils import (AttentionGroup, MultiModalBudget, add_kv_sharing_layers_to_kv_cache_groups, bind_kv_cache, @@ -1633,21 +1634,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert self.intermediate_tensors is not None tp = self.vllm_config.parallel_config.tensor_parallel_size - enabled_sp = self.compilation_config.pass_config. 
\ - enable_sequence_parallelism - if enabled_sp: - # When sequence parallelism is enabled, we always pad num_tokens - # to be a multiple of tensor_parallel_size (tp) earlier - assert num_tokens % tp == 0 - is_residual_scattered = tp > 1 and enabled_sp \ - and num_tokens % tp == 0 + is_rs = is_residual_scattered_for_sp(self.vllm_config, num_tokens) # When sequence parallelism is enabled, the "residual" tensor is sharded # across tensor parallel ranks, so each rank only needs its own slice. if sync_self: assert intermediate_tensors is not None for k, v in intermediate_tensors.items(): - is_scattered = k == "residual" and is_residual_scattered + is_scattered = k == "residual" and is_rs copy_len = num_tokens // tp if is_scattered else \ num_tokens self.intermediate_tensors[k][:copy_len].copy_( @@ -1655,8 +1649,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return IntermediateTensors({ k: - v[:num_tokens // tp] - if k == "residual" and is_residual_scattered else v[:num_tokens] + v[:num_tokens // + tp] if k == "residual" and is_rs else v[:num_tokens] for k, v in self.intermediate_tensors.items() }) @@ -1741,6 +1735,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): pooler_output=pooler_output, ) + def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH + and hasattr(self, "cudagraph_batch_sizes") + and self.cudagraph_batch_sizes + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use CUDA graphs. + # Add padding to the batch size. + return self.vllm_config.pad_for_cudagraph(num_scheduled_tokens) + + # Eager mode. + # Pad tokens to a multiple of tensor_parallel_size when + # collective fusion for SP is enabled. + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + if (self.compilation_config.pass_config.enable_sequence_parallelism + and tp_size > 1): + return round_up(num_scheduled_tokens, tp_size) + return num_scheduled_tokens + def _preprocess( self, scheduler_output: "SchedulerOutput", @@ -1750,24 +1763,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Optional[IntermediateTensors], dict[str, Any]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH - and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): - # Use CUDA graphs. - # Add padding to the batch size. - num_input_tokens = self.vllm_config.pad_for_cudagraph( - num_scheduled_tokens) - else: - # Eager mode. - # Pad tokens to multiple of tensor_parallel_size when - # enabled collective fusion for SP - tp_size = self.vllm_config.parallel_config.tensor_parallel_size - if self.compilation_config.pass_config. 
\ - enable_sequence_parallelism and tp_size > 1: - num_input_tokens = round_up(num_scheduled_tokens, tp_size) - else: - num_input_tokens = num_scheduled_tokens - + num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens) # Padding for DP num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) num_input_tokens += num_pad @@ -2108,8 +2104,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert not self.is_pooling_model if not get_pp_group().is_last_rank: + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp( + self.vllm_config, num_input_tokens) + } get_pp_group().send_tensor_dict( - hidden_states.tensors, all_gather_group=get_tp_group()) + hidden_states.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) logits = None else: sample_hidden_states = hidden_states[logits_indices] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 37dd431fd68f8..6855526583f04 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -32,6 +32,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput) from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -428,10 +429,19 @@ class Worker(WorkerBase): ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]: intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + num_input_tokens = self.model_runner._get_num_input_tokens( + num_scheduled_tokens) + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp(self.vllm_config, + num_input_tokens) + } if forward_pass and not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors)) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) @@ -444,7 +454,8 @@ class Worker(WorkerBase): "external_launcher") and not get_pp_group().is_last_rank get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) kv_connector_output = output.kv_connector_output if not kv_connector_output: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index be05d02ff29fe..5ac7470c1ac90 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import ModelConfig, SchedulerConfig +from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -288,3 +288,28 @@ def bind_kv_cache( for layer_name, kv_cache in kv_caches.items(): # NOTE: Use list because of v0 PP virtual engine. 
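# A minimal standalone sketch of the send/all-gather optimization documented
# in GroupCoordinator.send_tensor_dict / recv_tensor_dict earlier in this
# patch (illustrative only; process groups and the actual all-gather call are
# simulated with plain tensor ops): each rank sends a 1/world_size slice of a
# replicated tensor, and the receiver reconstructs it by gathering and
# concatenating the slices. A tensor that is not fully replicated, such as
# the scattered "residual" under sequence parallelism, must opt out via the
# all_gather_tensors override.
import torch

world_size = 4
t = torch.arange(16.0)  # assume this tensor is replicated across the TP group
slices = [t.reshape(world_size, -1)[rank] for rank in range(world_size)]
reconstructed = torch.cat(slices)  # receiver-side all-gather equivalent
assert torch.equal(reconstructed, t)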
forward_context[layer_name].kv_cache = [kv_cache] + + +def is_residual_scattered_for_sp(vllm_config: VllmConfig, + num_input_tokens: int) -> bool: + """Check if the residual tensor is scattered for sequence parallelism. + + The residual tensor is scattered across tensor parallel ranks when sequence + parallelism and tensor parallelism are enabled, and the number of + input tokens is one of the compilation sizes. + """ + if not vllm_config.compilation_config.pass_config.\ + enable_sequence_parallelism: + return False + + tp = vllm_config.parallel_config.tensor_parallel_size + + if tp == 1: + return False + + # When sequence parallelism is enabled, we always pad num_input_tokens + # to be a multiple of tensor_parallel_size (tp) earlier. + assert num_input_tokens % tp == 0 + + # Currently, SP is only enabled for static size fx graphs. + return (num_input_tokens in vllm_config.compilation_config.compile_sizes) From 8c546102658f97b10d13bcf25193b65edc6ea6ff Mon Sep 17 00:00:00 2001 From: vllmellm <vllm.ellm@embeddedllm.com> Date: Tue, 16 Sep 2025 12:45:38 +0800 Subject: [PATCH 308/584] [Bug] [Spec Dec]: Fix kv_cache dtype mismatch for Eagle3 drafter on FP8 target (#24505) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> --- vllm/model_executor/models/llama_eagle3.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index bceb6cc42768e..99b77729b5018 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -9,7 +9,7 @@ import torch.nn as nn from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear @@ -33,10 +33,14 @@ class LlamaDecoderLayer(LlamaDecoderLayer): def __init__( self, config: LlamaConfig, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: - super().__init__(config, quant_config=quant_config, prefix=prefix) + super().__init__(config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix) # override qkv self.self_attn.qkv_proj = QKVParallelLinear( @@ -114,6 +118,8 @@ class LlamaModel(nn.Module): speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + current_vllm_config = get_current_vllm_config() + self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, self.config.hidden_size, @@ -123,6 +129,7 @@ class LlamaModel(nn.Module): self.layers = nn.ModuleList([ LlamaDecoderLayer( config=self.config, + cache_config=current_vllm_config.cache_config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) From 238c4c170596e78c0f9b2edea8aa1449e12fc679 Mon Sep 17 00:00:00 2001 From: "Saman A. 
Pour" <samanamp@outlook.com> Date: Mon, 15 Sep 2025 22:06:03 -0700 Subject: [PATCH 309/584] [QWEN NEXT] Fused MoE kernels Optimization configs (#24924) Signed-off-by: Saman Keon <samanamp@outlook.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 146 +++++++++++++++++ ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 146 +++++++++++++++++ 3 files changed, 439 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..22e3d09676d06 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} + diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8bac7af0c2dac --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..5910027e17f9b --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} From 04ad0dc275ca38f2890b7dd21a9fa75311128aa2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 16 Sep 2025 14:10:54 +0800 Subject: [PATCH 310/584] [benchmark] Add triton version in the moe tuned config (#24769) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- benchmarks/kernels/benchmark_moe.py | 2 +- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 ++++- vllm/triton_utils/importing.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 94f3f1ae11f27..837b2b0c10447 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -560,7 +560,7 @@ def save_configs( filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: - json.dump(configs, f, indent=4) + 
json.dump({"triton_version": triton.__version__, **configs}, f, indent=4) f.write("\n") diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 06edfb0552e84..30e46ffa7b176 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -720,7 +720,10 @@ def get_moe_configs( logger.info("Using configuration from %s for MoE layer.", config_file_path) # If a configuration has been found, return it - return {int(key): val for key, val in json.load(f).items()} + tuned_config = json.load(f) + # Delete triton_version from tuned_config + tuned_config.pop("triton_version", None) + return {int(key): val for key, val in tuned_config.items()} # If no optimized configuration is available, we will use the default # configuration diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 372200027bf95..2a06a9b7d11e4 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -68,7 +68,7 @@ class TritonPlaceholder(types.ModuleType): def __init__(self): super().__init__("triton") - self.__version__ = "3.3.0" + self.__version__ = "3.4.0" self.jit = self._dummy_decorator("jit") self.autotune = self._dummy_decorator("autotune") self.heuristics = self._dummy_decorator("heuristics") From 68dbde5dbb11b9250454d0c9f21a8b3da960b341 Mon Sep 17 00:00:00 2001 From: Cheng Kuan Yong Jason <jasoncky96@gmail.com> Date: Tue, 16 Sep 2025 15:16:32 +0800 Subject: [PATCH 311/584] [Bugfix] remove duplicate tokens streamed in required tool choice streaming (#23312) Signed-off-by: Jason Cheng <jasoncky96@gmail.com> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> --- vllm/entrypoints/openai/serving_chat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 579f6f537ee2d..6c9c1ae85f570 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -828,9 +828,6 @@ class OpenAIServingChat(OpenAIServing): history_tool_call_cnt += 1 tools_streamed[i] = True - # update the previous values for the next iteration - previous_texts[i] = current_text - # handle streaming deltas for tools with "auto" tool choice # and reasoning parser elif tool_choice_auto and self.reasoning_parser: From 27fcfe7bcfbec033b12a40c2e82342ea2b25bb43 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:51:01 +0300 Subject: [PATCH 312/584] [Mamba] Support TP>1 with quantization for mamba2 mixer in case `n_groups % tp_size == 0` (#24593) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/mamba/mamba_mixer2.py | 201 ++++++++++-------- 1 file changed, 118 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 1623a2fd562c7..23e19da430e14 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -19,6 +19,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.linear import 
(ColumnParallelLinear, + MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata, @@ -261,12 +262,14 @@ class MambaMixer2(MambaBase, CustomOp): ), "Tensor parallel world size must divide num heads." assert (n_groups % self.tp_size) == 0 or n_groups == 1, ( - "If tensor parallel world size does not divide num_heads, " + "If tensor parallel world size does not divide num_groups, " "then num_groups must equal 1.") - assert ( - self.tp_size == 1 or quant_config is None - ), "Tensor parallel currently not supported for quantized models." + assert (n_groups % self.tp_size == 0) or self.tp_size == 1 or \ + quant_config is None, ( + "Tensor parallel is currently supported for quantized models only " + "if tensor parallel world size divides num groups." + ) self.ssm_state_size = ssm_state_size self.conv_kernel_size = conv_kernel_size @@ -285,94 +288,84 @@ class MambaMixer2(MambaBase, CustomOp): n_groups, self.tp_size) self.n_groups = n_groups + groups - self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size - self.conv1d = ColumnParallelLinear( - input_size=conv_kernel_size, - output_size=self.conv_dim, - bias=use_conv_bias, - quant_config=None, - prefix=f"{prefix}.conv1d", - ) - # unsqueeze to fit conv1d weights shape into the linear weights shape. - # Can't do this in `weight_loader` since it already exists in - # `ColumnParallelLinear` and `set_weight_attrs` - # doesn't allow to override it - self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + self.groups_ssm_state_size = self.n_groups * self.ssm_state_size + self.conv_dim = intermediate_size + 2 * self.groups_ssm_state_size - self.in_proj = ColumnParallelLinear( - input_size=hidden_size, - output_size=intermediate_size + self.conv_dim + self.num_heads, - bias=use_bias, - quant_config=quant_config, - prefix=f"{prefix}.in_proj", - ) + if n_groups % self.tp_size == 0: + self.conv1d = MergedColumnParallelLinear( + input_size=conv_kernel_size, + output_sizes=[ + intermediate_size, + self.groups_ssm_state_size, + self.groups_ssm_state_size, + ], + bias=use_conv_bias, + quant_config=None, + prefix=f"{prefix}.conv1d", + ) - # - because in_proj is a concatenation of 3 weights, we - # need to interleave them before sharding - # - use the custom weight loader mamba_v2_sharded_weight_loader - # for conv1d.bias, covn1d.weight and in_proj.weight - # - need to set these settings, to assign the groups to the head shards - group_shard_settings = ( - self.n_groups * self.ssm_state_size, # expected model size - (self.n_groups - n_groups) * - self.ssm_state_size, # extra dims assigned - n_groups == 1, # if there was only one group - ) - intermediate_settings = (intermediate_size, 0, False) - head_settings = (self.num_heads, 0, False) + self.in_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[ + intermediate_size, + intermediate_size, + self.groups_ssm_state_size, + self.groups_ssm_state_size, + self.num_heads, + ], + bias=use_bias, + quant_config=quant_config, + prefix=f"{prefix}.in_proj", + ) + else: + # This is the n_groups == 1 case, + # where we need to duplicate groups if TP>1. 
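# A minimal standalone sanity check for the merged projection sizes
# introduced above (illustrative only; toy hyperparameters assumed, not taken
# from any real model config): conv1d concatenates the x, B and C
# projections, so the declared output_sizes must sum to conv_dim, and each
# piece must divide evenly by tp_size for the n_groups % tp_size == 0 branch
# to shard it.
intermediate_size, n_groups, ssm_state_size, tp_size = 1024, 8, 16, 4
groups_ssm_state_size = n_groups * ssm_state_size
conv_dim = intermediate_size + 2 * groups_ssm_state_size
output_sizes = [intermediate_size, groups_ssm_state_size, groups_ssm_state_size]
assert sum(output_sizes) == conv_dim
assert all(size % tp_size == 0 for size in output_sizes)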
- # - the weight already has a "weight_loader" attribute - # which set_weight_attrs will raise if we do not - # delete before trying to override it - # - ditto for the other two weights below - delattr(self.conv1d.bias, "weight_loader") - set_weight_attrs( - self.conv1d.bias, - { - "weight_loader": - mamba_v2_sharded_weight_loader( - [ - intermediate_settings, - group_shard_settings, - group_shard_settings, - ], - self.tp_size, - tp_rank, - ) - }, - ) + self.conv1d = ColumnParallelLinear( + input_size=conv_kernel_size, + output_size=self.conv_dim, + bias=use_conv_bias, + quant_config=None, + prefix=f"{prefix}.conv1d", + ) - delattr(self.conv1d.weight, "weight_loader") - set_weight_attrs( - self.conv1d.weight, - { - "weight_loader": - mamba_v2_sharded_weight_loader( - [ - intermediate_settings, - group_shard_settings, - group_shard_settings, - ], - self.tp_size, - tp_rank, - ) - }, - ) + self.in_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size + self.conv_dim + self.num_heads, + bias=use_bias, + quant_config=quant_config, + prefix=f"{prefix}.in_proj", + ) - if quant_config is None: - # - quant layers do not have a weight loader - delattr(self.in_proj.weight, "weight_loader") + # - because in_proj is a concatenation of 3 weights, we + # need to interleave them before sharding + # - use the custom weight loader mamba_v2_sharded_weight_loader + # for conv1d.bias, covn1d.weight and in_proj.weight + # - need to set these settings, to assign the groups + # to the head shards + group_shard_settings = ( + self.groups_ssm_state_size, # expected model size + (self.n_groups - n_groups) * + self.ssm_state_size, # extra dims assigned + n_groups == 1, # if there was only one group + ) + intermediate_settings = (intermediate_size, 0, False) + head_settings = (self.num_heads, 0, False) + + # - the weight already has a "weight_loader" attribute + # which set_weight_attrs will raise if we do not + # delete before trying to override it + # - ditto for the other two weights below + delattr(self.conv1d.bias, "weight_loader") set_weight_attrs( - self.in_proj.weight, + self.conv1d.bias, { "weight_loader": mamba_v2_sharded_weight_loader( [ - intermediate_settings, # for gate intermediate_settings, group_shard_settings, group_shard_settings, - head_settings, # for dt ], self.tp_size, tp_rank, @@ -380,6 +373,50 @@ class MambaMixer2(MambaBase, CustomOp): }, ) + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, + { + "weight_loader": + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }, + ) + + if quant_config is None: + # - quant layers do not have a weight loader + delattr(self.in_proj.weight, "weight_loader") + set_weight_attrs( + self.in_proj.weight, + { + "weight_loader": + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, # for gate + intermediate_settings, + group_shard_settings, + group_shard_settings, + head_settings, # for dt + ], + self.tp_size, + tp_rank, + ) + }, + ) + + # unsqueeze to fit conv1d weights shape into the linear weights shape. 
+ # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `MergedColumnParallelLinear`, + # and `set_weight_attrs` doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + # - these are TPed by heads to reduce the size of the # temporal shape self.A = nn.Parameter( @@ -498,8 +535,6 @@ class MambaMixer2(MambaBase, CustomOp): chunk_indices_p = mamba2_metadata.chunk_indices chunk_offsets_p = mamba2_metadata.chunk_offsets - groups_time_state_size = self.n_groups * self.ssm_state_size - # 1. Gated MLP's linear projection projected_states, _ = self.in_proj(hidden_states) @@ -524,8 +559,8 @@ class MambaMixer2(MambaBase, CustomOp): hidden_states_B_C, [ self.intermediate_size // self.tp_size, - groups_time_state_size // self.tp_size, - groups_time_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, ], dim=-1, ) From 7ea5c73ad794290b1be12e41227b89941c05752c Mon Sep 17 00:00:00 2001 From: Chen Bruce <cszwwdz@vip.qq.com> Date: Tue, 16 Sep 2025 18:55:16 +0800 Subject: [PATCH 313/584] [Feat][EPLB] A novel static EPLB placement strategy for MoE models. (#23745) Signed-off-by: bruceszchen <bruceszchen@tencent.com> Signed-off-by: Chen Bruce <bruceszchen@tencent.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Chen Bruce <cszwwdz@vip.qq.com> Co-authored-by: lemon412 <lemon412@foxmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/distributed/test_expert_placement.py | 194 ++++++++++++++++++ vllm/config/parallel.py | 10 + vllm/engine/arg_utils.py | 7 + vllm/model_executor/layers/fused_moe/layer.py | 66 ++++-- 4 files changed, 265 insertions(+), 12 deletions(-) create mode 100644 tests/distributed/test_expert_placement.py diff --git a/tests/distributed/test_expert_placement.py b/tests/distributed/test_expert_placement.py new file mode 100644 index 0000000000000..a3b1b3193deb0 --- /dev/null +++ b/tests/distributed/test_expert_placement.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.model_executor.layers.fused_moe.layer import determine_expert_map + + +def verify_round_robin_pattern(expert_map, ep_rank, ep_size, + global_num_experts): + """Verify that the expert map follows the round_robin pattern.""" + # Calculate expected local experts (supporting non-divisible cases) + base_experts = global_num_experts // ep_size + remainder = global_num_experts % ep_size + + if ep_rank < remainder: + local_num_experts = base_experts + 1 + else: + local_num_experts = base_experts + + # Expected expert IDs for this rank in round_robin pattern + # For non-divisible cases, ranks with extra experts start earlier + expected_expert_ids = [] + for expert_idx in range(local_num_experts): + global_expert_id = ep_rank + expert_idx * ep_size + expected_expert_ids.append(global_expert_id) + + # Check that only expected experts are mapped to this rank + for global_expert_id in range(global_num_experts): + if global_expert_id in expected_expert_ids: + local_expert_id = expert_map[global_expert_id] + expected_local_id = expected_expert_ids.index(global_expert_id) + assert ( + local_expert_id == expected_local_id + ), f"Global expert {global_expert_id} should map to local expert " \ + f"{expected_local_id}, got {local_expert_id}" + else: + assert ( + expert_map[global_expert_id] == -1 + ), f"Global expert 
{global_expert_id} should not be mapped to " \ + f"this rank" + + # Verify that all local expert IDs are consecutive starting from 0 + local_expert_ids = [ + expert_map[global_id] for global_id in expected_expert_ids + ] + expected_local_ids = list(range(local_num_experts)) + assert ( + local_expert_ids == expected_local_ids + ), f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}" + + +@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"]) +@pytest.mark.parametrize("world_size", [2, 4]) +def test_expert_placement_various_sizes(expert_placement_strategy, world_size): + """Test round_robin expert placement with various expert counts.""" + + # Test with different global_num_experts values + # Include both divisible and non-divisible cases + if world_size == 2: + test_cases = [ + (4, 2), # 4 experts (divisible) + (8, 2), # 8 experts (divisible) + (9, 2), # 9 experts (non-divisible) + (16, 2), # 16 experts (divisible) + (17, 2), # 17 experts (non-divisible) + ] + elif world_size == 4: + test_cases = [ + (8, 4), # 8 experts (divisible) + (16, 4), # 16 experts (divisible) + (18, 4), # 18 experts (non-divisible) + (32, 4), # 32 experts (divisible) + (33, 4), # 33 experts (non-divisible) + ] + else: + test_cases = [] + + for test_global_experts, test_ep_size in test_cases: + # Ensure ep_size matches world_size + assert (test_ep_size == world_size + ), f"ep_size {test_ep_size} must equal world_size {world_size}" + + # Test each rank + for ep_rank in range(world_size): + # Calculate expected local experts + base_experts = test_global_experts // test_ep_size + remainder = test_global_experts % test_ep_size + if ep_rank < remainder: + expected_test_local = base_experts + 1 + else: + expected_test_local = base_experts + + test_local_experts, test_expert_map = determine_expert_map( + ep_size=test_ep_size, + ep_rank=ep_rank, + global_num_experts=test_global_experts, + expert_placement_strategy=expert_placement_strategy, + ) + + assert ( + test_local_experts == expected_test_local + ), f"For {test_global_experts} experts on {test_ep_size} ranks, " \ + f"rank {ep_rank}: expected {expected_test_local} local" \ + f"experts, got {test_local_experts}" + + if test_expert_map is not None: + assert test_expert_map.shape == ( + test_global_experts, + ), f"Expected expert map shape ({test_global_experts},), " \ + f"got {test_expert_map.shape}" + + # Verify round_robin pattern for this test case + verify_round_robin_pattern(test_expert_map, ep_rank, + test_ep_size, test_global_experts) + + +@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"]) +@pytest.mark.parametrize("world_size", [2, 4]) +def test_expert_placement_edge_cases(expert_placement_strategy, world_size): + """Test edge cases for round_robin expert placement.""" + + # Test case 1: ep_size = 1 (should return None for expert_map) + local_num_experts, expert_map = determine_expert_map( + ep_size=1, + ep_rank=0, + global_num_experts=8, + expert_placement_strategy=expert_placement_strategy, + ) + assert local_num_experts == 8, "For ep_size=1, should get all experts" + assert expert_map is None, "For ep_size=1, expert_map should be None" + + # Test case 2: ep_size = 0 (should raise assertion) + with pytest.raises(AssertionError): + determine_expert_map( + ep_size=0, + ep_rank=0, + global_num_experts=8, + expert_placement_strategy=expert_placement_strategy, + ) + + +def test_determine_expert_map_comprehensive(): + """Test of determine_expert_map function with various configurations.""" + + # Test cases: 
(ep_size, ep_rank, global_num_experts, + # expert_placement_strategy, expected_local, expected_map_pattern) + test_cases = [ + # Round robin placement tests + (2, 0, 8, "round_robin", 4, [0, -1, 1, -1, 2, -1, 3, + -1]), # rank 0 gets even experts + (2, 1, 8, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1, + 3]), # rank 1 gets odd experts + (2, 0, 9, "round_robin", 5, [0, -1, 1, -1, 2, -1, 3, -1, 4 + ]), # rank 0 gets 5 experts (even + last) + (2, 1, 9, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1, 3, + -1]), # rank 1 gets 4 experts (odd) + + # 4-rank tests + (4, 0, 8, "round_robin", 2, [0, -1, -1, -1, 1, -1, -1, + -1]), # rank 0 gets experts 0, 4 + (4, 1, 8, "round_robin", 2, [-1, 0, -1, -1, -1, 1, -1, + -1]), # rank 1 gets experts 1, 5 + (4, 2, 8, "round_robin", 2, [-1, -1, 0, -1, -1, -1, 1, + -1]), # rank 2 gets experts 2, 6 + (4, 3, 8, "round_robin", 2, [-1, -1, -1, 0, -1, -1, -1, + 1]), # rank 3 gets experts 3, 7 + ] + + for ep_size, ep_rank, global_num_experts, expert_placement_strategy, \ + expected_local, expected_map_pattern in test_cases: + local_num_experts, expert_map = determine_expert_map( + ep_size=ep_size, + ep_rank=ep_rank, + global_num_experts=global_num_experts, + expert_placement_strategy=expert_placement_strategy, + ) + + assert local_num_experts == expected_local, \ + f"ep_size={ep_size}, ep_rank={ep_rank}, " \ + f"global_num_experts={global_num_experts}, " \ + f"expert_placement_strategy={expert_placement_strategy}: " \ + f"expected {expected_local} local experts, got {local_num_experts}" + + if expected_map_pattern is None: + assert expert_map is None, "Expected expert_map to be None" + else: + assert expert_map is not None, "Expected expert_map to not be None" + actual_map = expert_map.tolist() + assert actual_map == expected_map_pattern, \ + f"ep_size={ep_size}, ep_rank={ep_rank}, " \ + f"global_num_experts={global_num_experts}, " \ + f"expert_placement_strategy={expert_placement_strategy}: " \ + f"expected map {expected_map_pattern}, got {actual_map}" diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 2f8ad5c6b6b04..231406bf60524 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -29,6 +29,7 @@ else: logger = init_logger(__name__) +ExpertPlacementStrategy = Literal["linear", "round_robin"] DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] @@ -102,6 +103,15 @@ class ParallelConfig: """Enable expert parallelism load balancing for MoE layers.""" eplb_config: EPLBConfig = field(default_factory=EPLBConfig) """Expert parallelism configuration.""" + expert_placement_strategy: ExpertPlacementStrategy = "linear" + """The expert placement strategy for MoE layers:\n + - "linear": Experts are placed in a contiguous manner. For example, with 4 + experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have + experts [2, 3].\n + - "round_robin": Experts are placed in a round-robin manner. For example, + with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 + will have experts [1, 3]. This strategy can help improve load balancing + for grouped expert models with no redundant experts.""" num_redundant_experts: Optional[int] = None """`num_redundant_experts` is deprecated and has been replaced with `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. 
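For intuition, here is a minimal sketch (plain Python, not the vLLM implementation) of which global expert ids each rank owns under the two strategies documented above; `determine_expert_map` in `fused_moe/layer.py` builds the equivalent global-to-local mapping as a torch tensor:

    def owned_experts(ep_size: int, ep_rank: int,
                      global_num_experts: int, strategy: str) -> list[int]:
        base, rem = divmod(global_num_experts, ep_size)
        if strategy == "linear":
            # Ranks with index < remainder absorb one extra expert each.
            local = base + (1 if ep_rank < rem else 0)
            start = ep_rank * base + min(ep_rank, rem)
            return list(range(start, start + local))
        # round_robin: global expert i lives on rank i % ep_size.
        return list(range(ep_rank, global_num_experts, ep_size))

    assert owned_experts(2, 0, 8, "linear") == [0, 1, 2, 3]
    assert owned_experts(2, 0, 8, "round_robin") == [0, 2, 4, 6]
    assert owned_experts(2, 1, 9, "round_robin") == [1, 3, 5, 7]

These values agree with the expectations encoded in the new test file above.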
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 595d318fbaafe..20d998d613d47 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -34,6 +34,7 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, SpeculativeConfig, TaskOption, TokenizerMode, VllmConfig, get_attr_docs) from vllm.config.multimodal import MMCacheType, MultiModalConfig +from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.utils import get_field from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform @@ -328,6 +329,8 @@ class EngineArgs: enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb + expert_placement_strategy: ExpertPlacementStrategy = \ + ParallelConfig.expert_placement_strategy num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -696,6 +699,9 @@ class EngineArgs: **parallel_kwargs["enable_eplb"]) parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--expert-placement-strategy", + **parallel_kwargs["expert_placement_strategy"]) parallel_group.add_argument( "--num-redundant-experts", type=int, @@ -1335,6 +1341,7 @@ class EngineArgs: enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, + expert_placement_strategy=self.expert_placement_strategy, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a90a71159f721..c62897c91816e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -4,7 +4,7 @@ from abc import abstractmethod from collections.abc import Iterable from enum import Enum -from typing import Callable, Literal, Optional, Union, overload +from typing import Callable, Literal, Optional, Union, get_args, overload import torch import torch.nn.functional as F @@ -12,6 +12,7 @@ from torch.nn.parameter import UninitializedParameter import vllm.envs as envs from vllm.config import get_current_vllm_config +from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import (get_dp_group, get_ep_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -675,8 +676,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def determine_expert_map( - ep_size: int, ep_rank: int, - global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]: + ep_size: int, + ep_rank: int, + global_num_experts: int, + expert_placement_strategy: ExpertPlacementStrategy = "linear", +) -> tuple[int, Optional[torch.Tensor]]: """ Calculates how many experts should be assigned to each rank for EP and creates a mapping from global to local expert index. Experts are @@ -684,8 +688,11 @@ def determine_expert_map( last rank. Args: - ep_size (int): The size of the expert parallel group - global_num_experts (int): The total number of experts in the model. 
+ ep_size: The size of the expert parallel group + ep_rank: The rank of the current process in the expert parallel + group + global_num_experts: The total number of experts in the model. + expert_placement_strategy: The expert placement strategy. Returns: tuple[int, Optional[torch.Tensor]]: A tuple containing: @@ -711,9 +718,23 @@ def determine_expert_map( # Create a tensor of size num_experts filled with -1 expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32) # Create an expert map for the local experts - start_idx = ep_rank * base_experts + min(ep_rank, remainder) - expert_map[start_idx:start_idx + local_num_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32) + if expert_placement_strategy == "linear": + start_idx = ep_rank * base_experts + min(ep_rank, remainder) + expert_map[start_idx:start_idx + local_num_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32) + elif expert_placement_strategy == "round_robin": + local_log_experts = torch.arange(ep_rank, + global_num_experts, + ep_size, + dtype=torch.int32) + + expert_map[local_log_experts] = torch.arange(0, + local_num_experts, + dtype=torch.int32) + else: + raise ValueError("Unsupported expert placement strategy " + f"'{expert_placement_strategy}', expected one of " + f"{get_args(ExpertPlacementStrategy)}") return (local_num_experts, expert_map) @@ -846,15 +867,36 @@ class FusedMoE(CustomOp): else: assert num_redundant_experts == 0, \ "Redundant experts are only supported with EPLB." + + expert_placement_strategy = ( + vllm_config.parallel_config.expert_placement_strategy) + if expert_placement_strategy == "round_robin": + # TODO(Bruce): will support round robin expert placement with + # EPLB enabled in the future. + round_robin_supported = ((num_expert_group is not None + and num_expert_group > 1) + and num_redundant_experts == 0 + and not self.enable_eplb) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement.") + expert_placement_strategy = "linear" + self.local_num_experts, self.expert_map = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts) + global_num_experts=self.global_num_experts, + expert_placement_strategy=expert_placement_strategy, + ) logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Local/global" + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" " number of experts: %s/%s. 
Experts local to global index map:" - " %s.", self.ep_rank, self.ep_size, self.local_num_experts, - self.global_num_experts, + " %s.", self.ep_rank, self.ep_size, expert_placement_strategy, + self.local_num_experts, self.global_num_experts, get_compressed_expert_map(self.expert_map)) else: self.local_num_experts, self.expert_map = (self.global_num_experts, From 0faf3cc3e84a83bac56492d297f2c9909fec7de9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:51:35 +0100 Subject: [PATCH 314/584] Move `SpeculativeConfig` from `config/__init__.py` to `config/speculative.py` (#24904) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 525 +---------------------------------- vllm/config/speculative.py | 554 +++++++++++++++++++++++++++++++++++++ 2 files changed, 556 insertions(+), 523 deletions(-) create mode 100644 vllm/config/speculative.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0847fba878aa2..6bb0fef23719a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,7 +24,7 @@ from pydantic import (ConfigDict, SkipValidation, field_validator, model_validator) from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from typing_extensions import Self, assert_never, runtime_checkable +from typing_extensions import assert_never, runtime_checkable import vllm.envs as envs from vllm import version @@ -41,6 +41,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy +from vllm.config.speculative import SpeculativeConfig from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -1846,528 +1847,6 @@ class DeviceConfig: self.device = torch.device(self.device_type) -SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp", - "ernie_mtp", "qwen3_next_mtp"] - - -@config -@dataclass -class SpeculativeConfig: - """Configuration for speculative decoding.""" - - # General speculative decoding control - num_speculative_tokens: SkipValidation[int] = None # type: ignore - """The number of speculative tokens, if provided. It will default to the - number in the draft model config if present, otherwise, it is required.""" - model: Optional[str] = None - """The name of the draft model, eagle head, or additional weights, if - provided.""" - method: Optional[SpeculativeMethod] = None - """The name of the speculative method to use. If users provide and set the - `model` param, the speculative method type will be detected automatically - if possible, if `model` param is not provided, the method name must be - provided. - - If using `ngram` method, the related configuration `prompt_lookup_max` and - `prompt_lookup_min` should be considered.""" - draft_tensor_parallel_size: Optional[int] = None - """The degree of the tensor parallelism for the draft model. Can only be 1 - or the same as the target model's tensor parallel size.""" - disable_logprobs: bool = True - """If set to True, token log probabilities are not returned during - speculative decoding. 
If set to False, token log probabilities are returned - according to the log probability settings in SamplingParams.""" - - # Draft model configuration - quantization: Optional[me_quant.QuantizationMethods] = None - """Quantization method that was used to quantize the draft model weights. - If `None`, we assume the model weights are not quantized. Note that it only - takes effect when using the draft model-based speculative method.""" - max_model_len: Optional[int] = None - """The maximum model length of the draft model. Used when testing the - ability to skip speculation for some sequences.""" - revision: Optional[str] = None - """The specific model version to use for the draft model. It can be a - branch name, a tag name, or a commit id. If unspecified, will use the - default version.""" - code_revision: Optional[str] = None - """The specific revision to use for the draft model code on Hugging Face - Hub. It can be a branch name, a tag name, or a commit id. If unspecified, - will use the default version.""" - - # Advanced control - disable_by_batch_size: Optional[int] = None - """Disable speculative decoding for new incoming requests when the number - of enqueued requests is larger than this value, if provided.""" - - # Ngram proposer configuration - prompt_lookup_max: Optional[int] = None - """Maximum size of ngram token window when using Ngram proposer, required - when method is set to ngram.""" - prompt_lookup_min: Optional[int] = None - """Minimum size of ngram token window when using Ngram proposer, if - provided. Defaults to 1.""" - - speculative_token_tree: Optional[str] = None - """Specifies the tree structure for speculative token generation. - """ - # required configuration params passed from engine - target_model_config: SkipValidation[ModelConfig] = None # type: ignore - """The configuration of the target model.""" - target_parallel_config: SkipValidation[ - ParallelConfig] = None # type: ignore - """The parallel configuration for the target model.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore - """Whether vLLM is configured to use chunked prefill or not. Used for - raising an error since it's not yet compatible with speculative decode.""" - disable_log_stats: SkipValidation[bool] = None # type: ignore - """Whether to disable the periodic printing of stage times in speculative - decoding.""" - - # params generated in the post-init stage - draft_model_config: SkipValidation[ModelConfig] = None # type: ignore - """The configuration of the draft model initialized internal.""" - draft_parallel_config: SkipValidation[ - ParallelConfig] = None # type: ignore - """The parallel configuration for the draft model initialized internal.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - # Eagle3 affects the computation graph because it returns intermediate - # hidden states in addition to the final hidden state. 
- factors.append(self.method == "eagle3") - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - @staticmethod - def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: - if hf_config.model_type == "deepseek_v3": - hf_config.model_type = "deepseek_mtp" - if hf_config.model_type == "deepseek_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["DeepSeekMTPModel"] - }) - - if hf_config.architectures[0] == "MiMoForCausalLM": - hf_config.model_type = "mimo_mtp" - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "num_hidden_layers": 0, - "n_predict": n_predict, - "architectures": ["MiMoMTPModel"] - }) - - if hf_config.architectures[0] == "Glm4MoeForCausalLM": - hf_config.model_type = "glm4_moe_mtp" - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "num_hidden_layers": 0, - "n_predict": n_predict, - "architectures": ["Glm4MoeMTPModel"] - }) - - if hf_config.model_type == "ernie4_5_moe": - hf_config.model_type = "ernie_mtp" - if hf_config.model_type == "ernie_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["ErnieMTPModel"] - }) - - if hf_config.model_type == "qwen3_next": - hf_config.model_type = "qwen3_next_mtp" - if hf_config.model_type == "qwen3_next_mtp": - n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["Qwen3NextMTP"] - }) - - return hf_config - - def __post_init__(self): - - # Note: "method" is a new parameter that helps to extend the - # configuration of non-model-based proposers, and the "model" parameter - # will be used to set the draft model, eagle head, or additional weight - # when needed. If users do not specify "method", the speculative method - # will be detected automatically if possible. If the speculative method - # can not be detected, it will be considered as the "draft_model" by - # default. - - if self.model is None and self.num_speculative_tokens is not None: - # TODO(Shangming): Refactor mtp configuration logic when supporting - # mtp acceleration for more models besides deepseek_v3 - if self.target_model_config and \ - (self.target_model_config.hf_text_config.model_type \ - == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type in - ("mimo","ernie4_5_moe", "qwen3_next")): - # use the draft model from the same model: - self.model = self.target_model_config.model - # Align the quantization of draft model for cases such as - # --quantization fp8 with a bf16 checkpoint. - if not self.quantization: - self.quantization = self.target_model_config.quantization - elif self.method in ("ngram", "[ngram]"): - self.model = "ngram" - else: - raise ValueError("num_speculative_tokens was provided without " - "speculative model.") - - # Automatically configure the method for ngram when "model" is used - # instead of "method" - if self.method is None and (self.model is not None - and self.model in ("ngram", "[ngram]")): - self.method = "ngram" - - if self.method in ("ngram", "[ngram]"): - # Unified to "ngram" internally - self.method = "ngram" - # Set default values if not provided - if (self.prompt_lookup_min is None - and self.prompt_lookup_max is None): - # TODO(woosuk): Tune these values. They are arbitrarily chosen. 
- self.prompt_lookup_min = 5 - self.prompt_lookup_max = 5 - elif self.prompt_lookup_min is None: - assert self.prompt_lookup_max is not None - self.prompt_lookup_min = self.prompt_lookup_max - elif self.prompt_lookup_max is None: - assert self.prompt_lookup_min is not None - self.prompt_lookup_max = self.prompt_lookup_min - - # Validate values - if self.prompt_lookup_min < 1: - raise ValueError( - f"prompt_lookup_min={self.prompt_lookup_min} must be > 0") - if self.prompt_lookup_max < 1: - raise ValueError( - f"prompt_lookup_max={self.prompt_lookup_max} must be > 0") - if self.prompt_lookup_min > self.prompt_lookup_max: - raise ValueError( - f"prompt_lookup_min={self.prompt_lookup_min} must " - f"be <= prompt_lookup_max={self.prompt_lookup_max}") - - # TODO: current we still need extract vocab_size from target model - # config, in future, we may try refactor it out, and set - # draft related config as None here. - self.draft_model_config = self.target_model_config - self.draft_parallel_config = self.target_parallel_config - else: - self.prompt_lookup_max = 0 - self.prompt_lookup_min = 0 - - if self.model is not None: - self.draft_model_config = ModelConfig( - model=self.model, - runner="draft", - tokenizer=self.target_model_config.tokenizer, - tokenizer_mode=self.target_model_config.tokenizer_mode, - trust_remote_code=self.target_model_config. - trust_remote_code, - allowed_local_media_path=self.target_model_config. - allowed_local_media_path, - dtype=self.target_model_config.dtype, - seed=self.target_model_config.seed, - revision=self.revision, - code_revision=self.code_revision, - tokenizer_revision=self.target_model_config. - tokenizer_revision, - spec_target_max_model_len=self.target_model_config. - max_model_len, - quantization=self.quantization, - enforce_eager=self.target_model_config.enforce_eager, - max_seq_len_to_capture=self.target_model_config. - max_seq_len_to_capture, - max_logprobs=self.target_model_config.max_logprobs, - hf_overrides=SpeculativeConfig.hf_config_override, - ) - - # Automatically detect the method - if self.method in ('eagle', 'eagle3'): - pass - # examples: - # yuhuili/EAGLE-LLaMA3-Instruct-8B - # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B - # AngelSlim/Qwen3-8B_eagle3 - elif "eagle-" in self.draft_model_config.model.lower(): - self.method = "eagle" - elif "eagle3" in self.draft_model_config.model.lower(): - self.method = "eagle3" - elif self.draft_model_config.hf_config.model_type == "medusa": - self.method = "medusa" - elif (self.draft_model_config.hf_config.model_type == - "mlp_speculator"): - self.method = "mlp_speculator" - elif (self.draft_model_config.hf_config.model_type - in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")): - self.method = "deepseek_mtp" - if self.num_speculative_tokens > 1: - logger.warning( - "All Deepseek MTP models only have " \ - "one layer. Might need some code changes " \ - "to support multiple layers." - ) - elif (self.draft_model_config.hf_config.model_type == - "ernie_mtp"): - self.method = "ernie_mtp" - if self.num_speculative_tokens > 1: - logger.warning( - "All Ernie MTP models only have " \ - "one layer. Might need some code changes " \ - "to support multiple layers." - ) - elif (self.draft_model_config.hf_config.model_type == - "qwen3_next_mtp"): - self.method = "qwen3_next_mtp" - if self.num_speculative_tokens > 1: - logger.warning( - "All Qwen3Next MTP models only have " \ - "one layer. Might need some code changes " \ - "to support multiple layers." 
- ) - else: - self.method = "draft_model" - raise NotImplementedError( - "Speculative decoding with draft model is not " - "supported yet. Please consider using other " - "speculative decoding methods such as ngram, medusa, " - "eagle, or deepseek_mtp.") - - # Replace hf_config for EAGLE draft_model - if self.method in ("eagle", "eagle3"): - if self.enable_chunked_prefill and not envs.VLLM_USE_V1: - raise ValueError( - "Chunked prefill and EAGLE are not compatible " - "when using V0.") - - from vllm.transformers_utils.configs import ( - SpeculatorsConfig) - from vllm.transformers_utils.configs.eagle import ( - EAGLEConfig) - - if isinstance(self.draft_model_config.hf_config, - (EAGLEConfig, SpeculatorsConfig)): - pass - else: - eagle_config = EAGLEConfig( - self.draft_model_config.hf_config, - method=self.method, - model_type="eagle") - self.draft_model_config.hf_config = eagle_config - - if (self.num_speculative_tokens is not None - and hasattr(self.draft_model_config.hf_config, - "num_lookahead_tokens")): - self.draft_model_config.hf_config.num_lookahead_tokens = \ - self.num_speculative_tokens - - n_predict = getattr(self.draft_model_config.hf_config, - "n_predict", None) - if n_predict is not None: - if self.num_speculative_tokens is None: - # Default to max value defined in draft model config. - self.num_speculative_tokens = n_predict - elif self.num_speculative_tokens > n_predict and \ - self.num_speculative_tokens % n_predict != 0: - # Ensure divisibility for MTP module reuse. - raise ValueError( - f"num_speculative_tokens:{self.num_speculative_tokens}" - f" must be divisible by {n_predict=}") - - if self.speculative_token_tree is None: - # Generate chain of tokens. - self.speculative_token_tree = str([ - (i + 1) * (0, ) - for i in range(self.num_speculative_tokens) - ]) - else: - # Sort the token tree breadth-first. - tree_choices = ast.literal_eval( - self.speculative_token_tree) - self.speculative_token_tree = str( - sorted(tree_choices, key=lambda t: (len(t), t))) - - self.draft_tensor_parallel_size = \ - SpeculativeConfig._verify_and_get_draft_tp( - self.target_parallel_config, - self.draft_tensor_parallel_size, - self.draft_model_config.hf_config - ) - - self.draft_model_config.max_model_len = ( - SpeculativeConfig._maybe_override_draft_max_model_len( - self.max_model_len, - self.draft_model_config.max_model_len, - self.target_model_config.max_model_len, - )) - - self.draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - self.target_parallel_config, - self.draft_tensor_parallel_size)) - - @staticmethod - def _maybe_override_draft_max_model_len( - speculative_max_model_len: Optional[int], - draft_max_model_len: int, - target_max_model_len: int, - ) -> int: - """Determine the max sequence len for the draft model. This is usually - the draft_max_model_len, but may be the target_max_model_len if it is - less than the draft_max_model_len, or may be speculative_max_model_len - if it is specified. - - This is necessary so that sequences do not exceed the capacity of the - draft model or the target model. - - speculative_max_model_len is mainly used for testing that sequences can - skip speculation. 
- """ - - if speculative_max_model_len is not None: - - if speculative_max_model_len > draft_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " - f"larger than {draft_max_model_len=}") - - if speculative_max_model_len > target_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " - f"larger than {target_max_model_len=}") - - return speculative_max_model_len - - return min( - draft_max_model_len, - target_max_model_len, - ) - - @staticmethod - def _verify_and_get_draft_tp( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: Optional[int], - draft_hf_config: PretrainedConfig) -> int: - """ - Verifies and adjusts the tensor parallel size for a draft model - specified using speculative_draft_tensor_parallel_size. - """ - # If speculative_draft_tensor_parallel_size is unset then set it - # appropriately else verify that it is set correctly. - if speculative_draft_tensor_parallel_size is None: - if draft_hf_config.model_type == "mlp_speculator": - speculative_draft_tensor_parallel_size = 1 - if target_parallel_config.tensor_parallel_size > 1: - logger.warning( - "%s cannot currently be run with tp>1; " - "setting speculative_draft_tensor_parallel_size=1", - draft_hf_config.model_type) - else: - speculative_draft_tensor_parallel_size = \ - target_parallel_config.tensor_parallel_size - elif speculative_draft_tensor_parallel_size not in ( - 1, target_parallel_config.tensor_parallel_size): - raise ValueError( - f"{speculative_draft_tensor_parallel_size=} cannot be " - f"other value than 1 or target model tensor_parallel_size") - return speculative_draft_tensor_parallel_size - - @staticmethod - def create_draft_parallel_config( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: int, - ) -> ParallelConfig: - """Create a parallel config for use by the draft worker. - - This is mostly a copy of the target parallel config, except the tp_size. - """ - draft_parallel_config = ParallelConfig( - pipeline_parallel_size=target_parallel_config. - pipeline_parallel_size, - tensor_parallel_size=speculative_draft_tensor_parallel_size, - distributed_executor_backend=target_parallel_config. - distributed_executor_backend, - max_parallel_loading_workers=target_parallel_config. - max_parallel_loading_workers, - disable_custom_all_reduce=target_parallel_config. - disable_custom_all_reduce, - ray_workers_use_nsight=target_parallel_config. 
- ray_workers_use_nsight, - placement_group=target_parallel_config.placement_group, - ) - - return draft_parallel_config - - @model_validator(mode='after') - def _verify_args(self) -> Self: - if self.num_speculative_tokens is None: - raise ValueError( - "num_speculative_tokens must be provided with " - "speculative model unless the draft model config contains an " - "n_predict parameter.") - - if self.num_speculative_tokens <= 0: - raise ValueError("Expected num_speculative_tokens to be greater " - f"than zero ({self.num_speculative_tokens}).") - - if self.draft_model_config: - self.draft_model_config.verify_with_parallel_config( - self.draft_parallel_config) - - if (self.disable_by_batch_size is not None - and self.disable_by_batch_size < 2): - raise ValueError("Expect the batch size threshold of disabling " - "speculative decoding is > 1, but got " - f"{self.disable_by_batch_size=}") - - eagle3_target_supported = ["llama", "qwen"] - if self.method == "eagle3" and self.target_model_config and not any( - supported_model in - self.target_model_config.hf_text_config.model_type - for supported_model in eagle3_target_supported): - raise ValueError( - f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501 - f"Got {self.target_model_config.hf_text_config.model_type=}") - - return self - - @property - def num_lookahead_slots(self) -> int: - """The number of additional slots the scheduler should allocate per - step, in addition to the slots allocated for each known token. - - This is equal to the number of speculative tokens, as each speculative - token must be scored. - """ - return self.num_speculative_tokens - - def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp", - "qwen3_next_mtp") - - def __repr__(self) -> str: - method = self.method - model = None if method == "ngram" else self.draft_model_config.model - num_spec_tokens = self.num_speculative_tokens - return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" - - @config @dataclass class PoolerConfig: diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py new file mode 100644 index 0000000000000..b2d50e3852337 --- /dev/null +++ b/vllm/config/speculative.py @@ -0,0 +1,554 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import ast +import hashlib +from typing import TYPE_CHECKING, Any, Literal, Optional + +from pydantic import SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.parallel import ParallelConfig +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import LazyLoader + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + import vllm.model_executor.layers.quantization as me_quant + from vllm.config import ModelConfig +else: + PretrainedConfig = Any + ModelConfig = Any + + me_quant = LazyLoader("model_executor", globals(), + "vllm.model_executor.layers.quantization") + +logger = init_logger(__name__) + +SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", + "mlp_speculator", "draft_model", "deepseek_mtp", + "ernie_mtp", "qwen3_next_mtp"] + + +@config +@dataclass +class SpeculativeConfig: + """Configuration for speculative decoding.""" + + # General speculative decoding control + num_speculative_tokens: SkipValidation[int] = None # type: ignore + """The number of speculative tokens, if provided. 
It will default to the + number in the draft model config if present, otherwise, it is required.""" + model: Optional[str] = None + """The name of the draft model, eagle head, or additional weights, if + provided.""" + method: Optional[SpeculativeMethod] = None + """The name of the speculative method to use. If users provide and set the + `model` param, the speculative method type will be detected automatically + if possible, if `model` param is not provided, the method name must be + provided. + + If using `ngram` method, the related configuration `prompt_lookup_max` and + `prompt_lookup_min` should be considered.""" + draft_tensor_parallel_size: Optional[int] = None + """The degree of the tensor parallelism for the draft model. Can only be 1 + or the same as the target model's tensor parallel size.""" + disable_logprobs: bool = True + """If set to True, token log probabilities are not returned during + speculative decoding. If set to False, token log probabilities are returned + according to the log probability settings in SamplingParams.""" + + # Draft model configuration + quantization: Optional[me_quant.QuantizationMethods] = None + """Quantization method that was used to quantize the draft model weights. + If `None`, we assume the model weights are not quantized. Note that it only + takes effect when using the draft model-based speculative method.""" + max_model_len: Optional[int] = None + """The maximum model length of the draft model. Used when testing the + ability to skip speculation for some sequences.""" + revision: Optional[str] = None + """The specific model version to use for the draft model. It can be a + branch name, a tag name, or a commit id. If unspecified, will use the + default version.""" + code_revision: Optional[str] = None + """The specific revision to use for the draft model code on Hugging Face + Hub. It can be a branch name, a tag name, or a commit id. If unspecified, + will use the default version.""" + + # Advanced control + disable_by_batch_size: Optional[int] = None + """Disable speculative decoding for new incoming requests when the number + of enqueued requests is larger than this value, if provided.""" + + # Ngram proposer configuration + prompt_lookup_max: Optional[int] = None + """Maximum size of ngram token window when using Ngram proposer, required + when method is set to ngram.""" + prompt_lookup_min: Optional[int] = None + """Minimum size of ngram token window when using Ngram proposer, if + provided. Defaults to 1.""" + + speculative_token_tree: Optional[str] = None + """Specifies the tree structure for speculative token generation. + """ + # required configuration params passed from engine + target_model_config: SkipValidation[ModelConfig] = None # type: ignore + """The configuration of the target model.""" + target_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore + """The parallel configuration for the target model.""" + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + """Whether vLLM is configured to use chunked prefill or not. 
Used for + raising an error since it's not yet compatible with speculative decode.""" + disable_log_stats: SkipValidation[bool] = None # type: ignore + """Whether to disable the periodic printing of stage times in speculative + decoding.""" + + # params generated in the post-init stage + draft_model_config: SkipValidation[ModelConfig] = None # type: ignore + """The configuration of the draft model initialized internal.""" + draft_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore + """The parallel configuration for the draft model initialized internal.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + # Eagle3 affects the computation graph because it returns intermediate + # hidden states in addition to the final hidden state. + factors.append(self.method == "eagle3") + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + @staticmethod + def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: + if hf_config.model_type == "deepseek_v3": + hf_config.model_type = "deepseek_mtp" + if hf_config.model_type == "deepseek_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["DeepSeekMTPModel"] + }) + + if hf_config.architectures[0] == "MiMoForCausalLM": + hf_config.model_type = "mimo_mtp" + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "num_hidden_layers": 0, + "n_predict": n_predict, + "architectures": ["MiMoMTPModel"] + }) + + if hf_config.architectures[0] == "Glm4MoeForCausalLM": + hf_config.model_type = "glm4_moe_mtp" + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "num_hidden_layers": 0, + "n_predict": n_predict, + "architectures": ["Glm4MoeMTPModel"] + }) + + if hf_config.model_type == "ernie4_5_moe": + hf_config.model_type = "ernie_mtp" + if hf_config.model_type == "ernie_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["ErnieMTPModel"] + }) + + if hf_config.model_type == "qwen3_next": + hf_config.model_type = "qwen3_next_mtp" + if hf_config.model_type == "qwen3_next_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["Qwen3NextMTP"] + }) + + return hf_config + + def __post_init__(self): + + # Note: "method" is a new parameter that helps to extend the + # configuration of non-model-based proposers, and the "model" parameter + # will be used to set the draft model, eagle head, or additional weight + # when needed. If users do not specify "method", the speculative method + # will be detected automatically if possible. If the speculative method + # can not be detected, it will be considered as the "draft_model" by + # default. 
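+        # For example (illustrative; the concrete names come from the
+        # examples listed further below): model=
+        # "yuhuili/EAGLE-LLaMA3-Instruct-8B" is detected as "eagle" from
+        # the checkpoint name, a draft whose hf_config.model_type is
+        # "medusa" is detected as "medusa", and model="ngram" selects
+        # the ngram proposer without loading a separate draft model.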
+ + if self.model is None and self.num_speculative_tokens is not None: + # TODO(Shangming): Refactor mtp configuration logic when supporting + # mtp acceleration for more models besides deepseek_v3 + if self.target_model_config and \ + (self.target_model_config.hf_text_config.model_type \ + == "deepseek_v3" or + self.target_model_config.hf_text_config.model_type in + ("mimo","ernie4_5_moe", "qwen3_next")): + # use the draft model from the same model: + self.model = self.target_model_config.model + # Align the quantization of draft model for cases such as + # --quantization fp8 with a bf16 checkpoint. + if not self.quantization: + self.quantization = self.target_model_config.quantization + elif self.method in ("ngram", "[ngram]"): + self.model = "ngram" + else: + raise ValueError("num_speculative_tokens was provided without " + "speculative model.") + + # Automatically configure the method for ngram when "model" is used + # instead of "method" + if self.method is None and (self.model is not None + and self.model in ("ngram", "[ngram]")): + self.method = "ngram" + + if self.method in ("ngram", "[ngram]"): + # Unified to "ngram" internally + self.method = "ngram" + # Set default values if not provided + if (self.prompt_lookup_min is None + and self.prompt_lookup_max is None): + # TODO(woosuk): Tune these values. They are arbitrarily chosen. + self.prompt_lookup_min = 5 + self.prompt_lookup_max = 5 + elif self.prompt_lookup_min is None: + assert self.prompt_lookup_max is not None + self.prompt_lookup_min = self.prompt_lookup_max + elif self.prompt_lookup_max is None: + assert self.prompt_lookup_min is not None + self.prompt_lookup_max = self.prompt_lookup_min + + # Validate values + if self.prompt_lookup_min < 1: + raise ValueError( + f"prompt_lookup_min={self.prompt_lookup_min} must be > 0") + if self.prompt_lookup_max < 1: + raise ValueError( + f"prompt_lookup_max={self.prompt_lookup_max} must be > 0") + if self.prompt_lookup_min > self.prompt_lookup_max: + raise ValueError( + f"prompt_lookup_min={self.prompt_lookup_min} must " + f"be <= prompt_lookup_max={self.prompt_lookup_max}") + + # TODO: current we still need extract vocab_size from target model + # config, in future, we may try refactor it out, and set + # draft related config as None here. + self.draft_model_config = self.target_model_config + self.draft_parallel_config = self.target_parallel_config + else: + self.prompt_lookup_max = 0 + self.prompt_lookup_min = 0 + + if self.model is not None: + # TODO: Move this import to the top once `ModelConfig` + # lives in `vllm.config.model`. + from vllm.config import ModelConfig + self.draft_model_config = ModelConfig( + model=self.model, + runner="draft", + tokenizer=self.target_model_config.tokenizer, + tokenizer_mode=self.target_model_config.tokenizer_mode, + trust_remote_code=self.target_model_config. + trust_remote_code, + allowed_local_media_path=self.target_model_config. + allowed_local_media_path, + dtype=self.target_model_config.dtype, + seed=self.target_model_config.seed, + revision=self.revision, + code_revision=self.code_revision, + tokenizer_revision=self.target_model_config. + tokenizer_revision, + spec_target_max_model_len=self.target_model_config. + max_model_len, + quantization=self.quantization, + enforce_eager=self.target_model_config.enforce_eager, + max_seq_len_to_capture=self.target_model_config. 
+ max_seq_len_to_capture, + max_logprobs=self.target_model_config.max_logprobs, + hf_overrides=SpeculativeConfig.hf_config_override, + ) + + # Automatically detect the method + if self.method in ('eagle', 'eagle3'): + pass + # examples: + # yuhuili/EAGLE-LLaMA3-Instruct-8B + # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B + # AngelSlim/Qwen3-8B_eagle3 + elif "eagle-" in self.draft_model_config.model.lower(): + self.method = "eagle" + elif "eagle3" in self.draft_model_config.model.lower(): + self.method = "eagle3" + elif self.draft_model_config.hf_config.model_type == "medusa": + self.method = "medusa" + elif (self.draft_model_config.hf_config.model_type == + "mlp_speculator"): + self.method = "mlp_speculator" + elif (self.draft_model_config.hf_config.model_type + in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")): + self.method = "deepseek_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Deepseek MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) + elif (self.draft_model_config.hf_config.model_type == + "ernie_mtp"): + self.method = "ernie_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Ernie MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) + elif (self.draft_model_config.hf_config.model_type == + "qwen3_next_mtp"): + self.method = "qwen3_next_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Qwen3Next MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) + else: + self.method = "draft_model" + raise NotImplementedError( + "Speculative decoding with draft model is not " + "supported yet. Please consider using other " + "speculative decoding methods such as ngram, medusa, " + "eagle, or deepseek_mtp.") + + # Replace hf_config for EAGLE draft_model + if self.method in ("eagle", "eagle3"): + if self.enable_chunked_prefill and not envs.VLLM_USE_V1: + raise ValueError( + "Chunked prefill and EAGLE are not compatible " + "when using V0.") + + from vllm.transformers_utils.configs import ( + SpeculatorsConfig) + from vllm.transformers_utils.configs.eagle import ( + EAGLEConfig) + + if isinstance(self.draft_model_config.hf_config, + (EAGLEConfig, SpeculatorsConfig)): + pass + else: + eagle_config = EAGLEConfig( + self.draft_model_config.hf_config, + method=self.method, + model_type="eagle") + self.draft_model_config.hf_config = eagle_config + + if (self.num_speculative_tokens is not None + and hasattr(self.draft_model_config.hf_config, + "num_lookahead_tokens")): + self.draft_model_config.hf_config.num_lookahead_tokens = \ + self.num_speculative_tokens + + n_predict = getattr(self.draft_model_config.hf_config, + "n_predict", None) + if n_predict is not None: + if self.num_speculative_tokens is None: + # Default to max value defined in draft model config. + self.num_speculative_tokens = n_predict + elif self.num_speculative_tokens > n_predict and \ + self.num_speculative_tokens % n_predict != 0: + # Ensure divisibility for MTP module reuse. + raise ValueError( + f"num_speculative_tokens:{self.num_speculative_tokens}" + f" must be divisible by {n_predict=}") + + if self.speculative_token_tree is None: + # Generate chain of tokens. + self.speculative_token_tree = str([ + (i + 1) * (0, ) + for i in range(self.num_speculative_tokens) + ]) + else: + # Sort the token tree breadth-first. 
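+                    # For example (illustrative value): tree_choices of
+                    # [(0, 0), (1,), (0,)] sorts to [(0,), (1,), (0, 0)],
+                    # since the key (len(t), t) orders nodes by depth
+                    # first and lexicographically within a depth.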
+ tree_choices = ast.literal_eval( + self.speculative_token_tree) + self.speculative_token_tree = str( + sorted(tree_choices, key=lambda t: (len(t), t))) + + self.draft_tensor_parallel_size = \ + SpeculativeConfig._verify_and_get_draft_tp( + self.target_parallel_config, + self.draft_tensor_parallel_size, + self.draft_model_config.hf_config + ) + + self.draft_model_config.max_model_len = ( + SpeculativeConfig._maybe_override_draft_max_model_len( + self.max_model_len, + self.draft_model_config.max_model_len, + self.target_model_config.max_model_len, + )) + + self.draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + self.target_parallel_config, + self.draft_tensor_parallel_size)) + + @staticmethod + def _maybe_override_draft_max_model_len( + speculative_max_model_len: Optional[int], + draft_max_model_len: int, + target_max_model_len: int, + ) -> int: + """Determine the max sequence len for the draft model. This is usually + the draft_max_model_len, but may be the target_max_model_len if it is + less than the draft_max_model_len, or may be speculative_max_model_len + if it is specified. + + This is necessary so that sequences do not exceed the capacity of the + draft model or the target model. + + speculative_max_model_len is mainly used for testing that sequences can + skip speculation. + """ + + if speculative_max_model_len is not None: + + if speculative_max_model_len > draft_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {draft_max_model_len=}") + + if speculative_max_model_len > target_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {target_max_model_len=}") + + return speculative_max_model_len + + return min( + draft_max_model_len, + target_max_model_len, + ) + + @staticmethod + def _verify_and_get_draft_tp( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int], + draft_hf_config: PretrainedConfig) -> int: + """ + Verifies and adjusts the tensor parallel size for a draft model + specified using speculative_draft_tensor_parallel_size. + """ + # If speculative_draft_tensor_parallel_size is unset then set it + # appropriately else verify that it is set correctly. + if speculative_draft_tensor_parallel_size is None: + if draft_hf_config.model_type == "mlp_speculator": + speculative_draft_tensor_parallel_size = 1 + if target_parallel_config.tensor_parallel_size > 1: + logger.warning( + "%s cannot currently be run with tp>1; " + "setting speculative_draft_tensor_parallel_size=1", + draft_hf_config.model_type) + else: + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size + elif speculative_draft_tensor_parallel_size not in ( + 1, target_parallel_config.tensor_parallel_size): + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be " + f"other value than 1 or target model tensor_parallel_size") + return speculative_draft_tensor_parallel_size + + @staticmethod + def create_draft_parallel_config( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int, + ) -> ParallelConfig: + """Create a parallel config for use by the draft worker. + + This is mostly a copy of the target parallel config, except the tp_size. + """ + draft_parallel_config = ParallelConfig( + pipeline_parallel_size=target_parallel_config. + pipeline_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, + distributed_executor_backend=target_parallel_config. 
+            distributed_executor_backend,
+            max_parallel_loading_workers=target_parallel_config.
+            max_parallel_loading_workers,
+            disable_custom_all_reduce=target_parallel_config.
+            disable_custom_all_reduce,
+            ray_workers_use_nsight=target_parallel_config.
+            ray_workers_use_nsight,
+            placement_group=target_parallel_config.placement_group,
+        )
+
+        return draft_parallel_config
+
+    @model_validator(mode='after')
+    def _verify_args(self) -> Self:
+        if self.num_speculative_tokens is None:
+            raise ValueError(
+                "num_speculative_tokens must be provided with "
+                "speculative model unless the draft model config contains an "
+                "n_predict parameter.")
+
+        if self.num_speculative_tokens <= 0:
+            raise ValueError("Expected num_speculative_tokens to be greater "
+                             f"than zero ({self.num_speculative_tokens}).")
+
+        if self.draft_model_config:
+            self.draft_model_config.verify_with_parallel_config(
+                self.draft_parallel_config)
+
+        if (self.disable_by_batch_size is not None
+                and self.disable_by_batch_size < 2):
+            raise ValueError("Expect the batch size threshold of disabling "
+                             "speculative decoding is > 1, but got "
+                             f"{self.disable_by_batch_size=}")
+
+        eagle3_target_supported = ["llama", "qwen"]
+        if self.method == "eagle3" and self.target_model_config and not any(
+                supported_model in
+                self.target_model_config.hf_text_config.model_type
+                for supported_model in eagle3_target_supported):
+            raise ValueError(
+                f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
+                f"Got {self.target_model_config.hf_text_config.model_type=}")
+
+        return self
+
+    @property
+    def num_lookahead_slots(self) -> int:
+        """The number of additional slots the scheduler should allocate per
+        step, in addition to the slots allocated for each known token.
+
+        This is equal to the number of speculative tokens, as each speculative
+        token must be scored.
+        """
+        return self.num_speculative_tokens
+
+    def use_eagle(self) -> bool:
+        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
+                               "qwen3_next_mtp")
+
+    def __repr__(self) -> str:
+        method = self.method
+        model = None if method == "ngram" else self.draft_model_config.model
+        num_spec_tokens = self.num_speculative_tokens
+        return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

From 85e0df139236d90acfe6f940ff1c96157e07dae0 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Tue, 16 Sep 2025 05:52:57 -0700
Subject: [PATCH 315/584] [Docs] move benchmarks README to contributing guides
 (#24820)

---
 benchmarks/README.md            | 880 +-------------------------------
 docs/contributing/benchmarks.md | 794 +++++++++++++++++++++++++++-
 2 files changed, 800 insertions(+), 874 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index ee172642033de..269a4d51ec2ef 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,874 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks

-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory contains vLLM's benchmark scripts and utilities for performance testing and evaluation.
-## Dataset Overview +## Contents -<table style="width:100%; border-collapse: collapse;"> - <thead> - <tr> - <th style="width:15%; text-align: left;">Dataset</th> - <th style="width:10%; text-align: center;">Online</th> - <th style="width:10%; text-align: center;">Offline</th> - <th style="width:65%; text-align: left;">Data Path</th> - </tr> - </thead> - <tbody> - <tr> - <td><strong>ShareGPT</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td> - </tr> - <tr> - <td><strong>ShareGPT4V (Image)</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td> - <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code> - <br> - <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div> - <code>wget http://images.cocodataset.org/zips/train2017.zip</code> - </td> - </tr> - <tr> - <td><strong>ShareGPT4Video (Video)</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td> - <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code> - </td> - </tr> - <tr> - <td><strong>BurstGPT</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td> - </tr> - <tr> - <td><strong>Sonnet (deprecated)</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td>Local file: <code>benchmarks/sonnet.txt</code></td> - </tr> - <tr> - <td><strong>Random</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>synthetic</code></td> - </tr> - <tr> - <td><strong>RandomMultiModal (Image/Video)</strong></td> - <td style="text-align: center;">🟡</td> - <td style="text-align: center;">🚧</td> - <td><code>synthetic</code> </td> - </tr> - <tr> - <td><strong>Prefix Repetition</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>synthetic</code></td> - </tr> - <tr> - <td><strong>HuggingFace-VisionArena</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>lmarena-ai/VisionArena-Chat</code></td> - </tr> - <tr> - <td><strong>HuggingFace-InstructCoder</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>likaixin/InstructCoder</code></td> - </tr> - <tr> - <td><strong>HuggingFace-AIMO</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td> - </tr> - <tr> - <td><strong>HuggingFace-Other</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td> - </tr> - <tr> - <td><strong>HuggingFace-MTBench</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>philschmid/mt-bench</code></td> - </tr> - <tr> - <td><strong>HuggingFace-Blazedit</strong></td> - <td style="text-align: 
center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td> - </tr> - <tr> - <td><strong>Spec Bench</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td> - </tr> - <tr> - <td><strong>Custom</strong></td> - <td style="text-align: center;">✅</td> - <td style="text-align: center;">✅</td> - <td>Local file: <code>data.jsonl</code></td> - </tr> - </tbody> -</table> +- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput) +- **Throughput benchmarks**: Scripts for testing offline batch inference performance +- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference +- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.) -✅: supported +## Usage -🟡: Partial support +For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli). -🚧: to be supported +For full CLI reference see: -**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`. -For local `dataset-path`, please set `hf-name` to its Hugging Face ID like - -```bash ---dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat -``` - -## 🚀 Example - Online Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -First start serving your model - -```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B -``` - -Then run the benchmarking script - -```bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -vllm bench serve \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ - --num-prompts 10 -``` - -If successful, you will see the following output - -```text -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 ----------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 ----------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 -================================================== -``` - -### Custom Dataset - -If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. 
Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl - -```json -{"prompt": "What is the capital of India?"} -{"prompt": "What is the capital of Iran?"} -{"prompt": "What is the capital of China?"} -``` - -```bash -# start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct -``` - -```bash -# run benchmarking script -vllm bench serve --port 9001 --save-result --save-detailed \ - --backend vllm \ - --model meta-llama/Llama-3.1-8B-Instruct \ - --endpoint /v1/completions \ - --dataset-name custom \ - --dataset-path <path-to-your-data-jsonl> \ - --custom-skip-chat-template \ - --num-prompts 80 \ - --max-concurrency 1 \ - --temperature=0.3 \ - --top-p=0.75 \ - --result-dir "./log/" -``` - -You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. - -### VisionArena Benchmark for Vision Language Models - -```bash -# need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct -``` - -```bash -vllm bench serve \ - --backend openai-chat \ - --endpoint-type openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --hf-split train \ - --num-prompts 1000 -``` - -### InstructCoder Benchmark with Speculative Decoding - -``` bash -VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -``` bash -vllm bench serve \ - --model meta-llama/Meta-Llama-3-8B-Instruct \ - --dataset-name hf \ - --dataset-path likaixin/InstructCoder \ - --num-prompts 2048 -``` - -### Spec Bench Benchmark with Speculative Decoding - -``` bash -VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -[SpecBench dataset](https://github.com/hemingkx/Spec-Bench) - -Run all categories: - -``` bash -# Download the dataset using: -# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl - -vllm bench serve \ - --model meta-llama/Meta-Llama-3-8B-Instruct \ - --dataset-name spec_bench \ - --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \ - --num-prompts -1 -``` - -Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`. 
- -Run only a specific category like "summarization": - -``` bash -vllm bench serve \ - --model meta-llama/Meta-Llama-3-8B-Instruct \ - --dataset-name spec_bench \ - --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \ - --num-prompts -1 - --spec-bench-category "summarization" -``` - -### Other HuggingFaceDataset Examples - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct -``` - -`lmms-lab/LLaVA-OneVision-Data`: - -```bash -vllm bench serve \ - --backend openai-chat \ - --endpoint-type openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -`Aeala/ShareGPT_Vicuna_unfiltered`: - -```bash -vllm bench serve \ - --backend openai-chat \ - --endpoint-type openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -`AI-MO/aimo-validation-aime`: - -``` bash -vllm bench serve \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --num-prompts 10 \ - --seed 42 -``` - -`philschmid/mt-bench`: - -``` bash -vllm bench serve \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path philschmid/mt-bench \ - --num-prompts 80 -``` - -`vdaita/edit_5k_char` or `vdaita/edit_10k_char`: - -``` bash -vllm bench serve \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path vdaita/edit_5k_char \ - --num-prompts 90 \ - --blazedit-min-distance 0.01 \ - --blazedit-max-distance 0.99 -``` - -### Running With Sampling Parameters - -When using OpenAI-compatible backends such as `vllm`, optional sampling -parameters can be specified. Example client command: - -```bash -vllm bench serve \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ - --top-k 10 \ - --top-p 0.9 \ - --temperature 0.5 \ - --num-prompts 10 -``` - -### Running With Ramp-Up Request Rate - -The benchmark tool also supports ramping up the request rate over the -duration of the benchmark run. This can be useful for stress testing the -server or finding the maximum throughput that it can handle, given some latency budget. - -Two ramp-up strategies are supported: - -- `linear`: Increases the request rate linearly from a start value to an end value. -- `exponential`: Increases the request rate exponentially. - -The following arguments can be used to control the ramp-up: - -- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). -- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. -- `--ramp-up-end-rps`: The request rate at the end of the benchmark. 
- -</details> - -## 📈 Example - Offline Throughput Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -```bash -vllm bench throughput \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset-name sonnet \ - --dataset-path vllm/benchmarks/sonnet.txt \ - --num-prompts 10 -``` - -If successful, you will see the following output - -```text -Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s -Total num prompt tokens: 5014 -Total num output tokens: 1500 -``` - -### VisionArena Benchmark for Vision Language Models - -```bash -vllm bench throughput \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --num-prompts 1000 \ - --hf-split train -``` - -The `num prompt tokens` now includes image token counts - -```text -Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s -Total num prompt tokens: 14527 -Total num output tokens: 1280 -``` - -### InstructCoder Benchmark with Speculative Decoding - -``` bash -VLLM_WORKER_MULTIPROC_METHOD=spawn \ -VLLM_USE_V1=1 \ -vllm bench throughput \ - --dataset-name=hf \ - --dataset-path=likaixin/InstructCoder \ - --model=meta-llama/Meta-Llama-3-8B-Instruct \ - --input-len=1000 \ - --output-len=100 \ - --num-prompts=2048 \ - --async-engine \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -```text -Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s -Total num prompt tokens: 261136 -Total num output tokens: 204800 -``` - -### Other HuggingFaceDataset Examples - -`lmms-lab/LLaVA-OneVision-Data`: - -```bash -vllm bench throughput \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -`Aeala/ShareGPT_Vicuna_unfiltered`: - -```bash -vllm bench throughput \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -`AI-MO/aimo-validation-aime`: - -```bash -vllm bench throughput \ - --model Qwen/QwQ-32B \ - --backend vllm \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --hf-split train \ - --num-prompts 10 -``` - -Benchmark with LoRA adapters: - -``` bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -vllm bench throughput \ - --model meta-llama/Llama-2-7b-hf \ - --backend vllm \ - --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ - --dataset_name sharegpt \ - --num-prompts 10 \ - --max-loras 2 \ - --max-lora-rank 8 \ - --enable-lora \ - --lora-path yard1/llama-2-7b-sql-lora-test - ``` - -</details> - -## 🛠️ Example - Structured Output Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -Benchmark the performance of structured output generation (JSON, grammar, regex). 
- -### Server Setup - -```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B -``` - -### JSON Schema Benchmark - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset json \ - --structured-output-ratio 1.0 \ - --request-rate 10 \ - --num-prompts 1000 -``` - -### Grammar-based Generation Benchmark - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset grammar \ - --structure-type grammar \ - --request-rate 10 \ - --num-prompts 1000 -``` - -### Regex-based Generation Benchmark - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset regex \ - --request-rate 10 \ - --num-prompts 1000 -``` - -### Choice-based Generation Benchmark - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset choice \ - --request-rate 10 \ - --num-prompts 1000 -``` - -### XGrammar Benchmark Dataset - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset xgrammar_bench \ - --request-rate 10 \ - --num-prompts 1000 -``` - -</details> - -## 📚 Example - Long Document QA Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -Benchmark the performance of long document question-answering with prefix caching. - -### Basic Long Document QA Test - -```bash -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 16 \ - --document-length 2000 \ - --output-len 50 \ - --repeat-count 5 -``` - -### Different Repeat Modes - -```bash -# Random mode (default) - shuffle prompts randomly -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode random - -# Tile mode - repeat entire prompt list in sequence -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode tile - -# Interleave mode - repeat each prompt consecutively -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode interleave -``` - -</details> - -## 🗂️ Example - Prefix Caching Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -Benchmark the efficiency of automatic prefix caching. 
- -### Fixed Prompt with Prefix Caching - -```bash -python3 benchmarks/benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-prompts 1 \ - --repeat-count 100 \ - --input-length-range 128:256 -``` - -### ShareGPT Dataset with Prefix Caching - -```bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -python3 benchmarks/benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \ - --enable-prefix-caching \ - --num-prompts 20 \ - --repeat-count 5 \ - --input-length-range 128:256 -``` - -### Prefix Repetition Dataset - -```bash -vllm bench serve \ - --backend openai \ - --model meta-llama/Llama-2-7b-chat-hf \ - --dataset-name prefix_repetition \ - --num-prompts 100 \ - --prefix-repetition-prefix-len 512 \ - --prefix-repetition-suffix-len 128 \ - --prefix-repetition-num-prefixes 5 \ - --prefix-repetition-output-len 128 -``` - -</details> - -## ⚡ Example - Request Prioritization Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -Benchmark the performance of request prioritization in vLLM. - -### Basic Prioritization Test - -```bash -python3 benchmarks/benchmark_prioritization.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --input-len 128 \ - --output-len 64 \ - --num-prompts 100 \ - --scheduling-policy priority -``` - -### Multiple Sequences per Prompt - -```bash -python3 benchmarks/benchmark_prioritization.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --input-len 128 \ - --output-len 64 \ - --num-prompts 100 \ - --scheduling-policy priority \ - --n 2 -``` - -</details> - -## 👁️ Example - Multi-Modal Benchmark - -<details> -<summary>Show more</summary> - -<br/> - -Benchmark the performance of multi-modal requests in vLLM. - -### Images (ShareGPT4V) - -Start vLLM: - -```bash -python -m vllm.entrypoints.openai.api_server \ - --model Qwen/Qwen2.5-VL-7B-Instruct \ - --dtype bfloat16 \ - --limit-mm-per-prompt '{"image": 1}' \ - --allowed-local-media-path /path/to/sharegpt4v/images -``` - -Send requests with images: - -```bash -vllm bench serve \ - --backend openai-chat \ - --model Qwen/Qwen2.5-VL-7B-Instruct \ - --dataset-name sharegpt \ - --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \ - --num-prompts 100 \ - --save-result \ - --result-dir ~/vllm_benchmark_results \ - --save-detailed \ - --endpoint /v1/chat/completion -``` - -### Videos (ShareGPT4Video) - -Start vLLM: - -```bash -python -m vllm.entrypoints.openai.api_server \ - --model Qwen/Qwen2.5-VL-7B-Instruct \ - --dtype bfloat16 \ - --limit-mm-per-prompt '{"video": 1}' \ - --allowed-local-media-path /path/to/sharegpt4video/videos -``` - -Send requests with videos: - -```bash -vllm bench serve \ - --backend openai-chat \ - --model Qwen/Qwen2.5-VL-7B-Instruct \ - --dataset-name sharegpt \ - --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \ - --num-prompts 100 \ - --save-result \ - --result-dir ~/vllm_benchmark_results \ - --save-detailed \ - --endpoint /v1/chat/completion -``` - -### Synthetic Random Images (random-mm) - -Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets. - -Notes: - -- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`. 
-- Video sampling is not yet implemented. - -Start the server (example): - -```bash -vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ - --dtype bfloat16 \ - --max-model-len 16384 \ - --limit-mm-per-prompt '{"image": 3, "video": 0}' \ - --mm-processor-kwargs max_pixels=1003520 -``` - -Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`. - -Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens: - -```bash -vllm bench serve \ - --backend openai-chat \ - --model Qwen/Qwen2.5-VL-3B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name random-mm \ - --num-prompts 100 \ - --max-concurrency 10 \ - --random-prefix-len 25 \ - --random-input-len 300 \ - --random-output-len 40 \ - --random-range-ratio 0.2 \ - --random-mm-base-items-per-request 2 \ - --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \ - --random-mm-bucket-config '{(224, 224, 1): 1.0}' \ - --request-rate inf \ - --ignore-eos \ - --seed 42 -``` - -The number of items per request can be controlled by passing multiple image buckets: - -```bash - --random-mm-base-items-per-request 2 \ - --random-mm-num-mm-items-range-ratio 0.5 \ - --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \ - --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \ -``` - -Flags specific to `random-mm`: - -- `--random-mm-base-items-per-request`: base number of multimodal items per request. -- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items. -- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'. -- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported). - -Behavioral notes: - -- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping. - -How sampling works: - -- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits. -- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added. -- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing. -This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`. -- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`. 
- -</details> +- <https://docs.vllm.ai/en/latest/cli/bench/latency.html> +- <https://docs.vllm.ai/en/latest/cli/bench/serve.html> +- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html> diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 25c2d2955ff2f..13582dadb46e0 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1,9 +1,789 @@ +--- +toc_depth: 4 +--- + # Benchmark Suites -vLLM contains two sets of benchmarks: +vLLM provides comprehensive benchmarking tools for performance testing and evaluation: -- [Performance benchmarks][performance-benchmarks] -- [Nightly benchmarks][nightly-benchmarks] +- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing +- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development +- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives + +[Benchmark CLI]: #benchmark-cli + +## Benchmark CLI + +This section guides you through running benchmark tests with the extensive +datasets supported on vLLM. It's a living document, updated as new features and datasets +become available. + +### Dataset Overview + +<style> +th { + min-width: 0 !important; +} +</style> + +| Dataset | Online | Offline | Data Path | +|---------|--------|---------|-----------| +| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` | +| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` | +| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` | +| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` | +| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | +| Random | ✅ | ✅ | `synthetic` | +| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | +| Prefix Repetition | ✅ | ✅ | `synthetic` | +| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | +| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` | +| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` | +| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` | +| HuggingFace-MTBench | ✅ | ✅ | `philschmid/mt-bench` | +| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` | +| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` | +| Custom | ✅ | ✅ | Local file: `data.jsonl` | + +Legend: + +- ✅ - supported +- 🟡 - Partial support +- 🚧 - to be supported + +!!! note + HuggingFace dataset's `dataset-name` should be set to `hf`. 
+ For local `dataset-path`, please set `hf-name` to its Hugging Face ID like + + ```bash + --dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat + ``` + +### Examples + +#### 🚀 Online Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +First start serving your model + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B +``` + +Then run the benchmarking script + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +vllm bench serve \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 +``` + +If successful, you will see the following output + +```text +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 +---------------Time to First Token---------------- +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 +---------------Inter-token Latency---------------- +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 +================================================== +``` + +##### Custom Dataset + +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +```json +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct +``` + +```bash +# run benchmarking script +vllm bench serve --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path <path-to-your-data-jsonl> \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
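+
+If your source data is in some other JSON layout, a small one-off script can
+produce this file. The sketch below assumes the source is a JSON array whose
+records hold their text under a hypothetical `"question"` key; adjust the key
+to match your data:
+
+```python
+# Sketch: convert a JSON array of records into the custom JSONL format.
+# The "question" key is a placeholder for wherever your text actually lives.
+import json
+
+with open("source.json") as f, open("data.jsonl", "w") as out:
+    for record in json.load(f):
+        out.write(json.dumps({"prompt": record["question"]}) + "\n")
+```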
+
+##### VisionArena Benchmark for Vision Language Models
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --endpoint-type openai-chat \
+    --model Qwen/Qwen2-VL-7B-Instruct \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path lmarena-ai/VisionArena-Chat \
+    --hf-split train \
+    --num-prompts 1000
+```
+
+##### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name hf \
+    --dataset-path likaixin/InstructCoder \
+    --num-prompts 2048
+```
+
+##### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1 \
+    --spec-bench-category "summarization"
+```
+
+##### Other HuggingFaceDataset Examples
+
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+`lmms-lab/LLaVA-OneVision-Data`:
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --endpoint-type openai-chat \
+    --model Qwen/Qwen2-VL-7B-Instruct \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path lmms-lab/LLaVA-OneVision-Data \
+    --hf-split train \
+    --hf-subset "chart2text(cauldron)" \
+    --num-prompts 10
+```
+
+`Aeala/ShareGPT_Vicuna_unfiltered`:
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --endpoint-type openai-chat \
+    --model Qwen/Qwen2-VL-7B-Instruct \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+    --hf-split train \
+    --num-prompts 10
+```
+
+`AI-MO/aimo-validation-aime`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path AI-MO/aimo-validation-aime \
+    --num-prompts 10 \
+    --seed 42
+```
+
+`philschmid/mt-bench`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
+
+##### Running With Sampling Parameters
+
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command: + +```bash +vllm bench serve \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ + --top-k 10 \ + --top-p 0.9 \ + --temperature 0.5 \ + --num-prompts 10 +``` + +##### Running With Ramp-Up Request Rate + +The benchmark tool also supports ramping up the request rate over the +duration of the benchmark run. This can be useful for stress testing the +server or finding the maximum throughput that it can handle, given some latency budget. + +Two ramp-up strategies are supported: + +- `linear`: Increases the request rate linearly from a start value to an end value. +- `exponential`: Increases the request rate exponentially. + +The following arguments can be used to control the ramp-up: + +- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). +- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. +- `--ramp-up-end-rps`: The request rate at the end of the benchmark. + +</details> + +#### 📈 Offline Throughput Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +```bash +vllm bench throughput \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset-name sonnet \ + --dataset-path vllm/benchmarks/sonnet.txt \ + --num-prompts 10 +``` + +If successful, you will see the following output + +```text +Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s +Total num prompt tokens: 5014 +Total num output tokens: 1500 +``` + +##### VisionArena Benchmark for Vision Language Models + +```bash +vllm bench throughput \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --num-prompts 1000 \ + --hf-split train +``` + +The `num prompt tokens` now includes image token counts + +```text +Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s +Total num prompt tokens: 14527 +Total num output tokens: 1280 +``` + +##### InstructCoder Benchmark with Speculative Decoding + +``` bash +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_USE_V1=1 \ +vllm bench throughput \ + --dataset-name=hf \ + --dataset-path=likaixin/InstructCoder \ + --model=meta-llama/Meta-Llama-3-8B-Instruct \ + --input-len=1000 \ + --output-len=100 \ + --num-prompts=2048 \ + --async-engine \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +```text +Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s +Total num prompt tokens: 261136 +Total num output tokens: 204800 +``` + +##### Other HuggingFaceDataset Examples + +`lmms-lab/LLaVA-OneVision-Data`: + +```bash +vllm bench throughput \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +`Aeala/ShareGPT_Vicuna_unfiltered`: + +```bash +vllm bench throughput \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +`AI-MO/aimo-validation-aime`: + +```bash +vllm bench throughput \ + --model Qwen/QwQ-32B \ + --backend vllm \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + 
--hf-split train \ + --num-prompts 10 +``` + +Benchmark with LoRA adapters: + +``` bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +vllm bench throughput \ + --model meta-llama/Llama-2-7b-hf \ + --backend vllm \ + --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset_name sharegpt \ + --num-prompts 10 \ + --max-loras 2 \ + --max-lora-rank 8 \ + --enable-lora \ + --lora-path yard1/llama-2-7b-sql-lora-test +``` + +</details> + +#### 🛠️ Structured Output Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +Benchmark the performance of structured output generation (JSON, grammar, regex). + +##### Server Setup + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B +``` + +##### JSON Schema Benchmark + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset json \ + --structured-output-ratio 1.0 \ + --request-rate 10 \ + --num-prompts 1000 +``` + +##### Grammar-based Generation Benchmark + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset grammar \ + --structure-type grammar \ + --request-rate 10 \ + --num-prompts 1000 +``` + +##### Regex-based Generation Benchmark + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset regex \ + --request-rate 10 \ + --num-prompts 1000 +``` + +##### Choice-based Generation Benchmark + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset choice \ + --request-rate 10 \ + --num-prompts 1000 +``` + +##### XGrammar Benchmark Dataset + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset xgrammar_bench \ + --request-rate 10 \ + --num-prompts 1000 +``` + +</details> + +#### 📚 Long Document QA Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +Benchmark the performance of long document question-answering with prefix caching. 
+ +##### Basic Long Document QA Test + +```bash +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 16 \ + --document-length 2000 \ + --output-len 50 \ + --repeat-count 5 +``` + +##### Different Repeat Modes + +```bash +# Random mode (default) - shuffle prompts randomly +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode random + +# Tile mode - repeat entire prompt list in sequence +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode tile + +# Interleave mode - repeat each prompt consecutively +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode interleave +``` + +</details> + +#### 🗂️ Prefix Caching Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +Benchmark the efficiency of automatic prefix caching. + +##### Fixed Prompt with Prefix Caching + +```bash +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-prompts 1 \ + --repeat-count 100 \ + --input-length-range 128:256 +``` + +##### ShareGPT Dataset with Prefix Caching + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +``` + +##### Prefix Repetition Dataset + +```bash +vllm bench serve \ + --backend openai \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-name prefix_repetition \ + --num-prompts 100 \ + --prefix-repetition-prefix-len 512 \ + --prefix-repetition-suffix-len 128 \ + --prefix-repetition-num-prefixes 5 \ + --prefix-repetition-output-len 128 +``` + +</details> + +#### ⚡ Request Prioritization Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +Benchmark the performance of request prioritization in vLLM. + +##### Basic Prioritization Test + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority +``` + +##### Multiple Sequences per Prompt + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority \ + --n 2 +``` + +</details> + +#### 👁️ Multi-Modal Benchmark + +<details class="admonition abstract" markdown="1"> +<summary>Show more</summary> + +Benchmark the performance of multi-modal requests in vLLM. 
+
+##### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dtype bfloat16 \
+    --limit-mm-per-prompt '{"image": 1}' \
+    --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dataset-name sharegpt \
+    --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+    --num-prompts 100 \
+    --save-result \
+    --result-dir ~/vllm_benchmark_results \
+    --save-detailed \
+    --endpoint /v1/chat/completions
+```
+
+##### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dtype bfloat16 \
+    --limit-mm-per-prompt '{"video": 1}' \
+    --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dataset-name sharegpt \
+    --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+    --num-prompts 100 \
+    --save-result \
+    --result-dir ~/vllm_benchmark_results \
+    --save-detailed \
+    --endpoint /v1/chat/completions
+```
+
+##### Synthetic Random Images (random-mm)
+
+Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
+
+Notes:
+
+- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
+- Video sampling is not yet implemented.
+
+Start the server (example):
+
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+    --dtype bfloat16 \
+    --max-model-len 16384 \
+    --limit-mm-per-prompt '{"image": 3, "video": 0}' \
+    --mm-processor-kwargs max_pixels=1003520
+```
+
+Then run the benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `--random-output-len`.
+
+Example 1: fixed number of items and a single image resolution, enforcing generation of approximately 40 tokens:
+
+```bash
+vllm bench serve \
+    --backend openai-chat \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
+    --endpoint /v1/chat/completions \
+    --dataset-name random-mm \
+    --num-prompts 100 \
+    --max-concurrency 10 \
+    --random-prefix-len 25 \
+    --random-input-len 300 \
+    --random-output-len 40 \
+    --random-range-ratio 0.2 \
+    --random-mm-base-items-per-request 2 \
+    --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
+    --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
+    --request-rate inf \
+    --ignore-eos \
+    --seed 42
+```
+
+The number of items per request can be controlled by passing multiple image buckets:
+
+```bash
+    --random-mm-base-items-per-request 2 \
+    --random-mm-num-mm-items-range-ratio 0.5 \
+    --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
+    --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
+```
+
+Flags specific to `random-mm`:
+
+- `--random-mm-base-items-per-request`: base number of multimodal items per request.
+- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
+- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
+- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
+
+Behavioral notes:
+
+- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
+
+How sampling works:
+
+- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
+- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
+- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
+  This should be seen as an edge case; it can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number, although that might then result in errors due to the engine config `--limit-mm-per-prompt`.
+- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
+
+</details>

 [](){ #performance-benchmarks }

@@ -13,22 +793,22 @@ The performance benchmarks are used for development to confirm whether new chang

 ### Manually Trigger the benchmark

-Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
+Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with the vLLM benchmark suite. For a CPU environment, please use the image with the "-cpu" postfix.

-Here is an example for docker run command for CPU.
+Here is an example docker run command for CPU:

 ```bash
 docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
 ```

-Then, run below command inside the docker instance.
+Then, run the command below inside the docker container:

 ```bash
 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```

-When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
+When run, the benchmark script generates results under the **benchmark/results** folder, along with benchmark_results.md and benchmark_results.json.
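+
+For example, assuming the default output location from above, a short snippet
+like the following can be used to peek at the JSON results (the schema varies
+across suite versions, so this just pretty-prints whatever was produced):
+
+```python
+# Sketch: inspect the generated results file; adjust the path to your run.
+import json
+from pathlib import Path
+
+data = json.loads(Path("benchmark/results/benchmark_results.json").read_text())
+print(json.dumps(data, indent=2))
+```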
#### Runtime environment variables From de3e53a75ba9f31f446926911b7c44561af3b2ee Mon Sep 17 00:00:00 2001 From: liangwen12year <36004580+liangwen12year@users.noreply.github.com> Date: Tue, 16 Sep 2025 08:53:40 -0400 Subject: [PATCH 316/584] feat: Add Grafana and Perces monitoring dashboards for vLLM (#23498) --- examples/online_serving/dashboards/README.md | 87 + .../dashboards/grafana/README.md | 59 + .../grafana/performance_statistics.json | 1405 +++++++++++++++++ .../dashboards/grafana/query_statistics.json | 760 +++++++++ .../dashboards/perses/README.md | 48 + .../perses/performance_statistics.yaml | 764 +++++++++ .../dashboards/perses/query_statistics.yaml | 392 +++++ 7 files changed, 3515 insertions(+) create mode 100644 examples/online_serving/dashboards/README.md create mode 100644 examples/online_serving/dashboards/grafana/README.md create mode 100644 examples/online_serving/dashboards/grafana/performance_statistics.json create mode 100644 examples/online_serving/dashboards/grafana/query_statistics.json create mode 100644 examples/online_serving/dashboards/perses/README.md create mode 100644 examples/online_serving/dashboards/perses/performance_statistics.yaml create mode 100644 examples/online_serving/dashboards/perses/query_statistics.yaml diff --git a/examples/online_serving/dashboards/README.md b/examples/online_serving/dashboards/README.md new file mode 100644 index 0000000000000..30cea6b24d57e --- /dev/null +++ b/examples/online_serving/dashboards/README.md @@ -0,0 +1,87 @@ +# Monitoring Dashboards + +This directory contains monitoring dashboard configurations for vLLM, providing +comprehensive observability for your vLLM deployments. + +## Dashboard Platforms + +We provide dashboards for two popular observability platforms: + +- **[Grafana](https://grafana.com)** +- **[Perses](https://perses.dev)** + +## Dashboard Format Approach + +All dashboards are provided in **native formats** that work across different +deployment methods: + +### Grafana (JSON) + +- ✅ Works with any Grafana instance (cloud, self-hosted, Docker) +- ✅ Direct import via Grafana UI or API +- ✅ Can be wrapped in Kubernetes operators when needed +- ✅ No vendor lock-in or deployment dependencies + +### Perses (YAML) + +- ✅ Works with standalone Perses instances +- ✅ Compatible with Perses API and CLI +- ✅ Supports Dashboard-as-Code workflows +- ✅ Can be wrapped in Kubernetes operators when needed + +## Dashboard Contents + +Both platforms provide equivalent monitoring capabilities: + +| Dashboard | Description | +|-----------|-------------| +| **Performance Statistics** | Tracks latency, throughput, and performance metrics | +| **Query Statistics** | Monitors request volume, query performance, and KPIs | + +## Quick Start + +First, navigate to this example's directory: + +```bash +cd examples/online_serving/dashboards +``` + +### Grafana + +Import the JSON directly into the Grafana UI, or use the API: + +```bash +curl -X POST http://grafana/api/dashboards/db \ + -H "Content-Type: application/json" \ + -d @grafana/performance_statistics.json +``` + +### Perses + +Import via the Perses CLI: + +```bash +percli apply -f perses/performance_statistics.yaml +``` + +## Requirements + +- **Prometheus** metrics from your vLLM deployment +- **Data source** configured in your monitoring platform +- **vLLM metrics** enabled and accessible + +## Platform-Specific Documentation + +For detailed deployment instructions and platform-specific options, see: + +- **[Grafana Documentation](./grafana)** - JSON dashboards, operator 
usage, manual import +- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping + +## Contributing + +When adding new dashboards, please: + +1. Provide native formats (JSON for Grafana, YAML specs for Perses) +2. Update platform-specific README files +3. Ensure dashboards work across deployment methods +4. Test with the latest platform versions diff --git a/examples/online_serving/dashboards/grafana/README.md b/examples/online_serving/dashboards/grafana/README.md new file mode 100644 index 0000000000000..e42b0f814367d --- /dev/null +++ b/examples/online_serving/dashboards/grafana/README.md @@ -0,0 +1,59 @@ +# Grafana Dashboards for vLLM Monitoring + +This directory contains Grafana dashboard configurations (as JSON) designed to monitor +vLLM performance and metrics. + +## Requirements + +- Grafana 8.0+ +- Prometheus data source configured in Grafana +- vLLM deployment with Prometheus metrics enabled + +## Dashboard Descriptions + +- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and + throughput for your vLLM service. +- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key + performance indicators for your vLLM service. + +## Deployment Options + +### Manual Import (Recommended) + +The easiest way to use these dashboards is to manually import the JSON configurations +directly into your Grafana instance: + +1. Navigate to your Grafana instance +2. Click the '+' icon in the sidebar +3. Select 'Import' +4. Copy and paste the JSON content from the dashboard files, or upload the JSON files + directly + +### Grafana Operator + +If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator) +in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom +resource: + +```yaml +# Note: Adjust the instanceSelector to match your Grafana instance's labels +# You can check with: kubectl get grafana -o yaml +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: vllm-performance-dashboard +spec: + instanceSelector: + matchLabels: + dashboards: grafana # Adjust to match your Grafana instance labels + folder: "vLLM Monitoring" + json: | + # Replace this comment with the complete JSON content from + # performance_statistics.json - The JSON should start with { and end with } +``` + +Then apply to your cluster: + +```bash +kubectl apply -f your-dashboard.yaml -n <namespace> +``` diff --git a/examples/online_serving/dashboards/grafana/performance_statistics.json b/examples/online_serving/dashboards/grafana/performance_statistics.json new file mode 100644 index 0000000000000..390d3dd6d2594 --- /dev/null +++ b/examples/online_serving/dashboards/grafana/performance_statistics.json @@ -0,0 +1,1405 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 26, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "Graph: E2E latency over time ", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "End-to-End latency of requests, showing average and key 
percentiles over time.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Latency", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__interval]) / rate(vllm:e2e_request_latency_seconds_count[$__interval])", + "format": "table", + "legendFormat": "E2E Latency", + "range": true, + "refId": "A" + } + ], + "title": "E2E Latency over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "99th percentile of End-to-End request latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P99", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "E2E Latency (P99)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "90th percentile of End-to-End request latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P90", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": 
"standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "E2E Latency (P90)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Average End-to-End request latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "Average", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "(sum(increase(vllm:e2e_request_latency_seconds_sum[$__range])) / sum(increase(vllm:e2e_request_latency_seconds_count[$__range])))", + "legendFormat": "Average E2E Latency", + "range": true, + "refId": "A" + } + ], + "title": "E2E Latency (Avg)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "50th percentile (median) of End-to-End request latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P50", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "E2E Latency (P50)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 8, + "panels": [], + "title": "Graph: TTFT(Time To First Token) over time ", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Time to first token (TTFT) latency, showing average and key percentiles over time.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Latency", + "axisPlacement": "auto", + "barAlignment": 
0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(vllm:time_to_first_token_seconds_sum[$__interval]) / rate(vllm:time_to_first_token_seconds_count[$__interval])", + "format": "table", + "legendFormat": "TTFT (Avg)", + "range": true, + "refId": "A" + } + ], + "title": "TTFT Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "99th percentile of Time To First Token latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P99", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__range])))", + "legendFormat": "TTFT (p99)", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (P99)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "90th percentile of Time To First Token latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P90", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) 
(rate(vllm:time_to_first_token_seconds_bucket[$__range])))", + "legendFormat": "TTFT (p90)", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (P90)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Average Time To First Token latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "Average", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 14 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "(sum(increase(vllm:time_to_first_token_seconds_sum[$__range])) / sum(increase(vllm:time_to_first_token_seconds_count[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (Avg)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "50th percentile (median) of Time To First Token latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "displayName": "P50", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 14 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (P50)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 7, + "panels": [], + "title": "ITL (Iteration Latency / Time Per Output Token) over time.", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Iteration latency, or average time taken to generate a single output token, with percentiles.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Latency", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 17, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto",
+ "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(vllm:time_per_output_token_seconds_sum[$__interval]) / rate(vllm:time_per_output_token_seconds_count[$__interval])", + "legendFormat": "ITL (Avg)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", + "hide": false, + "instant": false, + "legendFormat": "ITL (p50)", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", + "hide": false, + "instant": false, + "legendFormat": "ITL (p90)", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", + "hide": false, + "instant": false, + "legendFormat": "ITL (p99)", + "range": true, + "refId": "D" + } + ], + "title": "ITL (Time Per Output Token) Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "90th percentile of Iteration Latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 19 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ITL (P90)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "99th percentile of Iteration Latency over the selected time range.\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 19 + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ITL (P99)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Average Iteration Latency (time per output token) over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 23 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "(sum(increase(vllm:time_per_output_token_seconds_sum[$__range])) / sum(increase(vllm:time_per_output_token_seconds_count[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ITL (Avg)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "50th percentile (median) of Iteration Latency over the selected time range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 23 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "ITL (P50)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 6, + "panels": [], + "title": "TPS (Tokens Per Second)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Rate of tokens processed per second, including prompt and generation phases.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "tokens/sec (tps)" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(vllm:generation_tokens_total[$__interval])", + "legendFormat": "Generation TPS", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:prompt_tokens_total[$__interval])", + "hide": false, + "instant": false, + "legendFormat": "Prompt TPS", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:iteration_tokens_total_count[$__interval])", + "hide": false, + "instant": false, + "legendFormat": "Overall Iteration TPS", + "range": true, + "refId": "C" + } + ], + "title": "TPS (Tokens Per Second) Over Time", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "label": "datasource", + "query": "prometheus", + "refresh": 1, + "current": { + "text": "Prometheus", + "value": "prometheus" + } + }, + { + "current": { + "text": "avg : Average\n0.50 : P50\n0.90 : P90\n0.99 : P99\n0.999 : Max (Approx)", + "value": "avg : Average\n0.50 : P50\n0.90 : P90\n0.99 : P99\n0.999 : Max (Approx)" + }, + "label": "Aggregation", + "name": "agg_method", + "options": [ + { + "selected": true, + "text": "avg : Average\n0.50 : P50\n0.90 : P90\n0.99 : P99\n0.999 : Max (Approx)", + "value": "avg : Average\n0.50 : P50\n0.90 : P90\n0.99 : P99\n0.999 : Max (Approx)" + } + ], + "query": "avg : Average\n0.50 : P50\n0.90 : P90\n0.99 : P99\n0.999 : Max (Approx)", + "type": "custom" + }, + { + "current": { + "text": [ + "granite-33-2b-instruct" + ], + "value": [ + "granite-33-2b-instruct" + ] + }, + "definition": "label_values(vllm:generation_tokens_total,model_name)", + "includeAll": true, + "label": "Deployment_ID", + "multi": true, + "name": "Deployment_id", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(vllm:generation_tokens_total,model_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timezone": "browser", + "uid": "performance-statistics", + "title": "Performance Statistics", + "version": 40, + "weekStart": "" +} \ No newline at end of file diff --git 
a/examples/online_serving/dashboards/grafana/query_statistics.json b/examples/online_serving/dashboards/grafana/query_statistics.json new file mode 100644 index 0000000000000..880f6c5d71764 --- /dev/null +++ b/examples/online_serving/dashboards/grafana/query_statistics.json @@ -0,0 +1,760 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "High-level overview of vLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 47, + "links": [], + "panels": [ + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 20, + "panels": [], + "title": "Request Over Time", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "req/s" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 }, + "id": 1, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "editorMode": "code", + "expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)", + "interval": "1", + "legendFormat": "{{model_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Successful Requests Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "req/s" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": 
true, + "refId": "A" + } + ], + "title": "Requests Avg Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "p50 Latency", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "p90 Latency", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "p99 Latency", + "type": "stat" + }, + { + 
"collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, + "id": 19, + "panels": [], + "title": "Size Distribution", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineWidth": 1, + "stacking": { "group": "A", "mode": "none" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "{{model_name}} le={{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Input Token Size Distribution", + "type": "histogram" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "calculation ": { "index": 0, "text": "Last (not null)" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Input Token Size p90", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Input Token Size 
p50", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "Calcultaion": { "index": 0, "text": "mean" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Input Token Size Avg", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Input Token Size p99", + "type": "stat" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 18, + "panels": [], + "title": "Input Token Over Time", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 }, + "id": 11, + "options": { + 
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "{{model_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Input Tokens Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Input Tokens/Sec Avg", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 17, + "panels": [], + "title": "Output Token Over Time", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 }, + "id": 13, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "{{model_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Output Tokens Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": 
"green", "value": null }, { "color": "red", "value": 80 }] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Output Tokens/Sec Avg", + "type": "stat" + } + ], + "preload": false, + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" }, + "label": "datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { "text": ["All"], "value": ["$__all"] }, + "definition": "label_values(vllm:request_success_total,model_name)", + "includeAll": true, + "label": "Deployment_ID", + "multi": true, + "name": "Deployment_id", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(vllm:request_success_total,model_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { "text": "All hours", "value": "All hours" }, + "hide": 2, + "label": "Rush Hours Only", + "name": "rush_hours", + "options": [ + { "selected": true, "text": "false", "value": "All hours" }, + { "selected": false, "text": "true", "value": "Rush hours" } + ], + "query": "false : All hours, true : Rush hours", + "type": "custom" + }, + { + "current": { "text": "All", "value": "All" }, + "hide": 2, + "label": "Rush Hours Type", + "name": "rush_hours_type", + "options": [ + { "selected": true, "text": "^All__.*$", "value": "All" }, + { "selected": false, "text": "^Static__.*$", "value": "Static" }, + { "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" } + ], + "query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic", + "type": "custom" + }, + { + "current": { "text": "", "value": "" }, + "hide": 2, + "name": "query0", + "options": [], + "query": "", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { "from": "now-12h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Query Statistics_New4", + "uid": "query-statistics4", + "version": 2, + "weekStart": "" +} + diff --git a/examples/online_serving/dashboards/perses/README.md b/examples/online_serving/dashboards/perses/README.md new file mode 100644 index 0000000000000..ae04fd17b1b9d --- /dev/null +++ b/examples/online_serving/dashboards/perses/README.md @@ -0,0 +1,48 @@ +# Perses Dashboards for vLLM Monitoring + +This directory contains Perses dashboard configurations designed to monitor vLLM +performance and metrics. 
+ +## Requirements + +- Perses instance (standalone or via operator) +- Prometheus data source configured in Perses +- vLLM deployment with Prometheus metrics enabled + +## Dashboard Format + +We provide dashboards in the **native Perses YAML format** that works across all +deployment methods: + +- **Files**: `*.yaml` (native Perses dashboard specifications) +- **Format**: Pure dashboard specifications that work everywhere +- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning +- **Kubernetes**: Directly compatible with Perses Operator + +## Dashboard Descriptions + +- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency + statistics +- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics + +## Deployment Options + +### Direct Import to Perses + +Import the dashboard specifications via Perses API or CLI: + +```bash +percli apply -f performance_statistics.yaml +``` + +### Perses Operator (Kubernetes) + +The native YAML format works directly with the Perses Operator: + +```bash +kubectl apply -f performance_statistics.yaml -n <namespace> +``` + +### File Provisioning + +Place the YAML files in a Perses provisioning folder for automatic loading. diff --git a/examples/online_serving/dashboards/perses/performance_statistics.yaml b/examples/online_serving/dashboards/perses/performance_statistics.yaml new file mode 100644 index 0000000000000..2e8d24c3324b9 --- /dev/null +++ b/examples/online_serving/dashboards/perses/performance_statistics.yaml @@ -0,0 +1,764 @@ +kind: PersesDashboard +metadata: + name: performance-statistics + createdAt: 0001-01-01T00:00:00Z + updatedAt: 0001-01-01T00:00:00Z + version: 0 + project: "" +spec: + display: + name: Performance Statistics + + variables: + - kind: ListVariable + spec: + display: + name: Deployment_ID + hidden: false + name: Deployment_id + allowAllValue: true + allowMultiple: true + defaultValue: + - $__all + sort: alphabetical-asc + plugin: + kind: PrometheusLabelValuesVariable + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + labelName: model_name + matchers: + # Any one vllm metric that always carries model_name + - vllm:generation_tokens_total{} + + panels: + "1": + kind: Panel + spec: + display: + name: E2E Latency over Time + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + # avg latency by model = sum(rate(sum)) / sum(rate(count)) + query: > + sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval])) + / + sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval])) + seriesNameFormat: '{{model_name}}' + + "2": + kind: Panel + spec: + display: + name: E2E Latency (Avg) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range]))) + / + (sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range]))) + 
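+    # NOTE: the percentile stat panels "3"-"5" below (and the TTFT/ITL stats + # further down) all share one PromQL shape: a histogram_quantile over + # per-model bucket rates, e.g. + #   histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]))) + 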
+ "3": + kind: Panel + spec: + display: + name: E2E Latency (P50) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.50, + sum by (le, model_name) ( + rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "4": + kind: Panel + spec: + display: + name: E2E Latency (P90) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.90, + sum by (le, model_name) ( + rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "5": + kind: Panel + spec: + display: + name: E2E Latency (P99) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.99, + sum by (le, model_name) ( + rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "6": + kind: Panel + spec: + display: + name: TTFT over Time + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval])) + / + sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval])) + seriesNameFormat: '{{model_name}}' + + "7": + kind: Panel + spec: + display: + name: TTFT (Avg) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + (sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range]))) + / + (sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range]))) + + "8": + kind: Panel + spec: + display: + name: TTFT (P50) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.50, + sum by (le, model_name) ( + rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "9": + kind: Panel + spec: + display: + name: TTFT (P90) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.90, + sum by (le, model_name) ( + 
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "10": + kind: Panel + spec: + display: + name: TTFT (P99) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.99, + sum by (le, model_name) ( + rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "11": + kind: Panel + spec: + display: + name: ITL (Time per Output Token) over Time + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval])) + / + sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval])) + seriesNameFormat: '{{model_name}}' + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.50, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + seriesNameFormat: '{{model_name}} p50' + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.90, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + seriesNameFormat: '{{model_name}} p90' + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.99, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + seriesNameFormat: '{{model_name}} p99' + + "12": + kind: Panel + spec: + display: + name: ITL (Avg) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + (sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range]))) + / + (sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range]))) + + "13": + kind: Panel + spec: + display: + name: ITL (P50) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.50, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "14": + kind: Panel + spec: + display: + name: ITL (P90) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - 
kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.90, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "15": + kind: Panel + spec: + display: + name: ITL (P99) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + histogram_quantile( + 0.99, + sum by (le, model_name) ( + rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) + ) + ) + + "16": + kind: Panel + spec: + display: + name: TPS (Tokens/sec) over Time + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval])) + seriesNameFormat: '{{model_name}} generation' + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval])) + seriesNameFormat: '{{model_name}} prompt' + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + # overall iteration tokens/sec if exposed + query: > + rate(vllm:iteration_tokens_total_count[$__interval]) + seriesNameFormat: 'iteration overall' + + "17": + kind: Panel + spec: + display: + name: KV Cache Usage (avg %) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts) + query: > + 100 * avg(vllm:gpu_cache_usage_perc) + + "18": + kind: Panel + spec: + display: + name: Running Requests by Pod + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (pod) (vllm:num_requests_running) + seriesNameFormat: '{{pod}}' + + "19": + kind: Panel + spec: + display: + name: Waiting Requests by Pod + plugin: + kind: TimeSeriesChart + spec: + legend: + mode: table + position: bottom + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: > + sum by (pod) (vllm:num_requests_waiting) + seriesNameFormat: '{{pod}}' + + "20": + kind: Panel + spec: + display: + name: Running Requests (sum) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: 
PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: sum(vllm:num_requests_running) + + "21": + kind: Panel + spec: + display: + name: Waiting Requests (sum) + plugin: + kind: StatChart + spec: + calculation: last-number + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: + kind: PrometheusDatasource + name: accelerators-thanos-querier-datasource + query: sum(vllm:num_requests_waiting) + + layouts: + - kind: Grid + spec: + display: + title: Overview + items: + - x: 0 + y: 0 + width: 6 + height: 3 + content: { $ref: '#/spec/panels/17' } # KV cache % + - x: 6 + y: 0 + width: 6 + height: 3 + content: { $ref: '#/spec/panels/20' } # running sum + - x: 12 + y: 0 + width: 6 + height: 3 + content: { $ref: '#/spec/panels/21' } # waiting sum + + - kind: Grid + spec: + display: + title: E2E Latency + items: + - x: 0 + y: 1 + width: 10 + height: 6 + content: { $ref: '#/spec/panels/1' } + - x: 10 + y: 1 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/2' } + - x: 17 + y: 1 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/3' } + - x: 10 + y: 4 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/4' } + - x: 17 + y: 4 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/5' } + + - kind: Grid + spec: + display: + title: TTFT + items: + - x: 0 + y: 8 + width: 10 + height: 6 + content: { $ref: '#/spec/panels/6' } + - x: 10 + y: 8 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/7' } + - x: 17 + y: 8 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/8' } + - x: 10 + y: 11 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/9' } + - x: 17 + y: 11 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/10' } + + - kind: Grid + spec: + display: + title: ITL (Time per Output Token) + items: + - x: 0 + y: 15 + width: 10 + height: 6 + content: { $ref: '#/spec/panels/11' } + - x: 10 + y: 15 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/12' } + - x: 17 + y: 15 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/13' } + - x: 10 + y: 18 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/14' } + - x: 17 + y: 18 + width: 7 + height: 3 + content: { $ref: '#/spec/panels/15' } + + - kind: Grid + spec: + display: + title: TPS (Prompt / Generation / Iteration) + items: + - x: 0 + y: 22 + width: 14 + height: 6 + content: { $ref: '#/spec/panels/16' } + + - kind: Grid + spec: + display: + title: Per-Pod Request State + items: + - x: 0 + y: 28 + width: 12 + height: 6 + content: { $ref: '#/spec/panels/18' } + - x: 12 + y: 28 + width: 12 + height: 6 + content: { $ref: '#/spec/panels/19' } + diff --git a/examples/online_serving/dashboards/perses/query_statistics.yaml b/examples/online_serving/dashboards/perses/query_statistics.yaml new file mode 100644 index 0000000000000..28109aae81511 --- /dev/null +++ b/examples/online_serving/dashboards/perses/query_statistics.yaml @@ -0,0 +1,392 @@ +kind: PersesDashboard +metadata: + name: query-statistics + createdAt: 0001-01-01T00:00:00Z + updatedAt: 0001-01-01T00:00:00Z + version: 0 + project: "" +spec: + display: + name: Query Statistics_New + + variables: + - kind: ListVariable + spec: + name: NS + display: { name: Namespace } + allowMultiple: false + defaultValue: llm-d + plugin: + kind: PrometheusLabelValuesVariable + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + labelName: namespace + matchers: + - up{service=~".*vllm.*"} + + - 
kind: ListVariable + spec: + name: SVC + display: { name: Service } + allowMultiple: false + defaultValue: vllm-qwen2-0-5b-sim + plugin: + kind: PrometheusLabelValuesVariable + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + labelName: service + matchers: + - up{namespace="$NS",service=~".*vllm.*"} + + - kind: ListVariable + spec: + name: MODEL + display: { name: Model (real vLLM) } + allowAllValue: true + allowMultiple: true + defaultValue: ["$__all"] + plugin: + kind: PrometheusLabelValuesVariable + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + labelName: model_name + matchers: + - vllm:request_success_total{namespace="$NS",service="$SVC"} + + panels: + + # --- Core (works on Simulator & Real) --- + core_running_now: + kind: Panel + spec: + display: { name: Running Requests (now) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + core_waiting_now: + kind: Panel + spec: + display: { name: Waiting Requests (now) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + core_kv_usage_now: + kind: Panel + spec: + display: { name: KV Cache Usage (0–1) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + core_running_ts: + kind: Panel + spec: + display: { name: Running Over Time } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + core_waiting_ts: + kind: Panel + spec: + display: { name: Waiting Over Time } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + core_targets_up: + kind: Panel + spec: + display: { name: Scrape Targets Up } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: 
count(up{namespace="$NS",service="$SVC"} == 1) or vector(0) + minStep: "15s" + + # --- KV Cache as Percent (works on Simulator & Real) --- + core_kv_usage_pct_now: + kind: Panel + spec: + display: { name: KV Cache Usage (%) – now } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + # multiply by 100 to present percentage; omit format.unit to avoid schema conflicts + query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + minStep: "15s" + + core_kv_usage_pct_ts: + kind: Panel + spec: + display: { name: KV Cache Usage (%) – over time } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + minStep: "15s" + + # --- Per-Pod breakdowns (works on Simulator & Real) --- + per_pod_running_ts: + kind: Panel + spec: + display: { name: Running by Pod } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + per_pod_waiting_ts: + kind: Panel + spec: + display: { name: Waiting by Pod } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0) + minStep: "15s" + + per_pod_kv_pct_ts: + kind: Panel + spec: + display: { name: KV Cache (%) by Pod } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + # if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty + query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + minStep: "15s" + + # --- Real vLLM only (zeros on simulator) --- + real_req_rate_ts: + kind: Panel + spec: + display: { name: Request Rate (real vLLM) } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (model_name) 
(rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0) + minStep: "15s" + + real_p50: + kind: Panel + spec: + display: { name: p50 Latency (real vLLM) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0) + minStep: "15s" + + real_p90: + kind: Panel + spec: + display: { name: p90 Latency (real vLLM) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0) + minStep: "15s" + + real_p99: + kind: Panel + spec: + display: { name: p99 Latency (real vLLM) } + plugin: { kind: StatChart, spec: { calculation: last-number } } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0) + minStep: "15s" + + real_input_tokens_ts: + kind: Panel + spec: + display: { name: Input Tokens / sec (real vLLM) } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0) + minStep: "15s" + + real_output_tokens_ts: + kind: Panel + spec: + display: { name: Output Tokens / sec (real vLLM) } + plugin: + kind: TimeSeriesChart + spec: + legend: { mode: table, position: bottom } + visual: { display: line, lineWidth: 1, areaOpacity: 0.3 } + queries: + - kind: TimeSeriesQuery + spec: + plugin: + kind: PrometheusTimeSeriesQuery + spec: + datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } + query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0) + minStep: "15s" + + layouts: + - kind: Grid + spec: + display: { title: Core (Sim & Real) } + items: + - { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } } + - { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } } + - { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } } + - { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } } + - { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } } + - { x: 12, y: 3, width: 12, height: 6, content: { $ref: 
'#/spec/panels/core_waiting_ts' } } + + - kind: Grid + spec: + display: { title: KV Cache (%) } + items: + - { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } } + - { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } } + + - kind: Grid + spec: + display: { title: Per-Pod breakdowns } + items: + - { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } } + - { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } } + - { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } } + + - kind: Grid + spec: + display: { title: Real vLLM only (shows 0 on simulator) } + items: + - { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } } + - { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } } + - { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } } + - { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } } + - { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } } + - { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } } + From e4f0b4cd96a9a729399759d9a54a861aacf57c18 Mon Sep 17 00:00:00 2001 From: TeeKen Lau <13831887+teekenl@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:08:46 +1000 Subject: [PATCH 317/584] (doc): set cmake c++ compatible standard when building on MacOS CPU. (#23483) Signed-off-by: teekenl <teekenlau@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../installation/cpu/apple.inc.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 124a41adf1ae2..7e2ed55008a57 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -52,6 +52,24 @@ uv pip install -e . 1 error generated. ``` + --- + + If the build fails with C++11/C++17 compatibility errors like the following, the issue is that the build system is defaulting to an older C++ standard: + + ```text + [...] error: 'constexpr' is not a type + [...] error: expected ';' before 'constexpr' + [...] error: 'constexpr' does not name a type + ``` + + **Solution**: Your compiler might be using an older C++ standard. Edit `cmake/cpu_extension.cmake` and add `set(CMAKE_CXX_STANDARD 17)` before `set(CMAKE_CXX_STANDARD_REQUIRED ON)`. 
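+
+ For reference, a minimal sketch of the edited section of `cmake/cpu_extension.cmake` after the change (the exact surrounding lines are illustrative and may differ between vLLM versions):
+
+ ```cmake
+ # Pin a C++17-capable standard before marking the standard as required.
+ set(CMAKE_CXX_STANDARD 17)
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
+ ```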
+ + To check your compiler's C++ standard support: + ```bash + clang++ -std=c++17 -pedantic -dM -E -x c++ /dev/null | grep __cplusplus + ``` + On Apple Clang 16 you should see: `#define __cplusplus 201703L` + # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] From 4e5affeaa14220b5699c667e6c3ee5402b4284ef Mon Sep 17 00:00:00 2001 From: Ming Yang <minos.future@gmail.com> Date: Tue, 16 Sep 2025 06:21:28 -0700 Subject: [PATCH 318/584] [CI] Add Decode Context Parallelism (DCP) test to CI (#24487) Signed-off-by: Ming Yang <minos.future@gmail.com> --- .buildkite/test-pipeline.yaml | 17 ++++++++++++++--- tests/distributed/test_context_parallel.py | 11 +++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df95fcaa04382..f0fd808fd6dce 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -946,7 +946,6 @@ steps: commands: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py - # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support - label: LoRA TP Test (Distributed) # 17 min timeout_in_minutes: 30 @@ -1020,9 +1019,21 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: Qwen MoE EP Test # optional +##### H200 test ##### +- label: Distrubted Tests (H200) # optional gpu: h200 optional: true + working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 23be703a30682..11685bc90c413 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -71,12 +71,13 @@ class CPTestSettings: parallel_setups = [] for eager_mode_val in [False]: for pp_multiplier in [1]: - for dcp_multiplier in [2, 4]: + for dcp_multiplier in [0.5, 1]: for chunked_prefill_val in [True]: parallel_setups.append( ParallelSetup(tp_size=tp_base, pp_size=pp_multiplier * pp_base, - dcp_size=dcp_multiplier * dcp_base, + dcp_size=int(dcp_multiplier * + tp_base), eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val)) return CPTestSettings( @@ -223,7 +224,9 @@ def _compare_cp_with_tp( CP_TEXT_GENERATION_MODELS = { # [MLA attention only] - "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(), + "deepseek-ai/DeepSeek-V2-Lite-Chat": + [CPTestSettings.detailed(), + CPTestSettings.detailed(tp_base=2)], } CP_TEST_MODELS = [ @@ -238,7 +241,7 @@ CP_TEST_MODELS = [ "runner", "test_options"), [ params for model_id, settings in CP_TEXT_GENERATION_MODELS.items() - for 
params in settings.iter_params(model_id) + for setting in settings for params in setting.iter_params(model_id) if model_id in CP_TEST_MODELS ], ) From 73cfb3c5eeb8b00a6e222751a28fd89a5f6229dc Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Tue, 16 Sep 2025 10:53:43 -0400 Subject: [PATCH 319/584] [Model] Clean up and simplify Mamba2 Metadata Usage in both V0 and V1 (#24331) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- .../layers/mamba/mamba2_metadata.py | 62 +++++++------------ .../layers/mamba/mamba_mixer2.py | 29 +++------ vllm/model_executor/models/plamo2.py | 29 ++++----- 3 files changed, 44 insertions(+), 76 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 3256ac034aa11..368bfe3af1d3f 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -17,14 +17,13 @@ from vllm.v1.attention.backends.mamba2_attn import ( @dataclass class Mamba2Metadata: - - has_initial_states: torch.Tensor prep_initial_states: bool - chunk_size: int - seq_idx: torch.Tensor - chunk_indices: torch.Tensor - chunk_offsets: torch.Tensor + + has_initial_states_p: torch.Tensor + seq_idx_p: torch.Tensor + chunk_indices_p: torch.Tensor + chunk_offsets_p: torch.Tensor """ With continuous batching layout of `x` in vLLM, to enable a Triton program to handle a request in parallel, two supporting tensors are used @@ -68,7 +67,6 @@ def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: def prepare_mamba2_metadata( chunk_size: int, attn_metadata: AttentionMetadata, - mamba2_metadata=None, ) -> Mamba2Metadata: # compute number of prefill and decode requests @@ -76,11 +74,11 @@ def prepare_mamba2_metadata( num_prefills = attn_metadata.num_prefills num_prefill_tokens = attn_metadata.num_prefill_tokens - seq_idx = None - chunk_indices, chunk_offsets = None, None + seq_idx_p = None + chunk_indices_p, chunk_offsets_p = None, None # Need flags to indicate if there are initial states # currently we really only support the FlashAttention backend - has_initial_states = None + has_initial_states_p = None prep_initial_states = False # Compute seq_idx, chunk_indices and chunk_offsets for prefill only @@ -91,44 +89,30 @@ def prepare_mamba2_metadata( # precompute flag to avoid device syncs later in mamba2 layer # forwards # prep is only needed for mamba2 ssd prefill processing - has_initial_states = attn_metadata.context_lens_tensor > 0 - prep_initial_states = torch.any( - has_initial_states[:num_prefills]).item() - query_start_loc = attn_metadata.query_start_loc[:num_prefills + 1] - seq_idx = torch.repeat_interleave(torch.arange( - num_prefills, dtype=torch.int32, device=query_start_loc.device), - query_start_loc.diff(), - output_size=num_prefill_tokens) - seq_idx.unsqueeze_(0) + has_initial_states_p = ( + attn_metadata.context_lens_tensor[:num_prefills] > 0) + prep_initial_states = torch.any(has_initial_states_p).item() + query_start_loc_p = attn_metadata.query_start_loc[:num_prefills + 1] + seq_idx_p = torch.repeat_interleave(torch.arange( + num_prefills, dtype=torch.int32, device=query_start_loc_p.device), + query_start_loc_p.diff(), + output_size=num_prefill_tokens) + seq_idx_p.unsqueeze_(0) # We compute metadata for chunked prefill once at the top level model # forward and reuse them in mamba layers. If not needed, they will be # ignored inside mamba kernels. 
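        # Illustrative example of the seq_idx_p computation above (not part
        # of the original change), assuming two prefill requests of lengths
        # 4 and 3:
        #   >>> qsl = torch.tensor([0, 4, 7], dtype=torch.int32)
        #   >>> torch.repeat_interleave(torch.arange(2, dtype=torch.int32),
        #   ...                         qsl.diff(), output_size=7)
        #   tensor([0, 0, 0, 0, 1, 1, 1], dtype=torch.int32)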
if prep_initial_states: - chunk_indices, chunk_offsets = \ + chunk_indices_p, chunk_offsets_p = \ _query_start_loc_to_chunk_indices_offsets( - query_start_loc, chunk_size, num_prefill_tokens) + query_start_loc_p, chunk_size, num_prefill_tokens) - if mamba2_metadata is not None: - mamba2_metadata.has_initial_states = has_initial_states - mamba2_metadata.prep_initial_states = prep_initial_states - mamba2_metadata.chunk_size = chunk_size - mamba2_metadata.seq_idx = seq_idx - mamba2_metadata.chunk_indices = chunk_indices - mamba2_metadata.chunk_offsets = chunk_offsets - # We use 1 reset flag: - # * mamba2_metadata.cu_seqlen is None - # update config specific to (each input) - # (become available at first layer, e.g. conv_weights) - mamba2_metadata.cu_seqlen = None # suppose to be updated at each input - - return mamba2_metadata - return Mamba2Metadata(has_initial_states=has_initial_states, + return Mamba2Metadata(has_initial_states_p=has_initial_states_p, prep_initial_states=prep_initial_states, chunk_size=chunk_size, - seq_idx=seq_idx, - chunk_indices=chunk_indices, - chunk_offsets=chunk_offsets) + seq_idx_p=seq_idx_p, + chunk_indices_p=chunk_indices_p, + chunk_offsets_p=chunk_offsets_p) def update_metadata(x: torch.Tensor, query_start_loc: torch.Tensor, diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 23e19da430e14..02e6a9138c05f 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -518,22 +518,19 @@ class MambaMixer2(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] state_indices_tensor = attn_metadata.state_indices_tensor - has_initial_states_p = attn_metadata.has_initial_states_p - prep_initial_states = attn_metadata.prep_initial_states - chunk_size = attn_metadata.chunk_size - seq_idx_p = attn_metadata.seq_idx_p - chunk_indices_p = attn_metadata.chunk_indices_p - chunk_offsets_p = attn_metadata.chunk_offsets_p else: conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state state_indices_tensor = mamba_cache_params.state_indices_tensor - has_initial_states_p = mamba2_metadata.has_initial_states + + # Common members between V1 metadata and V0 metadata + if mamba2_metadata is not None: + has_initial_states_p = mamba2_metadata.has_initial_states_p prep_initial_states = mamba2_metadata.prep_initial_states chunk_size = mamba2_metadata.chunk_size - seq_idx_p = mamba2_metadata.seq_idx - chunk_indices_p = mamba2_metadata.chunk_indices - chunk_offsets_p = mamba2_metadata.chunk_offsets + seq_idx_p = mamba2_metadata.seq_idx_p + chunk_indices_p = mamba2_metadata.chunk_indices_p + chunk_offsets_p = mamba2_metadata.chunk_offsets_p # 1. Gated MLP's linear projection projected_states, _ = self.in_proj(hidden_states) @@ -677,15 +674,9 @@ class MambaMixer2(MambaBase, CustomOp): # 3. 
State Space Model sequence transformation initial_states = None if (has_initial_states_p is not None and prep_initial_states): - # making a copy of the states - if envs.VLLM_USE_V1: - initial_states = torch.where( - has_initial_states_p[:, None, None, None], - ssm_state[state_indices_tensor_p], 0) - else: - initial_states = torch.where( - has_initial_states_p[:num_prefills, None, None, None], - ssm_state[state_indices_tensor_p], 0) + initial_states = torch.where( + has_initial_states_p[:, None, None, None], + ssm_state[state_indices_tensor_p], 0) # NOTE: final output is an in-place update of out tensor varlen_state = mamba_chunk_scan_combined( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index b9869f5e58800..ef96d272adfb5 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -279,22 +279,19 @@ class Plamo2MambaMixer(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] state_indices_tensor = attn_metadata.state_indices_tensor - has_initial_states_p = attn_metadata.has_initial_states_p - prep_initial_states = attn_metadata.prep_initial_states - chunk_size = attn_metadata.chunk_size - seq_idx_p = attn_metadata.seq_idx_p - chunk_indices_p = attn_metadata.chunk_indices_p - chunk_offsets_p = attn_metadata.chunk_offsets_p else: conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state state_indices_tensor = mamba_cache_params.state_indices_tensor - has_initial_states_p = mamba2_metadata.has_initial_states + + # Common members between V1 metadata and V0 metadata + if mamba2_metadata is not None: + has_initial_states_p = mamba2_metadata.has_initial_states_p prep_initial_states = mamba2_metadata.prep_initial_states chunk_size = mamba2_metadata.chunk_size - seq_idx_p = mamba2_metadata.seq_idx - chunk_indices_p = mamba2_metadata.chunk_indices - chunk_offsets_p = mamba2_metadata.chunk_offsets + seq_idx_p = mamba2_metadata.seq_idx_p + chunk_indices_p = mamba2_metadata.chunk_indices_p + chunk_offsets_p = mamba2_metadata.chunk_offsets_p # 1. 
Gated MLP's linear projection projected_states = self.in_proj(hidden_states) @@ -414,14 +411,10 @@ class Plamo2MambaMixer(MambaBase, CustomOp): initial_states = None if has_initial_states_p is not None and prep_initial_states: # making a copy of the states - if envs.VLLM_USE_V1: - initial_states = torch.where( - has_initial_states_p[:, None, None, None], - ssm_state[state_indices_tensor_p], 0) - else: - initial_states = torch.where( - has_initial_states_p[:num_prefills, None, None, None], - ssm_state[state_indices_tensor_p], 0) + initial_states = torch.where( + has_initial_states_p[:, None, None, None], + ssm_state[state_indices_tensor_p], 0) + varlen_state = mamba_chunk_scan_combined( hidden_states_p.view(1, num_prefill_tokens, self.num_heads // self.tp_size, From 08369289af49229930c61a76ec65b127ad3fba95 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Tue, 16 Sep 2025 16:32:47 +0100 Subject: [PATCH 320/584] [Core][MultiModalHasher] Don't convert memoryviews to bytes during hashing (#24925) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/multimodal/hasher.py | 47 +++++++++++++++------------------------ 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index da019d40a6fe4..0fb1363ce471a 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -20,22 +20,22 @@ logger = init_logger(__name__) class MultiModalHasher: @classmethod - def serialize_item(cls, obj: object) -> Union[bytes, memoryview]: + def serialize_item(cls, obj: object) -> Iterable[Union[bytes, memoryview]]: # Simple cases - if isinstance(obj, str): - return obj.encode("utf-8") if isinstance(obj, (bytes, memoryview)): - return obj + return (obj, ) + if isinstance(obj, str): + return (obj.encode("utf-8"), ) if isinstance(obj, (int, float)): - return np.array(obj).tobytes() + return (np.array(obj).tobytes(), ) if isinstance(obj, Image.Image): exif = obj.getexif() if Image.ExifTags.Base.ImageID in exif and isinstance( exif[Image.ExifTags.Base.ImageID], uuid.UUID): # If the image has exif ImageID tag, use that - return exif[Image.ExifTags.Base.ImageID].bytes - return cls.item_to_bytes( + return (exif[Image.ExifTags.Base.ImageID].bytes, ) + return cls.iter_item_to_bytes( "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): tensor_obj: torch.Tensor = obj.cpu() @@ -49,43 +49,34 @@ class MultiModalHasher: tensor_obj = tensor_obj.view( (tensor_obj.numel(), )).view(torch.uint8) - return cls.item_to_bytes( + return cls.iter_item_to_bytes( "tensor", { "original_dtype": str(tensor_dtype), "original_shape": tuple(tensor_shape), "data": tensor_obj.numpy(), }) - - return cls.item_to_bytes("tensor", tensor_obj.numpy()) + return cls.iter_item_to_bytes("tensor", tensor_obj.numpy()) if isinstance(obj, np.ndarray): # If the array is non-contiguous, we need to copy it first - arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes() - return cls.item_to_bytes("ndarray", { + arr_data = obj.view( + np.uint8).data if obj.flags.c_contiguous else obj.tobytes() + return cls.iter_item_to_bytes("ndarray", { "dtype": obj.dtype.str, "shape": obj.shape, "data": arr_data, }) - logger.warning( "No serialization method found for %s. 
" "Falling back to pickle.", type(obj)) - return pickle.dumps(obj) - - @classmethod - def item_to_bytes( - cls, - key: str, - obj: object, - ) -> bytes: - return b''.join(kb + vb for kb, vb in cls.iter_item_to_bytes(key, obj)) + return (pickle.dumps(obj), ) @classmethod def iter_item_to_bytes( cls, key: str, obj: object, - ) -> Iterable[tuple[bytes, Union[bytes, memoryview]]]: + ) -> Iterable[Union[bytes, memoryview]]: # Recursive cases if isinstance(obj, (list, tuple)): for i, elem in enumerate(obj): @@ -94,17 +85,15 @@ class MultiModalHasher: for k, v in obj.items(): yield from cls.iter_item_to_bytes(f"{key}.{k}", v) else: - key_bytes = key.encode("utf-8") - value_bytes = cls.serialize_item(obj) - yield key_bytes, value_bytes + yield key.encode("utf-8") + yield from cls.serialize_item(obj) @classmethod def hash_kwargs(cls, **kwargs: object) -> str: hasher = blake3() for k, v in kwargs.items(): - for k_bytes, v_bytes in cls.iter_item_to_bytes(k, v): - hasher.update(k_bytes) - hasher.update(v_bytes) + for bytes_ in cls.iter_item_to_bytes(k, v): + hasher.update(bytes_) return hasher.hexdigest() From 567939953b7a9cb0ded6bf0bb21a76917b8fed97 Mon Sep 17 00:00:00 2001 From: Sage Moore <sage@neuralmagic.com> Date: Tue, 16 Sep 2025 09:21:48 -0700 Subject: [PATCH 321/584] [Core/DBO][1/N] Add Dual-Batch Overlap mechanism to VLLM (#23693) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Sage Moore <sage@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- examples/offline_inference/data_parallel.py | 8 + .../v1/attention/test_attention_splitting.py | 10 +- tests/v1/spec_decode/test_eagle.py | 8 +- vllm/config/__init__.py | 8 + vllm/config/parallel.py | 8 + .../device_communicators/all2all.py | 5 - vllm/engine/arg_utils.py | 10 + vllm/forward_context.py | 121 +++++-- .../fused_moe/deepep_ht_prepare_finalize.py | 26 +- .../fused_moe/deepep_ll_prepare_finalize.py | 45 ++- vllm/model_executor/layers/fused_moe/layer.py | 58 +++- .../layers/fused_moe/modular_kernel.py | 63 +++- .../layers/fused_moe/pplx_prepare_finalize.py | 47 ++- vllm/v1/attention/backends/utils.py | 18 +- vllm/v1/spec_decode/eagle.py | 12 +- vllm/v1/worker/cpu_model_runner.py | 9 +- vllm/v1/worker/gpu_model_runner.py | 275 +++++++++++++--- vllm/v1/worker/gpu_ubatch_wrapper.py | 303 ++++++++++++++++++ vllm/v1/worker/ubatch_splitting.py | 155 +++++++++ vllm/v1/worker/ubatch_utils.py | 19 ++ vllm/v1/worker/ubatching.py | 211 ++++++++++++ vllm/v1/worker/utils.py | 10 +- 22 files changed, 1257 insertions(+), 172 deletions(-) create mode 100644 vllm/v1/worker/gpu_ubatch_wrapper.py create mode 100644 vllm/v1/worker/ubatch_splitting.py create mode 100644 vllm/v1/worker/ubatch_utils.py create mode 100644 vllm/v1/worker/ubatching.py diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 36d805a32db7a..98fe36d0fb796 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -87,6 +87,11 @@ def parse_args(): default=0.8, help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), ) + parser.add_argument( + 
"--enable-dbo", + action="store_true", + help=("Enable microbatched execution"), + ) parser.add_argument( "--compilation-config", type=int, @@ -113,6 +118,7 @@ def main( max_model_len, compilation_config, gpu_memory_utilization, + enable_dbo, quantization, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) @@ -167,6 +173,7 @@ def main( max_num_seqs=max_num_seqs, max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, + enable_dbo=enable_dbo, quantization=quantization, compilation_config=compilation_config, ) @@ -227,6 +234,7 @@ if __name__ == "__main__": args.max_model_len, args.compilation_config, args.gpu_memory_utilization, + args.enable_dbo, args.quantization, ), ) diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py index 3fc1011d5042e..c74dbb3ebb17e 100644 --- a/tests/v1/attention/test_attention_splitting.py +++ b/tests/v1/attention/test_attention_splitting.py @@ -6,7 +6,7 @@ import torch from tests.v1.attention.test_attention_backends import BATCH_SPECS from tests.v1.attention.utils import create_common_attn_metadata -from vllm.v1.attention.backends.utils import (UbatchSlice, +from vllm.v1.attention.backends.utils import (UBatchSlice, _make_metadata_with_slice, slice_query_start_locs, split_attn_metadata) @@ -106,7 +106,7 @@ def mixed_small_metadata(): def test_make_metadata_with_slice_decode_batch(small_decode_metadata): """Test slicing decode batch metadata""" # Split first request only - ubatch_slice = UbatchSlice(slice(0, 1), slice(0, 1)) + ubatch_slice = UBatchSlice(slice(0, 1), slice(0, 1)) result = _make_metadata_with_slice(ubatch_slice, small_decode_metadata) @@ -120,7 +120,7 @@ def test_make_metadata_with_slice_decode_batch(small_decode_metadata): def test_make_metadata_with_slice_mixed_batch(mixed_small_metadata): """Test slicing mixed batch metadata""" - ubatch_slice = UbatchSlice(slice(1, 3), + ubatch_slice = UBatchSlice(slice(1, 3), slice(1, 7)) # Requests 1-3, tokens 1-7 result = _make_metadata_with_slice(ubatch_slice, mixed_small_metadata) @@ -137,8 +137,8 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata): num_tokens = large_decode_metadata.num_reqs mid_point = num_tokens // 2 ubatch_slices = [ - UbatchSlice(slice(0, mid_point), slice(0, mid_point)), - UbatchSlice(slice(mid_point, num_tokens), slice(mid_point, + UBatchSlice(slice(0, mid_point), slice(0, mid_point)), + UBatchSlice(slice(mid_point, num_tokens), slice(mid_point, num_tokens)), ] diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index ddedc61aae296..ccab04628a163 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -365,7 +365,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): # Mock runner for attention metadata building proposer.runner = mock.MagicMock() proposer.runner.attn_groups.append([mock.MagicMock()]) - proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder + proposer.runner.attn_groups[0][0].metadata_builders = [ + attn_metadata_builder + ] result = proposer.propose(target_token_ids=target_token_ids, target_positions=target_positions, @@ -489,7 +491,9 @@ def test_propose_tree(spec_token_tree): # Mock runner for attention metadata building. 
proposer.runner = mock.MagicMock() proposer.runner.attn_groups.append([mock.MagicMock()]) - proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder + proposer.runner.attn_groups[0][0].metadata_builders = [ + attn_metadata_builder + ] # Setup inputs for the proposer. target_token_ids = torch.randint(0, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6bb0fef23719a..535802585d18b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2848,6 +2848,14 @@ class VllmConfig: "when cudagraph_mode piecewise cudagraphs is used, "\ f"cudagraph_mode={self.compilation_config.cudagraph_mode}" + if self.parallel_config.enable_dbo: + a2a_backend = envs.VLLM_ALL2ALL_BACKEND + assert a2a_backend == "deepep_low_latency", \ + "Microbatching currently only supports the deepep_low_latency "\ + f"all2all backend. {a2a_backend} is not supported. To fix set "\ + "the VLLM_ALL2ALL_BACKEND environment variable to "\ + "deepep_low_latency and install the DeepEP kerenls." + if not self.instance_id: self.instance_id = random_uuid()[:5] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 231406bf60524..8e92e54a96780 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -137,6 +137,14 @@ class ParallelConfig: disable_custom_all_reduce: bool = False """Disable the custom all-reduce kernel and fall back to NCCL.""" + enable_dbo: bool = False + """Enable microbatching for the model executor.""" + + dbo_decode_token_threshold: int = 32 + """The threshold for microbatching. If the number of tokens in the + request is greater than this threshold, microbatching will be used. + Otherwise, the request will be processed in a single batch.""" + ray_workers_use_nsight: bool = False """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 7c0f30b9aab8c..427fd040fcb71 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -251,9 +251,4 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): logger.debug("DeepEP all2all args %s", buffer_kwargs) handle: deep_ep.Buffer = self.handle_cache.get_or_create( buffer_kwargs, deep_ep.Buffer) - # It is dangerous to set num sms outside this function. num_sms is not - # a part of the hash-key that identifies this object. If we are in a - # situation where we make objects with different num_sms, the hash key - # in get_or_create must be updated. 
- handle.set_num_sms(self.num_sms) return handle diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 20d998d613d47..4831cb5348c77 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -327,6 +327,9 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + enable_dbo: bool = ParallelConfig.enable_dbo + dbo_decode_token_threshold: int = \ + ParallelConfig.dbo_decode_token_threshold eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb expert_placement_strategy: ExpertPlacementStrategy = \ @@ -695,6 +698,11 @@ class EngineArgs: parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]) + parallel_group.add_argument("--enable-dbo", + **parallel_kwargs["enable_dbo"]) + parallel_group.add_argument( + "--dbo-decode-token-threshold", + **parallel_kwargs["dbo_decode_token_threshold"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) parallel_group.add_argument("--eplb-config", @@ -1339,6 +1347,8 @@ class EngineArgs: data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, + enable_dbo=self.enable_dbo, + dbo_decode_token_threshold=self.dbo_decode_token_threshold, enable_eplb=self.enable_eplb, eplb_config=self.eplb_config, expert_placement_strategy=self.expert_placement_strategy, diff --git a/vllm/forward_context.py b/vllm/forward_context.py index b3ddd7b9a7392..3b535423f7bca 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -14,6 +14,7 @@ import vllm.envs as envs from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -97,6 +98,53 @@ class DPMetadata: dist.all_reduce(num_tokens_tensor, group=group) return num_tokens_tensor.cpu() + @staticmethod + def should_ubatch_across_dp( + should_ubatch: bool, orig_num_tokens_per_ubatch: int, + padded_num_tokens_per_ubatch: int, dp_size: int, + dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]: + """ + 1. Decides if each DP rank is going to microbatch. Either all ranks + run with microbatching or none of them do. If this function decides + not to run with microbatching. It will "abort" meaning that no padding + information will be returned to the caller. It will return (False, None) + + 2. Determines the total number of tokens that each rank will run. + All ranks will be padded out so that the run with the same number + of tokens + + Returns: tuple[ + should_ubatch: Are all DP ranks going to microbatch + num_tokens_after_padding: A tensor containing the total number of + tokens per-microbatch for each DP rank including padding. 
Will be + None if should_ubatch if False + ] + """ + + device = current_platform.device_type + tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32) + tensor[0][dp_rank] = orig_num_tokens_per_ubatch + tensor[1][dp_rank] = padded_num_tokens_per_ubatch + tensor[2][dp_rank] = 1 if should_ubatch else 0 + + from vllm.distributed.parallel_state import get_dp_group + dist.all_reduce(tensor, group=get_dp_group().device_group) + + result: bool = bool(torch.all(tensor[2] == 1).item()) + if not result: + return result, None + + orig_num_tokens_tensor = tensor[0, :] + padded_num_tokens_tensor = tensor[1, :] + + orig_min_num_tokens = int(orig_num_tokens_tensor.min().item()) + padded_max_num_tokens = int(padded_num_tokens_tensor.max().item()) + if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens): + logger.debug("Aborting ubatching %s %s", orig_min_num_tokens, + padded_max_num_tokens) + return False, None + return result, padded_num_tokens_tensor.cpu() + @staticmethod def make( parallel_config: ParallelConfig, @@ -119,14 +167,15 @@ class DPMetadata: # If num_tokens_across_dp is None, it will be computed by all_reduce # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize - assert (num_tokens_across_dp is None - or num_tokens_across_dp[dp_rank] == batchsize) + assert (num_tokens_across_dp is None or num_tokens_across_dp[dp_rank] + == batchsize), f"{num_tokens_across_dp[dp_rank]} {batchsize}" if num_tokens_across_dp is None: num_tokens_across_dp = DPMetadata.num_tokens_across_dp( batchsize, dp_size, dp_rank) max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp) cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0) - return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) + return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu, + num_tokens_across_dp) @contextmanager def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int): @@ -179,9 +228,12 @@ class ForwardContext: Type AttentionMetadata for v0, Type Dict[str, AttentionMetadata] for v1, map from layer_name of each attention layer to its attention metadata - set dynamically for each forward pass + Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one + for each microbatch. + Set dynamically for each forward pass """ - attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"]] + attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"], + list[dict[str, "AttentionMetadata"]]] # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass @@ -191,6 +243,8 @@ class ForwardContext: cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE batch_descriptor: Optional[BatchDescriptor] = None + ubatch_slices: Optional[UBatchSlices] = None + def __post_init__(self): assert self.cudagraph_runtime_mode in [ CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \ @@ -208,6 +262,39 @@ def get_forward_context() -> ForwardContext: return _forward_context +def create_forward_context( + attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0, + dp_metadata: Optional[DPMetadata] = None, + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + batch_descriptor: Optional[BatchDescriptor] = None, + ubatch_slices: Optional[UBatchSlices] = None): + return ForwardContext(no_compile_layers=vllm_config.compilation_config. 
+ static_forward_context, + virtual_engine=virtual_engine, + attn_metadata=attn_metadata, + dp_metadata=dp_metadata, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices) + + +@contextmanager +def override_forward_context(forward_context: Optional[ForwardContext]): + """A context manager that overrides the current forward context. + This is used to override the forward context for a specific + forward pass. + """ + global _forward_context + prev_context = _forward_context + _forward_context = forward_context + try: + yield + finally: + _forward_context = prev_context + + @contextmanager def set_forward_context( attn_metadata: Any, @@ -216,7 +303,8 @@ def set_forward_context( num_tokens: Optional[int] = None, num_tokens_across_dp: Optional[torch.Tensor] = None, cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, - batch_descriptor: Optional[BatchDescriptor] = None): + batch_descriptor: Optional[BatchDescriptor] = None, + ubatch_slices: Optional[UBatchSlices] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. @@ -225,6 +313,7 @@ def set_forward_context( need_to_track_batchsize = track_batchsize and attn_metadata is not None if need_to_track_batchsize: forward_start_time = time.perf_counter() + dp_metadata: Optional[DPMetadata] = None if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None): @@ -232,20 +321,14 @@ def set_forward_context( attn_metadata, num_tokens or 0, num_tokens_across_dp) - global _forward_context - prev_context = _forward_context - _forward_context = ForwardContext( - no_compile_layers=vllm_config.compilation_config. 
- static_forward_context, - virtual_engine=virtual_engine, - attn_metadata=attn_metadata, - dp_metadata=dp_metadata, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, - ) + forward_context = create_forward_context(attn_metadata, vllm_config, + virtual_engine, dp_metadata, + cudagraph_runtime_mode, + batch_descriptor, ubatch_slices) try: - yield + with override_forward_context(forward_context): + yield finally: global last_logging_time, batchsize_logging_interval if need_to_track_batchsize: @@ -282,5 +365,3 @@ def set_forward_context( logger.info(("Batchsize forward time stats " "(batchsize, count, median_time(ms)): %s"), forward_stats) - - _forward_context = prev_context diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 2a3ae478f3eab..92cbb1742974c 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -191,7 +191,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> Callable: + ) -> tuple[Callable, mk.ReceiverType]: if apply_router_weight_on_input: topk = topk_ids.size(1) @@ -217,13 +217,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1q_scale = None a1_post_scale = a1_scale - return self._do_dispatch(tokens=a1q, - token_scales=a1q_scale, - rank_topk_ids=topk_ids, - rank_topk_weights=topk_weights, - num_experts=num_experts, - a1_scale=a1_post_scale, - quant_config=quant_config) + return (lambda *args: None, + self._do_dispatch(tokens=a1q, + token_scales=a1q_scale, + rank_topk_ids=topk_ids, + rank_topk_weights=topk_weights, + num_experts=num_experts, + a1_scale=a1_post_scale, + quant_config=quant_config)) def prepare( self, @@ -237,10 +238,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights, - topk_ids, num_experts, expert_map, - apply_router_weight_on_input, - quant_config) + (_, receiver) = self.prepare_async(a1, a1_scale, a2_scale, + topk_weights, topk_ids, num_experts, + expert_map, + apply_router_weight_on_input, + quant_config) return receiver() def finalize( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 1849e49e0ab51..61f8297f0f148 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -11,6 +11,9 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input, normalize_batched_scales_shape) +from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled, + dbo_maybe_run_recv_hook, + dbo_register_recv_hook, dbo_yield) # DeepEP kernels quantize dispatch inputs in 128 element chunks. DEEPEP_QUANT_BLOCK_SIZE = 128 @@ -55,7 +58,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # The dispatch function returns a handle that the combine function # requires. We store the handle here so it is available to the # combine function. 
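        # With dual-batch overlap (DBO) two micro-batches can be in flight
        # at once, so a separate handle is kept per ubatch id, hence the
        # two-element list below.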
- self.handle = None + self.handles: list[Optional[tuple]] = [None, None] self.num_dispatchers_ = num_dispatchers def num_dispatchers(self) -> int: @@ -123,13 +126,15 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> mk.ReceiverType: + ) -> tuple[Callable, mk.ReceiverType]: hidden_size = a1.size(1) assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \ (f"Hidden Size {hidden_size} not in supported list of hidden sizes" f"{self.SUPPORTED_HIDDEN_SIZES}") + a2a_idx = dbo_current_ubatch_id() + if self.use_fp8_dispatch: assert hidden_size % 128 == 0, \ "DeepEP kernels quantize the inputs in blocks of shape 128" @@ -148,7 +153,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1 = a1 * topk_weights.to(a1.dtype) # Dispatch - expert_x, expert_num_tokens, self.handle, event, hook = \ + expert_x, expert_num_tokens, handle, _, hook= \ self.buffer.low_latency_dispatch(a1, topk_ids, self.max_tokens_per_rank, @@ -156,21 +161,19 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): use_fp8=self.use_fp8_dispatch, async_finish=False, return_recv_hook=True) + self.handles[a2a_idx] = handle - return lambda: self._receiver(hook, expert_x, expert_num_tokens, - a1_scale, a1.dtype, quant_config) + return (hook, lambda: self._receiver(expert_x, expert_num_tokens, + a1_scale, a1.dtype, quant_config)) def _receiver( self, - hook: Callable, expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], expert_num_tokens: torch.Tensor, a1_scale, a1_dtype, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - hook() - expert_x, expert_x_scale = self._do_quant( expert_x, a1_scale, a1_dtype, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape) @@ -192,10 +195,12 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights, - topk_ids, num_experts, expert_map, - apply_router_weight_on_input, - quant_config) + hook, receiver = self.prepare_async(a1, a1_scale, a2_scale, + topk_weights, topk_ids, + num_experts, expert_map, + apply_router_weight_on_input, + quant_config) + hook() return receiver() def finalize( @@ -210,7 +215,11 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") - assert self.handle is not None + + a2a_idx = dbo_current_ubatch_id() + do_recv_hook = dbo_enabled() + handle = self.handles[a2a_idx] + assert handle is not None combine_topk_weights = topk_weights if apply_router_weight_on_input: @@ -218,12 +227,16 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): combine_topk_weights = torch.ones_like(topk_weights) # TODO (varun) : Enable zero copy mode - _, event, hook = self.buffer.low_latency_combine( + dbo_maybe_run_recv_hook() + _, _, recv_hook = self.buffer.low_latency_combine( fused_expert_output, topk_ids, combine_topk_weights, - self.handle, + handle, async_finish=False, zero_copy=False, - return_recv_hook=False, + return_recv_hook=do_recv_hook, out=output) + if recv_hook is not None: + dbo_register_recv_hook(recv_hook) + dbo_yield() diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 
c62897c91816e..d22bb253f4a72 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -38,6 +38,7 @@ from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx, round_up) +from vllm.v1.worker.ubatching import dbo_current_ubatch_id if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts @@ -992,16 +993,28 @@ class FusedMoE(CustomOp): if (self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels or self.moe_config.use_flashinfer_cutlass_kernels): - self.batched_hidden_states = torch.zeros( - (moe.max_num_tokens, self.hidden_size), - dtype=moe.in_dtype, - device=torch.cuda.current_device()) + if vllm_config.parallel_config.enable_dbo: + self.batched_hidden_states = torch.zeros( + (2, moe.max_num_tokens, self.hidden_size), + dtype=moe.in_dtype, + device=torch.cuda.current_device()) - # Note here we use `num_experts` which is logical expert count - self.batched_router_logits = torch.zeros( - (moe.max_num_tokens, num_experts), - dtype=moe.in_dtype, - device=torch.cuda.current_device()) + # Note here we use `num_experts` which is logical expert count + self.batched_router_logits = torch.zeros( + (2, moe.max_num_tokens, num_experts), + dtype=moe.in_dtype, + device=torch.cuda.current_device()) + else: + self.batched_hidden_states = torch.zeros( + (moe.max_num_tokens, self.hidden_size), + dtype=moe.in_dtype, + device=torch.cuda.current_device()) + + # Note here we use `num_experts` which is logical expert count + self.batched_router_logits = torch.zeros( + (moe.max_num_tokens, num_experts), + dtype=moe.in_dtype, + device=torch.cuda.current_device()) @property def shared_experts(self) -> Optional[torch.nn.Module]: @@ -1708,14 +1721,29 @@ class FusedMoE(CustomOp): hidden_states = full_hidden_states[chunk_start:chunk_end, :] router_logits = full_router_logits[chunk_start:chunk_end, :] - assert (self.batched_hidden_states.size(0) # type: ignore + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + # This is only true when DBO has been enabled in the config. 
+ # Both tensors will have an outer dimension for the ubatch id + if self.batched_hidden_states.dim() == 3: + assert self.batched_router_logits.dim() == 3 + batch_buffer_idx = dbo_current_ubatch_id() + batched_hidden_states = self.batched_hidden_states[ + batch_buffer_idx, :] + batched_router_logits = self.batched_router_logits[ + batch_buffer_idx, :] + else: + batched_hidden_states = self.batched_hidden_states + batched_router_logits = self.batched_router_logits + + assert (batched_hidden_states.size(0) # type: ignore >= chunk_size) - assert (self.batched_router_logits.size(0) # type: ignore + assert (batched_router_logits.size(0) # type: ignore >= chunk_size) - staged_hidden_states = self.batched_hidden_states[: - chunk_size, :] # type: ignore - staged_router_logits = self.batched_router_logits[: - chunk_size, :] # type: ignore + staged_hidden_states = batched_hidden_states[: + chunk_size, :] # type: ignore + staged_router_logits = batched_router_logits[: + chunk_size, :] # type: ignore staged_hidden_states.copy_(hidden_states, non_blocking=True) staged_router_logits.copy_(router_logits, non_blocking=True) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 281563c3bfca2..33799b58d1998 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -13,6 +13,8 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( # yapf: disable _resize_cache, count_expert_num_tokens) from vllm.utils import cdiv +from vllm.v1.worker.ubatching import (dbo_enabled, dbo_maybe_run_recv_hook, + dbo_register_recv_hook, dbo_yield) # # This file defines a set of base classes used to make MoE kernels more modular. @@ -226,7 +228,7 @@ class FusedMoEPrepareAndFinalize(ABC): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> ReceiverType: + ) -> tuple[Callable, ReceiverType]: """ Perform any quantization (and/or) dispatching needed for this kernel but do not wait for results from other workers. @@ -496,6 +498,23 @@ def _chunk_scales(scales: Optional[torch.Tensor], start: int, return None +class SharedResizableBuffer: + + def __init__(self): + self.buffer = None + + def get(self, shape: tuple[int, ...], device: torch.device, + dtype: torch.dtype): + shape_numel = prod(shape) + if self.buffer is None or self.buffer.numel() < shape_numel: + self.buffer = torch.empty(shape_numel, device=device, dtype=dtype) + assert self.buffer.device == device, \ + f"Buffer device mismatch: {self.buffer.device} != {device}" + assert self.buffer.dtype == dtype, \ + f"Buffer dtype mismatch: {self.buffer.dtype} != {dtype}" + return self.buffer[:shape_numel].view(*shape) + + @final class FusedMoEModularKernel(torch.nn.Module): """ @@ -509,6 +528,9 @@ class FusedMoEModularKernel(torch.nn.Module): layer due to any layer specific state that may be used by the component objects. """ + fused_out_buffer = SharedResizableBuffer() + workspace13_buffer = SharedResizableBuffer() + workspace2_buffer = SharedResizableBuffer() def __init__( self, @@ -559,12 +581,12 @@ class FusedMoEModularKernel(torch.nn.Module): # We can reuse the memory between cache1 and cache3 because by the # time we need cache3, we're done with cache1. 
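SharedResizableBuffer, introduced in the hunk above, is declared at class scope, so kernels share one grow-only allocation per workspace name. A short CPU-only demonstration of the reuse semantics this relies on (a trimmed sketch of the same idea, omitting the device/dtype asserts):

    from math import prod

    import torch

    class SharedResizableBuffer:
        # Grow-only flat buffer, viewed into whatever shape is requested.
        def __init__(self):
            self.buffer = None

        def get(self, shape, device, dtype):
            numel = prod(shape)
            if self.buffer is None or self.buffer.numel() < numel:
                self.buffer = torch.empty(numel, device=device, dtype=dtype)
            return self.buffer[:numel].view(*shape)

    buf = SharedResizableBuffer()
    a = buf.get((4, 16), device="cpu", dtype=torch.float32)  # allocates 64
    b = buf.get((2, 16), device="cpu", dtype=torch.float32)  # reuses storage
    assert a.data_ptr() == b.data_ptr()  # aliased on purpose

The aliasing is deliberate: the workspaces switched over below are live only within a single forward call, so reuse is safe and avoids an allocation per step.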
- workspace13 = torch.empty(prod(workspace13_shape), - device=a1.device, - dtype=workspace_dtype) - workspace2 = torch.empty(prod(workspace2_shape), - device=a1.device, - dtype=workspace_dtype) + workspace13 = self.workspace13_buffer.get(workspace13_shape, + device=a1.device, + dtype=workspace_dtype) + workspace2 = self.workspace2_buffer.get(workspace2_shape, + device=a1.device, + dtype=workspace_dtype) assert fused_out is None or fused_out.shape == fused_out_shape, ( f"fused_out {fused_out.shape} but expected {fused_out_shape}") @@ -656,9 +678,9 @@ class FusedMoEModularKernel(torch.nn.Module): (_, _, fused_out_shape, _) = self.fused_experts.workspace_shapes( a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts, expert_tokens_meta) - fused_out = torch.empty(fused_out_shape, - device=a1q.device, - dtype=a1.dtype) + fused_out = self.fused_out_buffer.get(fused_out_shape, + device=a1q.device, + dtype=a1.dtype) def slice_input_tensors( chunk_idx: int @@ -801,8 +823,10 @@ class FusedMoEModularKernel(torch.nn.Module): shared_output: torch.Tensor - if (not self.prepare_finalize.supports_async() - or self.shared_experts is None): + if not self.prepare_finalize.supports_async(): + # We shouldn't be running an a2a kernel that doesn't + # support async prepare/finalize + assert not dbo_enabled() # Run shared experts serially with dispatch. if self.shared_experts is not None: @@ -822,7 +846,8 @@ class FusedMoEModularKernel(torch.nn.Module): ) else: # Overlap shared expert compute with all2all dispatch. - receiver = self.prepare_finalize.prepare_async( + dbo_maybe_run_recv_hook() + hook, receiver = self.prepare_finalize.prepare_async( a1, a1_scale, a2_scale, @@ -834,8 +859,16 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.quant_config, ) - assert self.shared_experts is not None - shared_output = self.shared_experts(a1) + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + + # If DBO is being used, register the hook with the ubatch context + # and call it in dbo_maybe_run_recv_hook instead of passing it to + # the receiver. 
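The lines that follow implement that hand-off. A rough sketch of the control flow, with toy stand-ins for the dbo_* helpers (the real implementations live in vllm/v1/worker/ubatching.py later in this patch):

    _pending: list = []

    def dbo_enabled() -> bool:
        # Toy stand-in: True only on a registered ubatch thread.
        return False

    def dbo_register_recv_hook(hook) -> None:
        # The real helper hands the hook to the *other* ubatch's context.
        _pending.append(hook)

    def dbo_maybe_run_recv_hook() -> None:
        while _pending:
            _pending.pop()()  # drain hooks the peer microbatch registered

    def dispatch(tokens):
        received = []
        hook = lambda: received.extend(tokens)   # "the recv has landed"
        receiver = lambda: sum(received)         # post-process landed data
        return hook, receiver

    hook, receiver = dispatch([1, 2, 3])
    if dbo_enabled():
        # DBO path: park the hook; the peer runs it at its next yield point.
        dbo_register_recv_hook(hook)
        dbo_maybe_run_recv_hook()
    else:
        hook()  # synchronous fallback, as in `if not dbo_enabled(): hook()`
    assert receiver() == 6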
+ dbo_register_recv_hook(hook) + dbo_yield() + if not dbo_enabled(): + hook() (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, _expert_topk_weights) = receiver() diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 2ae79e69f5554..b8c1c14317c46 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union +from typing import Callable, Optional, Union import pplx_kernels as pplx import torch @@ -103,7 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> mk.ReceiverType: + ) -> tuple[Callable, mk.ReceiverType]: num_tokens = a1.size(0) # M hidden_dim = a1.size(-1) # K @@ -214,30 +214,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=False, ) - return lambda: self._receiver( - expert_num_tokens, - expert_x, - expert_x_scale, - a1q, - a1q_scale, - topk_ids, - bound_m, - orig_a_scale_block_shape, - ) - - def _receiver( - self, - expert_num_tokens: torch.Tensor, - expert_x: torch.Tensor, - expert_x_scale: Optional[torch.Tensor], - a1q: torch.Tensor, - a1q_scale: Optional[torch.Tensor], - topk_ids: torch.Tensor, - bound_m: Optional[torch.Tensor], - orig_a_scale_block_shape: Optional[int], - ) -> mk.PrepareResultType: - - self.a2a.dispatch( + hook = lambda: self.a2a.dispatch( out_expert_num_tokens=expert_num_tokens, out_expert_x=expert_x, out_expert_x_scale=expert_x_scale, @@ -249,6 +226,21 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=True, ) + return (hook, lambda: self._receiver( + expert_num_tokens, + expert_x, + expert_x_scale, + orig_a_scale_block_shape, + )) + + def _receiver( + self, + expert_num_tokens: torch.Tensor, + expert_x: torch.Tensor, + expert_x_scale: Optional[torch.Tensor], + orig_a_scale_block_shape: Optional[int], + ) -> mk.PrepareResultType: + if expert_x_scale is not None: expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape] assert expert_x_scale.ndim == 3 @@ -270,7 +262,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - receiver = self.prepare_async( + hook, receiver = self.prepare_async( a1, a1_scale, a2_scale, @@ -281,6 +273,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input, quant_config, ) + hook() return receiver() def finalize( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index ead70c910a8fa..63326d19194f0 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -28,6 +28,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) from vllm.logger import init_logger from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.ubatch_utils import UBatchSlice logger = init_logger(__name__) KVCacheLayoutType = Literal["NHD", "HND"] @@ -81,12 +82,6 @@ class CommonAttentionMetadata: encoder_seq_lens: Optional[np.ndarray] = None -@dataclass -class UbatchSlice: - request_slice: slice - token_slice: slice - - def slice_query_start_locs( query_start_loc: torch.Tensor, 
request_slice: slice, @@ -103,7 +98,7 @@ def slice_query_start_locs( def _make_metadata_with_slice( - ubatch_slice: UbatchSlice, + ubatch_slice: UBatchSlice, attn_metadata: CommonAttentionMetadata) -> CommonAttentionMetadata: """ This function creates a new CommonAttentionMetadata that corresponds to @@ -133,6 +128,11 @@ def _make_metadata_with_slice( torch.max(torch.abs(query_start_loc_cpu[1:] - query_start_loc_cpu[:-1])).item()) + # This is to account for the case where we are in a dummy + # run and query_start_loc_cpu is full of 0s + if max_query_len == 0: + max_query_len = attn_metadata.max_query_len + block_table_tensor = attn_metadata.block_table_tensor[request_slice] slot_mapping = attn_metadata.slot_mapping[token_slice] @@ -152,12 +152,12 @@ def _make_metadata_with_slice( def split_attn_metadata( - ubatch_slices: list[UbatchSlice], + ubatch_slices: list[UBatchSlice], common_attn_metadata: CommonAttentionMetadata, ) -> list[CommonAttentionMetadata]: """ Creates a new CommonAttentionMetadata instance that corresponds to the - requests for each UbatchSlice in ubatch_slices. + requests for each UBatchSlice in ubatch_slices. Note: This function does not modify common_attn_metadata """ diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7132d507c722c..5154b29405b6e 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -27,6 +27,7 @@ from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.ubatching import dbo_current_ubatch_id logger = init_logger(__name__) @@ -179,9 +180,11 @@ class EagleProposer: assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups - attn_metadata = self.runner.attn_groups[0][0].metadata_builder\ - .build_for_drafting(common_attn_metadata=common_attn_metadata, - draft_index=0) + ubatch_id = dbo_current_ubatch_id() + attn_metadata_builder = \ + self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + attn_metadata = attn_metadata_builder.build_for_drafting( + common_attn_metadata=common_attn_metadata, draft_index=0) # At this moment, we assume all eagle layers belong to the same KV # cache group, thus using the same attention metadata. 
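dbo_current_ubatch_id() lets call sites like this one stay oblivious to whether they run on the main thread (ubatch 0) or a second ubatch thread (ubatch 1). A toy version of the thread-local lookup and the duplicated-builder pattern it indexes into (Builder is a hypothetical stand-in for an attention metadata builder):

    import threading

    _THREAD_ID_TO_CONTEXT: dict = {}  # thread ident -> ubatch id (0 or 1)

    def dbo_current_ubatch_id() -> int:
        # The main thread has no registered context and resolves to 0.
        return _THREAD_ID_TO_CONTEXT.get(threading.get_ident(), 0)

    class Builder:  # hypothetical stand-in
        def __init__(self, name: str):
            self.name = name

    # With DBO, each attention group keeps one builder per ubatch so the
    # two in-flight microbatches never share mutable build state.
    metadata_builders = [Builder("ubatch0"), Builder("ubatch1")]

    def build_from_current_thread() -> str:
        return metadata_builders[dbo_current_ubatch_id()].name

    assert build_from_current_thread() == "ubatch0"  # main thread

    seen = []
    def worker():
        _THREAD_ID_TO_CONTEXT[threading.get_ident()] = 1
        seen.append(build_from_current_thread())

    t = threading.Thread(target=worker)
    t.start(); t.join()
    assert seen == ["ubatch1"]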
@@ -355,8 +358,9 @@ class EagleProposer: hidden_states: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, ) -> list[torch.Tensor]: + ubatch_id = dbo_current_ubatch_id() tree_attn_metadata_builder = \ - self.runner.attn_groups[0][0].metadata_builder + self.runner.attn_groups[0][0].metadata_builders[ubatch_id] assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index d5ec19b86b061..619ed88ab5b27 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -64,8 +64,13 @@ class CPUModelRunner(GPUModelRunner): if not self.attn_groups[0]: return - mb = getattr(self.attn_groups[0][0], "metadata_builder", None) - if not isinstance(mb, TorchSDPAMetadataBuilderV1): + mb = getattr(self.attn_groups[0][0], "metadata_builders", None) + if isinstance(mb, list): + if not isinstance(mb[0], TorchSDPAMetadataBuilderV1): + return + mb[0].reorder_batch(self.input_batch, scheduler_output) + return + elif not isinstance(mb, TorchSDPAMetadataBuilderV1): # Encoder-only / rerank models do not benefit from reordering, # so we safely skip here. return diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d4d1f814afc0c..2ae748dee43c9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -15,6 +15,7 @@ import torch import torch.distributed import torch.nn as nn from tqdm import tqdm +from typing_extensions import TypeAlias import vllm.envs as envs from vllm.attention import Attention, AttentionType @@ -55,11 +56,12 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up, supports_dynamo) +from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, create_fast_prefill_custom_backend, - reorder_batch_to_split_decodes_and_prefills) + reorder_batch_to_split_decodes_and_prefills, split_attn_metadata) from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher # yapf conflicts with isort for this block # yapf: disable @@ -85,9 +87,12 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.ubatch_splitting import get_dp_padding_ubatch, ubatch_split +from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices from vllm.v1.worker.utils import is_residual_scattered_for_sp from .utils import (AttentionGroup, MultiModalBudget, @@ -105,6 +110,11 @@ else: logger = init_logger(__name__) +AttnMetadataDict: TypeAlias = dict[str, AttentionMetadata] +# list when ubatching is enabled +PerLayerAttnMetadata: TypeAlias = Union[list[AttnMetadataDict], + AttnMetadataDict] + # Wrapper for ModelRunnerOutput to support overlapped execution. 
class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): @@ -274,6 +284,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Request states. self.requests: dict[str, CachedRequestState] = {} + self.comm_stream = torch.cuda.Stream() # Input Batch # NOTE(Chen): Ideally, we should initialize the input batch inside @@ -872,10 +883,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return encoder_seq_lens def _prepare_inputs( - self, - scheduler_output: "SchedulerOutput", - ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata], - np.ndarray, Optional[CommonAttentionMetadata], int]: + self, scheduler_output: "SchedulerOutput" + ) -> tuple[PerLayerAttnMetadata, torch.Tensor, + Optional[SpecDecodeMetadata], np.ndarray, + Optional[CommonAttentionMetadata], int, Optional[UBatchSlices], + Optional[torch.Tensor]]: """ :return: tuple[ attn_metadata: layer-to-attention_metadata mapping, @@ -947,6 +959,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.query_start_loc.copy_to_gpu() query_start_loc = self.query_start_loc.gpu[:num_reqs + 1] + num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens + num_tokens_padded = num_tokens_unpadded + self.get_local_padding( + num_tokens_unpadded) + ubatch_slices, num_tokens_after_padding = \ + ubatch_split(max_num_scheduled_tokens, + num_tokens_unpadded, + num_tokens_padded, + self.vllm_config) + self.seq_lens.np[:num_reqs] = ( self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens) @@ -1001,7 +1022,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits_indices_padded = self._prepare_kv_sharing_fast_prefill( logits_indices) - attn_metadata: dict[str, Any] = {} + attn_metadata: PerLayerAttnMetadata = {} + if ubatch_slices is not None: + attn_metadata = [dict() for _ in range(len(ubatch_slices))] # Used in the below loop. query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1] @@ -1075,7 +1098,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): for attn_group in self.attn_groups[kv_cache_group_id]: # Prepare for cascade attention if enabled & beneficial. 
common_prefix_len = 0 - builder = attn_group.metadata_builder + builder = attn_group.get_metadata_builder() if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, @@ -1093,13 +1116,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_draft_tokens=self.num_draft_tokens.gpu[:num_reqs], ) - attn_metadata_i = builder.build( - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata, - **extra_attn_metadata_args) - - for layer_name in attn_group.layer_names: - attn_metadata[layer_name] = attn_metadata_i + if ubatch_slices is not None: + common_attn_metadata_list = split_attn_metadata( + ubatch_slices, common_attn_metadata) + for ubid, common_attn_metadata in enumerate( + common_attn_metadata_list): + assert common_attn_metadata.max_query_len == 1 + attn_metadata_i = (attn_group.get_metadata_builder( + ubatch_id=ubid).build( + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata)) + for layer_name in kv_cache_group_spec.layer_names: + assert type(attn_metadata) is list + attn_metadata[ubid][layer_name] = attn_metadata_i + else: + assert isinstance(attn_metadata, dict) + attn_metadata_i = builder.build( + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata, + **extra_attn_metadata_args) + for layer_name in attn_group.layer_names: + attn_metadata[layer_name] = attn_metadata_i # Hot-Swap lora model if self.lora_config: @@ -1107,7 +1144,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return (attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens, spec_decode_common_attn_metadata, - max_num_scheduled_tokens) + max_num_scheduled_tokens, ubatch_slices, + num_tokens_after_padding) def _compute_cascade_attn_prefix_len( self, @@ -1508,7 +1546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. - if isinstance(self.model, CUDAGraphWrapper): + if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)): return self.model.unwrap() return self.model @@ -1675,6 +1713,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def get_dp_padding(self, num_tokens: int) -> tuple[int, Optional[torch.Tensor]]: + """ + Determines the total number of tokens that each rank will run. + All ranks will be padded out so that they run with the same number + of tokens + + Returns: tuple[ + num_pad_tokens: The number of tokens that will be added to the batch + num_tokens_after_padding: A tensor containing the total number of + tokens for each DP rank including padding. + ] + """ dp_size = self.vllm_config.parallel_config.data_parallel_size dp_rank = self.vllm_config.parallel_config.data_parallel_rank @@ -1698,6 +1747,39 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtype=torch.int32) return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding + def get_local_padding(self, num_tokens_unpadded: int) -> int: + + num_tokens_padded = num_tokens_unpadded + + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size. + num_tokens_padded = self.vllm_config.pad_for_cudagraph( + num_tokens_unpadded) + else: + # Eager mode. 
+            # Pad tokens to a multiple of tensor_parallel_size when
+            # collective fusion for SP is enabled
+            tp_size = self.vllm_config.parallel_config.tensor_parallel_size
+            if self.vllm_config.compilation_config.pass_config. \
+                enable_sequence_parallelism and tp_size > 1:
+                num_tokens_padded = round_up(num_tokens_unpadded, tp_size)
+
+        num_pad_tokens = num_tokens_padded - num_tokens_unpadded
+        return num_pad_tokens
+
+    # This is where the second ubatch is adjusted to account for the padding.
+    # Should be called after attention metadata creation. This just pads
+    # the second ubatch slice out to the total number of tokens
+    # (num_tokens + padding)
+    def pad_out_ubatch_slice(self, ubatch_slices: UBatchSlices,
+                             num_total_tokens: int):
+        padded_second_ubatch_slice = slice(ubatch_slices[1].token_slice.start,
+                                           num_total_tokens)
+        ubatch_slices[1] = UBatchSlice(padded_second_ubatch_slice,
+                                       padded_second_ubatch_slice)
+
     def _pool(
         self,
         hidden_states: torch.Tensor,
@@ -1758,15 +1840,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         scheduler_output: "SchedulerOutput",
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        ubatch_slices: Optional[UBatchSlices] = None,
+        num_tokens_after_padding: Optional[torch.Tensor] = None,
     ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor],
                Optional[torch.Tensor], torch.Tensor,
                Optional[IntermediateTensors], dict[str, Any]]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens)
-        # Padding for DP
-        num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
-        num_input_tokens += num_pad
+        if ubatch_slices:
+            assert num_tokens_after_padding is not None
+            num_input_tokens = int(num_tokens_after_padding[0].item() * 2)
+            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
+        elif ubatch_slices is None:
+            num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens)
+            num_pad, num_tokens_after_padding = self.get_dp_padding(
+                num_input_tokens)
+            num_input_tokens += num_pad

         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -1821,7 +1910,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         return (
             num_scheduled_tokens,
             num_input_tokens,
-            num_tokens_across_dp,
+            num_tokens_after_padding,
             input_ids,
             inputs_embeds,
             positions,
@@ -2027,7 +2116,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Prepare the decoder inputs.
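Before the decoder-input preparation picks up below, the padding arithmetic above is worth tracing end to end. A pure-Python walk-through of how a decode batch is split and how pad_out_ubatch_slice stretches the second microbatch (plain slices stand in for UBatchSlice, and the token counts are made up):

    def round_up(x: int, m: int) -> int:
        return (x + m - 1) // m * m

    num_tokens_unpadded = 9  # decode tokens actually scheduled
    num_tokens_padded = round_up(num_tokens_unpadded, 2)  # -> 10
    per_ubatch = num_tokens_padded // 2                   # -> 5

    # Initial split: the first slice takes the padded half, the second
    # takes only the real remainder.
    first = slice(0, per_ubatch)                      # tokens [0, 5)
    second = slice(per_ubatch, num_tokens_unpadded)   # tokens [5, 9)

    # Once the DP ranks agree on a common padded size (say CUDA-graph
    # padding pushed each ubatch to 8 tokens), the whole batch runs
    # 2 * 8 tokens and only the second slice absorbs the padding:
    num_input_tokens = 8 * 2
    second = slice(second.start, num_input_tokens)    # tokens [5, 16)

    assert first == slice(0, 5) and second == slice(5, 16)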
(attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, - max_query_len) = self._prepare_inputs(scheduler_output) + max_query_len, ubatch_slices, num_tokens_after_padding + ) = self._prepare_inputs(scheduler_output) finally: if self.prepare_inputs_event is not None: @@ -2042,7 +2132,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): positions, intermediate_tensors, model_kwargs, - ) = self._preprocess(scheduler_output, intermediate_tensors) + ) = self._preprocess(scheduler_output, intermediate_tensors, + ubatch_slices, num_tokens_after_padding) + + if ubatch_slices is not None: + num_input_tokens = num_input_tokens // 2 uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( @@ -2062,6 +2156,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices, ), record_function_or_nullcontext("Forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output): @@ -2441,10 +2536,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # CudagraphWraper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. - if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + if self.compilation_config.cudagraph_mode.has_full_cudagraphs() \ + and not self.parallel_config.enable_dbo: self.model = CUDAGraphWrapper(self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL) + elif self.parallel_config.enable_dbo: + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + self.model = UBatchWrapper(self.model, self.vllm_config, + CUDAGraphMode.FULL, self.device) + else: + self.model = UBatchWrapper(self.model, self.vllm_config, + CUDAGraphMode.NONE, self.device) def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ @@ -2642,6 +2745,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, force_attention: bool = False, uniform_decode: bool = False, + allow_microbatching: bool = False, skip_eplb: bool = False, is_profile: bool = False, create_mixed_batch: bool = False, @@ -2667,12 +2771,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (1 token) and prefill (multiple tokens) requests. 
remove_lora: If False, dummy LoRAs are not destroyed after the run """ + ubatch_enabled = self.parallel_config.enable_dbo + num_tokens_across_dp = None + num_pad = 0 + should_ubatch = False + if ubatch_enabled: + should_ubatch = num_tokens >= \ + self.parallel_config.dbo_decode_token_threshold and \ + allow_microbatching + + (should_ubatch, num_tokens_across_dp) = get_dp_padding_ubatch( + num_tokens, num_tokens, should_ubatch, self.vllm_config) + + # Currently the dummy run should only be ubatching during + # cuda graph capture, meaning all DP ranks should already + # have the same batch size + if num_tokens_across_dp is not None: + assert int(num_tokens_across_dp[0]) == num_tokens // 2 + assert cudagraph_runtime_mode in { CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL } - # Padding for DP - num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) + if not should_ubatch: + num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) num_tokens += num_pad # If cudagraph_mode.decode_mode() == FULL and @@ -2690,6 +2812,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # for GQA/MQA. max_query_len = self.uniform_decode_query_len if uniform_decode else \ num_tokens + if allow_microbatching: + assert self.uniform_decode_query_len == 1 + assert uniform_decode is True + assert max_query_len == 1 # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively @@ -2728,12 +2854,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - attn_metadata: Optional[dict[str, Any]] = None + ubatch_slices = None + # We currently only microbatch if the number of tokens is + # over a certain threshold. + if should_ubatch: + # We only support decode-only cudagraphs + assert num_reqs == num_tokens + assert num_tokens % 2 == 0 + ubatch_slices = [ + UBatchSlice(slice(0, num_reqs // 2), slice(0, + num_tokens // 2)), + UBatchSlice(slice(num_reqs // 2, num_reqs), + slice(num_tokens // 2, num_tokens)) + ] + + attn_metadata: Optional[PerLayerAttnMetadata] = None # If force_attention is True, we always capture attention. Otherwise, # it only happens for cudagraph_runtime_mode=FULL. if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL: attn_metadata = {} + if ubatch_slices is not None: + attn_metadata = [dict() for _ in range(len(ubatch_slices))] if create_mixed_batch: # In the mixed batch mode (used for FI warmup), we use @@ -2766,12 +2908,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): slot_mapping=self.input_batch. 
block_table[kv_cache_group_id].slot_mapping[:num_tokens], causal=True) - for attn_group in self.attn_groups[kv_cache_group_id]: - attn_metadata_i = attn_group.metadata_builder\ - .build_for_cudagraph_capture(common_attn_metadata) - for layer_name in kv_cache_group_spec.layer_names: - attn_metadata[layer_name] = attn_metadata_i + if ubatch_slices is not None: + common_attn_metadata_list = split_attn_metadata( + ubatch_slices, common_attn_metadata) + for ubid, common_attn_metadata in enumerate( + common_attn_metadata_list): + assert common_attn_metadata.max_query_len == 1 + attn_metadata_i = (attn_group\ + .get_metadata_builder(ubatch_id=ubid)\ + .build_for_cudagraph_capture(common_attn_metadata)) + for layer_name in kv_cache_group_spec.layer_names: + assert type(attn_metadata) is list + attn_metadata[ubid][ + layer_name] = attn_metadata_i + else: + assert type(attn_metadata) is dict + attn_metadata_i = attn_group.get_metadata_builder()\ + .build_for_cudagraph_capture(common_attn_metadata) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens, remove_lora): @@ -2818,13 +2974,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): f"Cudagraph runtime mode mismatch at dummy_run. " f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.") + if ubatch_slices is not None: + num_tokens = num_tokens // 2 with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, num_tokens=num_tokens, num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor): + batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices): outputs = self.model( input_ids=input_ids, positions=positions, @@ -3096,6 +3255,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): set_cudagraph_capturing_enabled(True) with freeze_gc(), graph_capture(device=self.device): cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE: cudagraph_runtime_mode = cudagraph_mode.mixed_mode() @@ -3153,6 +3313,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): desc="Capturing CUDA graphs ({}, {})".format( "decode" if uniform_decode else "mixed prefill-decode", cudagraph_runtime_mode.name)) + enable_dbo = self.parallel_config.enable_dbo + # DBO Only supports running Full cudagraphs with uniform + # decode lengths + if enable_dbo and uniform_decode: + for num_tokens in compilation_cases: + # If the number of tokens is greater than the microbatching + # threshold, don't generate a microbatched cudagraph + if (num_tokens + < self.parallel_config.dbo_decode_token_threshold): + continue + + # Warmup + for _ in range( + self.compilation_config.cudagraph_num_of_warmups): + force_attention = ( + cudagraph_runtime_mode == CUDAGraphMode.FULL) + self._dummy_run(num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + force_attention=force_attention, + uniform_decode=True, + allow_microbatching=True, + skip_eplb=True) + + # Graph Capture + self._dummy_run(num_tokens, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + uniform_decode=True, + allow_microbatching=True, + skip_eplb=True) # We skip EPLB here since we don't want to record dummy metrics for num_tokens in compilation_cases: for _ in range(self.compilation_config.cudagraph_num_of_warmups): @@ -3219,14 +3408,23 @@ class 
GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] for attn_backend, layer_names in attn_backends_map.items(): - attn_metadata_builder_i = attn_backend.get_builder_cls()( + attn_metadata_builders = [] + attn_metadata_builders.append(attn_backend.get_builder_cls()( kv_cache_spec, layer_names, self.vllm_config, self.device, - ) + )) + if self.parallel_config.enable_dbo: + attn_metadata_builders.append( + attn_backend.get_builder_cls()( + kv_cache_spec, + layer_names, + self.vllm_config, + self.device, + )) attn_group = AttentionGroup(attn_backend, - attn_metadata_builder_i, + attn_metadata_builders, layer_names) attn_groups.append(attn_group) return attn_groups @@ -3246,11 +3444,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): min_cg_builder_name = None for attn_group in self._attn_group_iterator(): - builder = attn_group.metadata_builder + builder = attn_group.get_metadata_builder() if builder.cudagraph_support.value < min_cg_support.value: min_cg_support = builder.cudagraph_support min_cg_builder_name = builder.__class__.__name__ - # Flexible resolve the cudagraph mode cudagraph_mode = self.compilation_config.cudagraph_mode # check cudagraph for mixed batch is supported @@ -3316,7 +3513,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): is compatible (e.g., decode threshold is the same) """ for group in self._attn_group_iterator(): - attn_metadata_builder_i = group.metadata_builder + attn_metadata_builder_i = group.get_metadata_builder() # check that if any backends reorder batches; that the reordering # is compatible (e.g., decode threshold is the same) diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py new file mode 100644 index 0000000000000..5012ad0483c84 --- /dev/null +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -0,0 +1,303 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +import threading +from typing import Any, Callable, Optional + +import torch + +from vllm.compilation.cuda_graph import CUDAGraphWrapper +from vllm.config import CUDAGraphMode, VllmConfig +from vllm.forward_context import (create_forward_context, get_forward_context, + override_forward_context) +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors +from vllm.v1.worker.ubatching import UBatchContext, make_ubatch_contexts + +logger = init_logger(__name__) + + +@dataclasses.dataclass +class UbatchMetadata: + context: UBatchContext + input_ids: torch.Tensor + positions: torch.Tensor + inputs_embeds: Optional[torch.Tensor] + intermediate_tensors: Optional[IntermediateTensors] + num_tokens: int + + +@dataclasses.dataclass +class CUDAGraphMetaData: + cudagraph: torch.cuda.CUDAGraph + ubatch_metadata: UbatchMetadata + outputs: Optional[Any] = None + + +class UBatchWrapper: + + def __init__(self, runnable: Callable, vllm_config: VllmConfig, + runtime_mode: CUDAGraphMode, device: torch.cuda.device): + self.runnable = runnable + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + self.comm_stream = torch.cuda.Stream(device=device) + # Two ubatch threads plus the main thread + self.ready_barrier = threading.Barrier(3) + + self.cudagraphs: dict[int, CUDAGraphMetaData] = {} + + self.cudagraph_wrapper = None + self.graph_pool = None + if runtime_mode is not 
CUDAGraphMode.NONE:
+            self.cudagraph_wrapper = CUDAGraphWrapper(
+                runnable, vllm_config, runtime_mode=runtime_mode)
+            self.graph_pool = current_platform.get_global_graph_pool()
+
+    def __getattr__(self, key: str):
+        # allow accessing the attributes of the runnable.
+        if hasattr(self.runnable, key):
+            return getattr(self.runnable, key)
+        raise AttributeError(f"Attribute {key} does not exist in the runnable"
+                             f" of cudagraph wrapper: {self.runnable}")
+
+    def unwrap(self) -> Callable:
+        # in case we need to access the original runnable.
+        return self.runnable
+
+    def _capture_ubatches(self, ubatch_metadata, model) -> torch.Tensor:
+        """
+        Capture a cudagraph for a microbatched run.
+
+        The logic here is somewhat complicated because we need to make sure
+        that each of the ubatch threads initializes its cuda context before
+        we start the graph capture.
+
+        The flow is as follows:
+        1. The main thread starts up each ubatch thread. Each thread will
+        initialize its cuda context (torch.cuda.current_blas_handle())
+        before going to sleep upon entering the ubatch_context.
+
+        2. The main thread starts the graph capture and wakes up the first
+        ubatch thread.
+
+        3. Each ubatch thread runs the model to completion and returns the
+        completed output tensors back to the main thread.
+
+        4. The main thread stores the captured cudagraph along with its
+        metadata and returns.
+        """
+
+        @torch.inference_mode()
+        def _capture_ubatch_thread(results, ubatch_metadata):
+            ubatch_context = ubatch_metadata.context
+            with torch.cuda.stream(ubatch_context.compute_stream):
+                _ = torch.cuda.current_blas_handle()
+            with torch.cuda.stream(ubatch_context.comm_stream):
+                _ = torch.cuda.current_blas_handle()
+            with ubatch_context:
+                model_output = model(
+                    input_ids=ubatch_metadata.input_ids,
+                    positions=ubatch_metadata.positions,
+                    intermediate_tensors=ubatch_metadata.intermediate_tensors,
+                    inputs_embeds=ubatch_metadata.inputs_embeds,
+                )
+
+            results.append((ubatch_metadata.context.id, model_output))
+
+        results: list[tuple[int, torch.Tensor]] = []
+        compute_stream = ubatch_metadata[0].context.compute_stream
+        num_tokens = ubatch_metadata[0].num_tokens + \
+            ubatch_metadata[1].num_tokens
+
+        # Ubatches will manually manage the forward context, so we override
+        # it to None here so we can have it restored correctly later
+        with override_forward_context(None):
+            ubatch_threads = []
+            for metadata in ubatch_metadata:
+                thread = threading.Thread(target=_capture_ubatch_thread,
+                                          args=(
+                                              results,
+                                              metadata,
+                                          ))
+                ubatch_threads.append(thread)
+                thread.start()
+            self.ready_barrier.wait()  # Wait for both threads to be ready
+
+            # Capture the cudagraph
+            cudagraph_metadata = \
+                CUDAGraphMetaData(
+                    cudagraph=torch.cuda.CUDAGraph(),
+                    ubatch_metadata=ubatch_metadata,
+                )
+            with torch.cuda.graph(cudagraph_metadata.cudagraph,
+                                  stream=compute_stream,
+                                  pool=self.graph_pool):
+                ubatch_metadata[0].context.cpu_wait_event.set()
+                for thread in ubatch_threads:
+                    thread.join()
+            sorted_results = [value for position, value in sorted(results)]
+            result = torch.cat(sorted_results, dim=0)
+            cudagraph_metadata.outputs = result
+            self.cudagraphs[num_tokens] = cudagraph_metadata
+        return cudagraph_metadata.outputs
+
+    def _run_ubatches(self, ubatch_metadata, model) -> torch.Tensor:
+
+        @torch.inference_mode()
+        def _ubatch_thread(results, model, ubatch_metadata):
+            with ubatch_metadata.context:
+                model_output = model(
+                    input_ids=ubatch_metadata.input_ids,
+                    positions=ubatch_metadata.positions,
intermediate_tensors=ubatch_metadata.intermediate_tensors, + inputs_embeds=ubatch_metadata.inputs_embeds, + ) + results.append((ubatch_metadata.context.id, model_output)) + + results: list[tuple[int, torch.Tensor]] = [] + + # Ubatch threads will manually manage the forward context, so we + # override it to None here so we can have it restored correctly + # after both threads have finished + with override_forward_context(None): + ubatch_threads = [] + for metadata in ubatch_metadata: + thread = threading.Thread(target=_ubatch_thread, + args=( + results, + model, + metadata, + )) + ubatch_threads.append(thread) + thread.start() + self.ready_barrier.wait() # Wait for both threads to be ready + ubatch_metadata[0].context.cpu_wait_event.set() + for thread in ubatch_threads: + thread.join() + sorted_results = [value for position, value in sorted(results)] + result = torch.cat(sorted_results, dim=0) + return result + + def _make_ubatch_metadata(self, ubatch_slices, attn_metadata, input_ids, + positions, inputs_embeds, intermediate_tensors, + compute_stream, dp_metadata, batch_descriptor, + cudagraph_runtime_mode) -> list[UbatchMetadata]: + + # Create one forward context per ubatch + forward_contexts = [] + for i, ubatch_slice in enumerate(ubatch_slices): + forward_contexts.append( + create_forward_context( + attn_metadata[i] if attn_metadata is not None else None, + self.vllm_config, + dp_metadata=dp_metadata, + batch_descriptor=batch_descriptor, + cudagraph_runtime_mode=cudagraph_runtime_mode)) + + ubatch_ctxs = make_ubatch_contexts( + num_micro_batches=len(ubatch_slices), + comm_stream=self.comm_stream, + compute_stream=compute_stream, + forward_contexts=forward_contexts, + ready_barrier=self.ready_barrier) + + ubatch_metadata: list[UbatchMetadata] = [] + for i, ubatch_slice in enumerate(ubatch_slices): + sliced_input_ids, sliced_positions, sliced_inputs_embeds, \ + sliced_intermediate_tensors = \ + self._slice_model_inputs( + ubatch_slice.token_slice, input_ids, positions, + inputs_embeds, intermediate_tensors) + ubatch_metadata.append( + UbatchMetadata( + context=ubatch_ctxs[i], + input_ids=sliced_input_ids, + positions=sliced_positions, + inputs_embeds=sliced_inputs_embeds, + intermediate_tensors=sliced_intermediate_tensors, + num_tokens=ubatch_slice.token_slice.stop - + ubatch_slice.token_slice.start)) + + return ubatch_metadata + + def _slice_model_inputs(self, tokens_slice: slice, input_ids, positions, + inputs_embeds, intermediate_tensors): + sliced_input_ids = input_ids[tokens_slice] + # if we are using mrope. 
Mrope adds an additional dimension to the + # positions tensor + if positions.ndim == 2: + sliced_positions = positions[:, tokens_slice] + else: + sliced_positions = positions[tokens_slice] + sliced_inputs_embeds = inputs_embeds[ + tokens_slice] if inputs_embeds else None + sliced_intermediate_tensors = intermediate_tensors[ + tokens_slice] if intermediate_tensors else None + + return (sliced_input_ids, sliced_positions, sliced_inputs_embeds, + sliced_intermediate_tensors) + + def __call__(self, *args, **kwargs): + forward_context = get_forward_context() + batch_descriptor = forward_context.batch_descriptor + ubatch_slices = forward_context.ubatch_slices + cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode + + # If there's no ubatching, just run the runnable object + if ubatch_slices is None: + if cudagraph_runtime_mode in (CUDAGraphMode.NONE, + CUDAGraphMode.PIECEWISE): + return self.runnable(*args, **kwargs) + else: + assert self.cudagraph_wrapper is not None + return self.cudagraph_wrapper(*args, **kwargs) + + attn_metadata = forward_context.attn_metadata + num_tokens = (ubatch_slices[0].token_slice.stop - + ubatch_slices[0].token_slice.start) * 2 + input_ids = kwargs['input_ids'] + positions = kwargs['positions'] + intermediate_tensors = kwargs['intermediate_tensors'] + inputs_embeds = kwargs['inputs_embeds'] + compute_stream = torch.cuda.current_stream() + + dp_metadata = forward_context.dp_metadata + + # We shouldn't be here unless we are running with multiple DP ranks + assert dp_metadata is not None + + if num_tokens not in self.cudagraphs \ + and cudagraph_runtime_mode is CUDAGraphMode.FULL: + ubatch_metadata = self._make_ubatch_metadata( + ubatch_slices=ubatch_slices, + attn_metadata=attn_metadata, + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + compute_stream=compute_stream, + dp_metadata=dp_metadata, + batch_descriptor=batch_descriptor, + cudagraph_runtime_mode=CUDAGraphMode.NONE) + + return self._capture_ubatches(ubatch_metadata, self.model) + elif num_tokens in self.cudagraphs: + cudagraph_metadata = self.cudagraphs[num_tokens] + cudagraph_metadata.cudagraph.replay() + return cudagraph_metadata.outputs + else: + ubatch_metadata = self._make_ubatch_metadata( + ubatch_slices=ubatch_slices, + attn_metadata=attn_metadata, + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + compute_stream=compute_stream, + dp_metadata=dp_metadata, + batch_descriptor=batch_descriptor, + cudagraph_runtime_mode=CUDAGraphMode.NONE) + return self._run_ubatches(ubatch_metadata, self.model) diff --git a/vllm/v1/worker/ubatch_splitting.py b/vllm/v1/worker/ubatch_splitting.py new file mode 100644 index 0000000000000..650f0ec5138db --- /dev/null +++ b/vllm/v1/worker/ubatch_splitting.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm.config import VllmConfig +from vllm.forward_context import DPMetadata +from vllm.logger import init_logger +from vllm.utils import round_up +from vllm.v1.worker.ubatch_utils import (UBatchSlice, UBatchSlices, + is_second_ubatch_empty) + +logger = init_logger(__name__) + + +def should_ubatch_with_num_tokens( + should_ubatch: bool, + orig_num_tokens_per_ubatch: int, + padded_num_tokens_per_ubatch: int, + vllm_config: VllmConfig, +) -> tuple[bool, Optional[torch.Tensor]]: + dp_size = 
vllm_config.parallel_config.data_parallel_size
+    dp_rank = vllm_config.parallel_config.data_parallel_rank
+    return DPMetadata.should_ubatch_across_dp(should_ubatch,
+                                              orig_num_tokens_per_ubatch,
+                                              padded_num_tokens_per_ubatch,
+                                              dp_size, dp_rank)
+
+
+def get_dp_padding_ubatch(
+        num_tokens_unpadded: int, num_tokens_padded: int,
+        should_attempt_ubatching: bool,
+        vllm_config: VllmConfig) -> tuple[bool, Optional[torch.Tensor]]:
+    """
+    1. Decides if each DP rank is going to microbatch. Either all ranks
+    run with microbatching or none of them do. If this function decides
+    not to run with microbatching, it will "abort", meaning that no padding
+    information will be returned to the caller; it will return (False, None).
+
+    2. Determines the total number of tokens that each rank will run.
+    All ranks will be padded out so that they run with the same number
+    of tokens.
+
+    Returns: tuple[
+        should_ubatch: Are all DP ranks going to microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding. Will be
+        None if should_ubatch is False
+    ]
+
+    """
+    assert num_tokens_padded >= num_tokens_unpadded
+    dp_size = vllm_config.parallel_config.data_parallel_size
+    if dp_size == 1:
+        # Early exit.
+        return False, None
+
+    # If this DP rank doesn't want to attempt microbatching
+    if not should_attempt_ubatching:
+        (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
+            False, 0, 0, vllm_config)
+        assert should_ubatch is False
+        assert num_tokens_across_dp is None
+        return should_ubatch, num_tokens_across_dp
+
+    # Round up to the next multiple of two for even divisibility
+    num_tokens_padded = round_up(num_tokens_padded, 2)
+    num_tokens_per_ubatch = num_tokens_padded // 2
+    should_ubatch = True
+
+    # Sanity check that the existing padding isn't giving us an empty second
+    # ubatch. Abort if so
+    if is_second_ubatch_empty(num_tokens_unpadded, num_tokens_padded):
+        logger.debug(
+            "Empty second µbatch detected: unpadded tokens: %s, padded "
+            "tokens: %s", num_tokens_unpadded, num_tokens_padded)
+        should_ubatch = False
+
+    # Note that we compute the number of padded tokens per ubatch
+    (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
+        should_ubatch, num_tokens_unpadded // 2, num_tokens_per_ubatch,
+        vllm_config)
+    if not should_ubatch:
+        assert num_tokens_across_dp is None
+        return should_ubatch, num_tokens_across_dp
+
+    assert num_tokens_across_dp is not None
+
+    max_tokens_across_dp_cpu = int(torch.max(num_tokens_across_dp).item())
+    num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                            dp_size,
+                                            device="cpu",
+                                            dtype=torch.int32)
+    return should_ubatch, num_tokens_after_padding
+
+
+def ubatch_split(
+    max_num_scheduled_tokens: int,
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    vllm_config: VllmConfig,
+) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
+    """
+    Coordinates amongst all DP ranks to determine if and how the full batch
+    should be split into microbatches.
+
+    Returns: tuple[
+        ubatch_slices: if this is set then all DP ranks have agreed to
+        microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding.
Will be + None if ubatch_slices is None + ] + + """ + parallel_config = vllm_config.parallel_config + # Don't bother with the should_ubatch handshaking unless microbatching + # is enabled + if not parallel_config.enable_dbo: + return (None, None) + + # Check preconditions for microbatching + should_attempt_ubatching = \ + parallel_config.enable_dbo and \ + num_tokens_unpadded >= \ + parallel_config.dbo_decode_token_threshold \ + and max_num_scheduled_tokens == 1 + + # Don't microbatch unless every other DP worker is also microbatching + num_tokens_after_padding = None + (should_ubatch, num_tokens_after_padding) = get_dp_padding_ubatch( + num_tokens_unpadded, num_tokens_padded, should_attempt_ubatching, + vllm_config) + if not should_ubatch: + return (None, None) + + # This doesn't actually pad the ubatch slices. It just initializes the + # split point to the padded value so that padding can be applied + # to the second ubatch in pad_out_ubatch_slice after attention + # metadata creation + assert num_tokens_after_padding is not None + total_num_tokens_per_ubatch = int(num_tokens_after_padding[0].item()) + padded_first_ubatch_slice = slice(0, total_num_tokens_per_ubatch) + padded_second_ubatch_slice = slice(total_num_tokens_per_ubatch, + num_tokens_unpadded) + + # Note there's an assumption here that there's 1 token per request + ubatch_slices = [ + UBatchSlice(padded_first_ubatch_slice, padded_first_ubatch_slice), + UBatchSlice(padded_second_ubatch_slice, padded_second_ubatch_slice) + ] + + return (ubatch_slices, num_tokens_after_padding) diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py new file mode 100644 index 0000000000000..6716d171cc701 --- /dev/null +++ b/vllm/v1/worker/ubatch_utils.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +from typing_extensions import TypeAlias + + +@dataclass +class UBatchSlice: + request_slice: slice + token_slice: slice + + +UBatchSlices: TypeAlias = list[UBatchSlice] + + +def is_second_ubatch_empty(orig_num_tokens_per_ubatch: int, + padded_num_tokens_per_ubatch: int) -> bool: + return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py new file mode 100644 index 0000000000000..9aeaa9909dc81 --- /dev/null +++ b/vllm/v1/worker/ubatching.py @@ -0,0 +1,211 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import threading +from typing import Optional + +import torch + +from vllm import forward_context +from vllm.forward_context import ForwardContext +from vllm.utils import current_stream + +_THREAD_ID_TO_CONTEXT: dict = {} +_CURRENT_CONTEXTS: list[Optional['UBatchContext']] = [None, None] + + +class UBatchContext: + """ + Context manager for micro-batching synchronization using threading events. 
+ """ + + def __init__(self, + id: int, + comm_stream: torch.cuda.Stream, + compute_stream: torch.cuda.Stream, + forward_context: ForwardContext, + ready_barrier: threading.Barrier, + cpu_wait_event: threading.Event, + cpu_signal_event: threading.Event, + gpu_comm_done_event: torch.cuda.Event, + gpu_compute_done_event: torch.cuda.Event, + schedule: str = "default"): + self.id = id + self.comm_stream = comm_stream + self.compute_stream = compute_stream + self.forward_context = forward_context + self.ready_barrier = ready_barrier + self.cpu_wait_event = cpu_wait_event + self.cpu_signal_event = cpu_signal_event + self.current_stream = compute_stream + self.gpu_comm_done_event = gpu_comm_done_event + self.gpu_compute_done_event = gpu_compute_done_event + self.schedule = schedule + self.recv_hook = None + + def __enter__(self): + global _CURRENT_CONTEXTS, _THREAD_ID_TO_CONTEXT + _THREAD_ID_TO_CONTEXT[threading.get_ident()] = self.id + _CURRENT_CONTEXTS[self.id] = self + self.ready_barrier.wait() + + self.cpu_wait_event.wait() + self.cpu_wait_event.clear() + self._restore_context() + # Assume we start on the compute stream + assert current_stream() == self.compute_stream + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + global _CURRENT_CONTEXTS, _THREAD_ID_TO_CONTEXT + _CURRENT_CONTEXTS[self.id] = None + del _THREAD_ID_TO_CONTEXT[threading.get_ident()] + self.maybe_run_recv_hook() + self.cpu_signal_event.set() + self.cpu_wait_event.clear() + self.current_stream = self.compute_stream + torch.cuda.set_stream(self.current_stream) + return False + + def _restore_context(self): + forward_context._forward_context = self.forward_context + torch.cuda.set_stream(self.current_stream) + + def update_stream(self, stream): + self.current_stream = stream + torch.cuda.set_stream(self.current_stream) + + def _signal_comm_done(self): + self.gpu_comm_done_event.record(self.comm_stream) + + def _signal_compute_done(self): + self.gpu_compute_done_event.record(self.compute_stream) + + def _wait_compute_done(self): + self.comm_stream.wait_event(self.gpu_compute_done_event) + + def _wait_comm_done(self): + self.compute_stream.wait_event(self.gpu_comm_done_event) + + def _cpu_yield(self): + # It is critical for correctness that only one thread is running + # at a time. 
These asserts just make sure that this is the only + # thread running before waking the other one up and going to sleep + assert forward_context._forward_context == self.forward_context + assert current_stream() == self.current_stream + assert not self.cpu_wait_event.is_set() + + self.cpu_signal_event.set() + self.cpu_wait_event.wait() + self.cpu_wait_event.clear() + self._restore_context() + + def switch_to_comm_sync(self): + self._signal_compute_done() + self.update_stream(self.comm_stream) + self._wait_comm_done() + + def maybe_run_recv_hook(self): + if self.recv_hook is not None: + self.recv_hook() + self.recv_hook = None + + def yield_(self): + self.current_stream = current_stream() + self._cpu_yield() + if self.current_stream != current_stream(): + self.update_stream(self.current_stream) + + def yield_and_switch_from_compute_to_comm(self): + assert current_stream() == self.compute_stream + self._signal_compute_done() + self._cpu_yield() + assert self.current_stream == self.compute_stream + self.update_stream(self.comm_stream) + self._wait_compute_done() + + def yield_and_switch_from_comm_to_compute(self): + assert current_stream() == self.comm_stream + self._signal_comm_done() + self._cpu_yield() + assert self.current_stream == self.comm_stream + self.update_stream(self.compute_stream) + self._wait_comm_done() + + +def dbo_enabled() -> bool: + return len(_THREAD_ID_TO_CONTEXT) > 0 + + +def dbo_current_ubatch_id() -> int: + if len(_THREAD_ID_TO_CONTEXT) == 0: + return 0 + return _THREAD_ID_TO_CONTEXT[threading.get_ident()] + + +def _register_ubatch_function(func): + + def wrapper(*args, **kwargs): + if len(_THREAD_ID_TO_CONTEXT) > 0: + ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()] + ctx = _CURRENT_CONTEXTS[ctx_idx] + func(ctx, *args, **kwargs) + + return wrapper + + +dbo_yield_and_switch_from_compute_to_comm = _register_ubatch_function( + UBatchContext.yield_and_switch_from_compute_to_comm) +dbo_yield_and_switch_from_comm_to_compute = _register_ubatch_function( + UBatchContext.yield_and_switch_from_comm_to_compute) +dbo_yield = _register_ubatch_function(UBatchContext.yield_) +dbo_maybe_run_recv_hook = _register_ubatch_function( + UBatchContext.maybe_run_recv_hook) +dbo_switch_to_comm_sync = _register_ubatch_function( + UBatchContext.switch_to_comm_sync) + + +def dbo_register_recv_hook(recv_hook): + if len(_THREAD_ID_TO_CONTEXT) > 0: + ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()] + next_ctx = _CURRENT_CONTEXTS[(ctx_idx + 1) % 2] + next_ctx.recv_hook = recv_hook + + +def make_ubatch_contexts( + num_micro_batches: int, + compute_stream: torch.cuda.Stream, + comm_stream: torch.cuda.Stream, + forward_contexts: list[ForwardContext], + ready_barrier: threading.Barrier, + schedule: str = "default", +) -> list[UBatchContext]: + assert num_micro_batches == 2, "only been tested with 2 micro-batches" + """ + Create a context manager for micro-batching synchronization. 
+ """ + cpu_events = [threading.Event() for _ in range(num_micro_batches)] + gpu_comm_done_events = [ + torch.cuda.Event() for _ in range(num_micro_batches) + ] + gpu_compute_done_events = [ + torch.cuda.Event() for _ in range(num_micro_batches) + ] + + assert len(forward_contexts) == 2 + + ctxs = [] + for i in range(num_micro_batches): + ctx = UBatchContext(id=i, + compute_stream=compute_stream, + comm_stream=comm_stream, + forward_context=forward_contexts[i], + ready_barrier=ready_barrier, + cpu_wait_event=cpu_events[i], + cpu_signal_event=cpu_events[(i + 1) % + num_micro_batches], + gpu_comm_done_event=gpu_comm_done_events[i], + gpu_compute_done_event=gpu_compute_done_events[i], + schedule=schedule) + ctxs.append(ctx) + + return ctxs diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 5ac7470c1ac90..fc831a73a75e3 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -130,9 +130,17 @@ class MultiModalBudget: @dataclass class AttentionGroup: backend: type[AttentionBackend] - metadata_builder: AttentionMetadataBuilder + metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] + def get_metadata_builder(self, + ubatch_id: Optional[int] = None + ) -> AttentionMetadataBuilder: + if ubatch_id is None: + return self.metadata_builders[0] + assert len(self.metadata_builders) > ubatch_id + return self.metadata_builders[ubatch_id] + def sanity_check_mm_encoder_outputs( mm_embeddings: MultiModalEmbeddings, From faa7a5daac8244376a3a182d6eee8d2e0f6d8127 Mon Sep 17 00:00:00 2001 From: lianyibo <lianyibo1@kunlunit.com> Date: Wed, 17 Sep 2025 01:36:58 +0800 Subject: [PATCH 322/584] [Bugfix] Fix unable to run encoder model when disable_hybrid_kv_cache_manager is true (#24571) Signed-off-by: lianyibo <lianyibo1@kunlunit.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> --- vllm/v1/core/kv_cache_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index f225b73264049..9fab36aba91b3 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -754,6 +754,10 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: True if all layers have the same type, False otherwise. """ + if not kv_cache_spec: + # Encoder-only models do not have KV cache, kv_cache_type can be + # regarded as uniform. + return True try: kv_cache_spec_values = list(kv_cache_spec.values()) _ = kv_cache_spec_values[0].merge(kv_cache_spec_values) From d593cf28fa020dbae53dae19122aac8aeeeae0bc Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 17 Sep 2025 01:46:46 +0800 Subject: [PATCH 323/584] [Misc] Add removed encoder-decoder models to previously supported models list (#24961) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/registry.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 6bb65ed6debc6..38f3d5c69b9ef 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -319,7 +319,17 @@ _SUBPROCESS_COMMAND = [ sys.executable, "-m", "vllm.model_executor.models.registry" ] -_PREVIOUSLY_SUPPORTED_MODELS = {"Phi3SmallForCausalLM": "0.9.2"} +_PREVIOUSLY_SUPPORTED_MODELS = { + "Phi3SmallForCausalLM": "0.9.2", + # encoder-decoder models except whisper + # have been removed for V0 deprecation. 
+ "BartModel": "0.10.2", + "BartForConditionalGeneration": "0.10.2", + "DonutForConditionalGeneration": "0.10.2", + "Florence2ForConditionalGeneration": "0.10.2", + "MBartForConditionalGeneration": "0.10.2", + "MllamaForConditionalGeneration": "0.10.2", +} @dataclass(frozen=True) From cd1f885bcfe3b1731c552495ce70d2abf63d1373 Mon Sep 17 00:00:00 2001 From: Sugar <64777228+Sugar-zsg@users.noreply.github.com> Date: Wed, 17 Sep 2025 01:52:31 +0800 Subject: [PATCH 324/584] Directly get max encoder len from VLLM config in V1 (#24866) Signed-off-by: Sugar-zsg <952242923@qq.com> --- vllm/attention/layers/cross_attention.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index c24fa4e15f679..9400c5bffa380 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -14,7 +14,6 @@ from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils import cdiv from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, subclass_attention_backend) @@ -23,9 +22,13 @@ from vllm.v1.kv_cache_interface import CrossAttentionSpec logger = init_logger(__name__) -def _get_max_encoder_len(vllm_config: VllmConfig) -> int: - return MULTIMODAL_REGISTRY.get_encdec_max_encoder_len( - vllm_config.model_config) +def _get_max_encoder_len(vllm_config: "VllmConfig") -> int: + """Gets the max number of encoder input tokens from the config. + """ + sc = vllm_config.scheduler_config + assert sc and isinstance(sc.max_num_encoder_input_tokens, int), \ + "max_num_encoder_input_tokens must be int for enc-dec models" + return sc.max_num_encoder_input_tokens def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray, From f4d6eb95cfdcee2c47bb408890346003bb5a5c20 Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Tue, 16 Sep 2025 11:41:12 -0700 Subject: [PATCH 325/584] [gpt-oss][1b] streaming add item id, content id (#24788) Signed-off-by: Andrew Xia <axia@meta.com> --- .../openai/test_response_api_with_harmony.py | 23 +++++++++++++++++++ vllm/entrypoints/openai/serving_responses.py | 10 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 88b3795abe73e..0776f217f44a2 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -318,6 +318,9 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool): background=background, ) + current_item_id = "" + current_content_index = -1 + events = [] current_event_mode = None resp_id = None @@ -329,6 +332,26 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool): current_event_mode = event.type print(f"\n[{event.type}] ", end="", flush=True) + # verify current_item_id is correct + if event.type == "response.output_item.added": + assert event.item.id != current_item_id + current_item_id = event.item.id + elif event.type in [ + "response.output_text.delta", + "response.reasoning_text.delta" + ]: + assert event.item_id == current_item_id + + # verify content_index_id is correct + if event.type == "response.content_part.added": + assert event.content_index != current_content_index + 
From d593cf28fa020dbae53dae19122aac8aeeeae0bc Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 17 Sep 2025 01:46:46 +0800
Subject: [PATCH 323/584] [Misc] Add removed encoder-decoder models to
 previously supported models list (#24961)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/registry.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 6bb65ed6debc6..38f3d5c69b9ef 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -319,7 +319,17 @@ _SUBPROCESS_COMMAND = [
     sys.executable, "-m", "vllm.model_executor.models.registry"
 ]
 
-_PREVIOUSLY_SUPPORTED_MODELS = {"Phi3SmallForCausalLM": "0.9.2"}
+_PREVIOUSLY_SUPPORTED_MODELS = {
+    "Phi3SmallForCausalLM": "0.9.2",
+    # Encoder-decoder models (except Whisper) have been
+    # removed as part of the V0 deprecation.
+    "BartModel": "0.10.2",
+    "BartForConditionalGeneration": "0.10.2",
+    "DonutForConditionalGeneration": "0.10.2",
+    "Florence2ForConditionalGeneration": "0.10.2",
+    "MBartForConditionalGeneration": "0.10.2",
+    "MllamaForConditionalGeneration": "0.10.2",
+}
 
 
 @dataclass(frozen=True)

From cd1f885bcfe3b1731c552495ce70d2abf63d1373 Mon Sep 17 00:00:00 2001
From: Sugar <64777228+Sugar-zsg@users.noreply.github.com>
Date: Wed, 17 Sep 2025 01:52:31 +0800
Subject: [PATCH 324/584] Directly get max encoder len from VLLM config in V1
 (#24866)

Signed-off-by: Sugar-zsg <952242923@qq.com>
---
 vllm/attention/layers/cross_attention.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index c24fa4e15f679..9400c5bffa380 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -14,7 +14,6 @@ from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.utils import cdiv
 from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
                                               subclass_attention_backend)
@@ -23,9 +22,13 @@ from vllm.v1.kv_cache_interface import CrossAttentionSpec
 
 logger = init_logger(__name__)
 
-def _get_max_encoder_len(vllm_config: VllmConfig) -> int:
-    return MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
-        vllm_config.model_config)
+def _get_max_encoder_len(vllm_config: "VllmConfig") -> int:
+    """Gets the max number of encoder input tokens from the config.
+    """
+    sc = vllm_config.scheduler_config
+    assert sc and isinstance(sc.max_num_encoder_input_tokens, int), \
+        "max_num_encoder_input_tokens must be int for enc-dec models"
+    return sc.max_num_encoder_input_tokens
 
 
 def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,

From f4d6eb95cfdcee2c47bb408890346003bb5a5c20 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Tue, 16 Sep 2025 11:41:12 -0700
Subject: [PATCH 325/584] [gpt-oss][1b] streaming add item id, content id
 (#24788)

Signed-off-by: Andrew Xia <axia@meta.com>
---
 .../openai/test_response_api_with_harmony.py | 23 +++++++++++++++++++
 vllm/entrypoints/openai/serving_responses.py | 10 ++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 88b3795abe73e..0776f217f44a2 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -318,6 +318,9 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
         background=background,
     )
 
+    current_item_id = ""
+    current_content_index = -1
+
     events = []
     current_event_mode = None
     resp_id = None
@@ -329,6 +332,26 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
             current_event_mode = event.type
             print(f"\n[{event.type}] ", end="", flush=True)
 
+        # verify current_item_id is correct
+        if event.type == "response.output_item.added":
+            assert event.item.id != current_item_id
+            current_item_id = event.item.id
+        elif event.type in [
+                "response.output_text.delta",
+                "response.reasoning_text.delta"
+        ]:
+            assert event.item_id == current_item_id
+
+        # verify content_index_id is correct
+        if event.type == "response.content_part.added":
+            assert event.content_index != current_content_index
+            current_content_index = event.content_index
+        elif event.type in [
+                "response.output_text.delta",
+                "response.reasoning_text.delta"
+        ]:
+            assert event.content_index == current_content_index
+
         if "text.delta" in event.type:
             print(event.delta, end="", flush=True)
         elif "reasoning_text.delta" in event.type:
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 9e285e6e51756..7be5e54208bd4 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -1260,9 +1260,9 @@ class OpenAIServingResponses(OpenAIServing):
         _increment_sequence_number_and_return: Callable[[BaseModel],
                                                         BaseModel],
     ) -> AsyncGenerator[BaseModel, None]:
-        current_content_index = 0  # FIXME: this number is never changed
+        current_content_index = -1
        current_output_index = 0
-        current_item_id = ""  # FIXME: this number is never changed
+        current_item_id: str = ""
         sent_output_item_added = False
 
         async for ctx in result_generator:
@@ -1353,6 +1353,7 @@ class OpenAIServingResponses(OpenAIServing):
                         and ctx.parser.current_recipient is None):
                     if not sent_output_item_added:
                         sent_output_item_added = True
+                        current_item_id = f"msg_{random_uuid()}"
                         yield _increment_sequence_number_and_return(
                             openai_responses_types.
                             ResponseOutputItemAddedEvent(
@@ -1368,6 +1369,7 @@ class OpenAIServingResponses(OpenAIServing):
                                     status="in_progress",
                                 ),
                             ))
+                    current_content_index += 1
                     yield _increment_sequence_number_and_return(
                         openai_responses_types.
                         ResponseContentPartAddedEvent(
@@ -1398,6 +1400,7 @@ class OpenAIServingResponses(OpenAIServing):
                         and ctx.parser.current_recipient is None):
                     if not sent_output_item_added:
                         sent_output_item_added = True
+                        current_item_id = f"msg_{random_uuid()}"
                         yield _increment_sequence_number_and_return(
                             openai_responses_types.
                             ResponseOutputItemAddedEvent(
@@ -1412,6 +1415,7 @@ class OpenAIServingResponses(OpenAIServing):
                                     status="in_progress",
                                 ),
                             ))
+                    current_content_index += 1
                     yield _increment_sequence_number_and_return(
                         openai_responses_types.
                         ResponseContentPartAddedEvent(
@@ -1444,6 +1448,7 @@ class OpenAIServingResponses(OpenAIServing):
                     ) and ctx.parser.current_recipient == "python":
                     if not sent_output_item_added:
                         sent_output_item_added = True
+                        current_item_id = f"tool_{random_uuid()}"
                         yield _increment_sequence_number_and_return(
                             openai_responses_types.
ResponseOutputItemAddedEvent( @@ -1516,6 +1521,7 @@ class OpenAIServingResponses(OpenAIServing): raise ValueError( f"Unknown function name: {function_name}") + current_item_id = f"tool_{random_uuid()}" yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemAddedEvent( type="response.output_item.added", From 218454b9b26cd2185cdf84e3ec9f58538185d06b Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Tue, 16 Sep 2025 12:07:34 -0700 Subject: [PATCH 326/584] [MISC] Add code owners of vllm/v1 to vllm/v1/core (#24928) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e3dbd28fa91e9..73184d4e6b125 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -35,7 +35,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/triton_attn.py @tdoublep -/vllm/v1/core @heheda12345 +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345 # Test ownership @@ -54,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/v1/core @heheda12345 +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep From dcf2f3ec067711ff69e5ab7478fca6ffb4f11daf Mon Sep 17 00:00:00 2001 From: Concurrensee <yida.wu@amd.com> Date: Tue, 16 Sep 2025 14:49:06 -0500 Subject: [PATCH 327/584] [ROCm] Add dependencies for ROCm (#24900) Signed-off-by: Yida Wu <yida.wu@amd.com> --- requirements/rocm-build.txt | 1 + requirements/rocm-test.txt | 1 + requirements/rocm.txt | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index affe562c24f6b..a86a8ab6df149 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -14,3 +14,4 @@ setuptools-scm>=8 wheel jinja2>=3.1.6 amdsmi==6.2.4 +timm>=1.0.17 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 25f950a99eceb..869fb28c3d85c 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -1,5 +1,6 @@ # Common dependencies -r common.txt +tblib==3.1.0 # entrypoints test # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 8e39951210714..c129dd345c81a 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -17,4 +17,5 @@ setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 -conch-triton-kernels==1.2.1 \ No newline at end of file +conch-triton-kernels==1.2.1 +timm>=1.0.17 \ No newline at end of file From 86daa875fe1a1cf76709af1637f95891d3ef0707 Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Tue, 16 Sep 2025 12:56:16 -0700 Subject: [PATCH 328/584] [gpt-oss][1][bugfix] fix streaming final output (#24466) Signed-off-by: Andrew Xia <axia@meta.com> --- .../openai/test_response_api_with_harmony.py | 2 + tests/entrypoints/test_context.py | 83 ++++++++++++++++++- 
 vllm/entrypoints/context.py                   | 11 ++-
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 0776f217f44a2..eceaff672112f 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -364,6 +364,8 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
         events.append(event)
 
     assert len(events) > 0
+    response_completed_event = events[-1]
+    assert len(response_completed_event.response.output) > 0
 
     if background:
         starting_after = 5
diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py
index 5e6a4c85ff790..2afe9758c2adf 100644
--- a/tests/entrypoints/test_context.py
+++ b/tests/entrypoints/test_context.py
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock, patch
 
 import pytest
-from openai_harmony import StreamState
+from openai_harmony import Author, Message, Role, StreamState, TextContent
 
 from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -312,9 +312,9 @@ async def test_negative_tool_tokens_edge_case():
 @pytest.mark.asyncio
 async def test_streaming_multi_turn_token_counting(mock_parser):
     """Test token counting for streaming multi-turn conversations.
-    
-    This test focuses on how StreamingHarmonyContext counts tokens in a 
-    multi-turn conversation with streaming (token-by-token) outputs and 
+
+    This test focuses on how StreamingHarmonyContext counts tokens in a
+    multi-turn conversation with streaming (token-by-token) outputs and
     message boundaries.
     """
     # Create a streaming context
@@ -423,3 +423,78 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     additional_tool_tokens = 13 - 8 - 3  # = 2
     assert context.num_tool_output_tokens == expected_tool_tokens \
         + additional_tool_tokens
+
+
+@pytest.mark.asyncio
+async def test_streaming_message_synchronization(mock_parser):
+    """Test message synchronization logic from lines 413-417 in context.py.
+
+    This test verifies that when parser.messages contains more messages than
+    the context's _messages (minus initial messages), the context properly
+    extends its message list with the new parser messages.
+    """
+
+    # Create a streaming context with some initial messages
+    initial_messages = [
+        Message(
+            author=Author(role=Role.USER, name="user"),
+            content=[TextContent(text="Hello")],
+            recipient=Role.ASSISTANT,
+        )
+    ]
+    context = StreamingHarmonyContext(messages=initial_messages,
+                                      available_tools=[])
+
+    # Verify initial state
+    assert len(context._messages) == 1
+    assert context.num_init_messages == 1
+
+    # Mock parser to have more messages than context
+    # Simulate the parser having produced a new message
+    mock_parser.messages = [
+        Message(
+            author=Author(role=Role.ASSISTANT, name="assistant"),
+            content=[TextContent(text="Response 1")],
+            recipient=Role.USER,
+        ),
+    ]
+
+    # This should trigger the message synchronization logic
+    context.append_output(
+        create_mock_request_output(prompt_token_ids=[1, 2, 3],
+                                   output_token_ids=[101],
+                                   finished=False))
+
+    # Verify that messages were synchronized
+    assert len(context._messages) == 2
+
+    # Verify the new messages were added correctly
+    assert context._messages[1].content[0].text == "Response 1"
+
+    # Test the specific condition from line 413-414:
+    # len(self._messages) - self.num_init_messages < len(self.parser.messages)
+    messages_minus_init = len(context._messages) - context.num_init_messages
+    parser_messages_count = len(mock_parser.messages)
+
+    # After synchronization, they should be equal (no longer less than)
+    assert messages_minus_init == parser_messages_count
+
+    # Test edge case: add one more parser message
+    mock_parser.messages.append(
+        Message(
+            author=Author(role=Role.ASSISTANT, name="assistant"),
+            content=[TextContent(text="Response 4")],
+            recipient=Role.USER,
+        ))
+
+    # Create another output to trigger synchronization again
+    mock_output2 = create_mock_request_output(prompt_token_ids=[1, 2, 3],
+                                              output_token_ids=[102],
+                                              finished=True)
+
+    context.append_output(mock_output2)
+
+    # Verify the fourth message was added, num_init_messages is still 1
+    assert len(context._messages) == 3
+    assert context.num_init_messages == 1
+    assert context._messages[2].content[0].text == "Response 4"
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 6658f91595e51..8619452f2445f 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -151,6 +151,9 @@ class HarmonyContext(ConversationContext):
             self._update_decode_token_usage(output)
             # Move current turn to previous turn for next turn's calculations
             self.previous_turn = self.current_turn.copy()
+        # append_output is called only once before tool calling
+        # in non-streaming case
+        # so we can append all the parser messages to _messages
         output_msgs = self.parser.messages
         # The responses finish reason is set in the last message
         self.finish_reason = output.outputs[0].finish_reason
@@ -387,7 +390,7 @@ class StreamingHarmonyContext(HarmonyContext):
 
     @property
     def messages(self) -> list:
-        return self.parser.messages
+        return self._messages
 
     def append_output(self, output: Union[RequestOutput,
                                           list[Message]]) -> None:
@@ -412,6 +415,11 @@ class StreamingHarmonyContext(HarmonyContext):
             # Check if the current token is part of reasoning content
             self._update_num_reasoning_tokens()
             self.last_tok = tok
+            if len(self._messages) - self.num_init_messages < len(
+                    self.parser.messages):
+                self._messages.extend(
+                    self.parser.messages[len(self._messages) -
+                                         self.num_init_messages:])
         else:
             # Handle the case of tool output in direct message format
             assert len(output) == 1, "Tool output should be a single message"
@@ -424,6 +432,7 @@ class StreamingHarmonyContext(HarmonyContext):
             for tok in toks:
                 self.parser.process(tok)
             self.last_tok = toks[-1]
+            # TODO: add tool_output messages to self._messages
 
     def is_expecting_start(self) -> bool:
         return self.parser.state == StreamState.EXPECT_START
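The slice arithmetic in the synchronization branch is the subtle part of this fix. A standalone sketch of just that arithmetic, with plain strings standing in for `Message` objects (everything here is illustrative, not vLLM API):

```python
# Keep the context's list in step with the parser without re-appending the
# initial (num_init_messages) entries, which the parser never sees.
parser_messages = ["m1", "m2", "m3"]   # messages the parser has produced
context_messages = ["init", "m1"]      # one initial message + one synced one
num_init_messages = 1

if len(context_messages) - num_init_messages < len(parser_messages):
    context_messages.extend(
        parser_messages[len(context_messages) - num_init_messages:])

assert context_messages == ["init", "m1", "m2", "m3"]
```

Offsetting the slice by `num_init_messages` is what prevents both duplication of already-synced messages and re-insertion of the initial conversation.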
+ """ + + # Create a streaming context with some initial messages + initial_messages = [ + Message( + author=Author(role=Role.USER, name="user"), + content=[TextContent(text="Hello")], + recipient=Role.ASSISTANT, + ) + ] + context = StreamingHarmonyContext(messages=initial_messages, + available_tools=[]) + + # Verify initial state + assert len(context._messages) == 1 + assert context.num_init_messages == 1 + + # Mock parser to have more messages than context + # Simulate parser having processed 3 new messages + mock_parser.messages = [ + Message( + author=Author(role=Role.ASSISTANT, name="assistant"), + content=[TextContent(text="Response 1")], + recipient=Role.USER, + ), + ] + + # This should trigger the message synchronization logic + context.append_output( + create_mock_request_output(prompt_token_ids=[1, 2, 3], + output_token_ids=[101], + finished=False)) + + # Verify that messages were synchronized + assert len(context._messages) == 2 + + # Verify the new messages were added correctly + assert context._messages[1].content[0].text == "Response 1" + + # Test the specific condition from line 413-414: + # len(self._messages) - self.num_init_messages < len(self.parser.messages) + messages_minus_init = len(context._messages) - context.num_init_messages + parser_messages_count = len(mock_parser.messages) + + # After synchronization, they should be equal (no longer less than) + assert messages_minus_init == parser_messages_count + + # Test edge case: add one more parser message + mock_parser.messages.append( + Message( + author=Author(role=Role.ASSISTANT, name="assistant"), + content=[TextContent(text="Response 4")], + recipient=Role.USER, + )) + + # Create another output to trigger synchronization again + mock_output2 = create_mock_request_output(prompt_token_ids=[1, 2, 3], + output_token_ids=[102], + finished=True) + + context.append_output(mock_output2) + + # Verify the fourth message was added, num_init_messages is still 1 + assert len(context._messages) == 3 + assert context.num_init_messages == 1 + assert context._messages[2].content[0].text == "Response 4" diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 6658f91595e51..8619452f2445f 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -151,6 +151,9 @@ class HarmonyContext(ConversationContext): self._update_decode_token_usage(output) # Move current turn to previous turn for next turn's calculations self.previous_turn = self.current_turn.copy() + # append_output is called only once before tool calling + # in non-streaming case + # so we can append all the parser messages to _messages output_msgs = self.parser.messages # The responses finish reason is set in the last message self.finish_reason = output.outputs[0].finish_reason @@ -387,7 +390,7 @@ class StreamingHarmonyContext(HarmonyContext): @property def messages(self) -> list: - return self.parser.messages + return self._messages def append_output(self, output: Union[RequestOutput, list[Message]]) -> None: @@ -412,6 +415,11 @@ class StreamingHarmonyContext(HarmonyContext): # Check if the current token is part of reasoning content self._update_num_reasoning_tokens() self.last_tok = tok + if len(self._messages) - self.num_init_messages < len( + self.parser.messages): + self._messages.extend( + self.parser.messages[len(self._messages) - + self.num_init_messages:]) else: # Handle the case of tool output in direct message format assert len(output) == 1, "Tool output should be a single message" @@ -424,6 +432,7 @@ class 
StreamingHarmonyContext(HarmonyContext): for tok in toks: self.parser.process(tok) self.last_tok = toks[-1] + # TODO: add tool_output messages to self._messages def is_expecting_start(self) -> bool: return self.parser.state == StreamState.EXPECT_START From 02d4b854543c3b2c65435a5ed9bb1c3a9856cfad Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Tue, 16 Sep 2025 16:06:56 -0500 Subject: [PATCH 329/584] Use kwargs for long lists of `EngineCoreRequest` arguments in tests and fix extra kwargs (#24987) Signed-off-by: Andrew Sansom <andrew@protopia.ai> --- tests/detokenizer/test_min_tokens.py | 18 ++++++++---------- tests/tokenization/test_detokenize.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py index 887e83342536e..26003373c569c 100644 --- a/tests/detokenizer/test_min_tokens.py +++ b/tests/detokenizer/test_min_tokens.py @@ -31,16 +31,14 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str): stop=stop, min_tokens=min_tokens, ) - request = EngineCoreRequest("", - prompt_token_ids, - None, - None, - None, - params, - None, - None, - 0.0, - None, + request = EngineCoreRequest(request_id="", + prompt_token_ids=prompt_token_ids, + mm_features=None, + sampling_params=params, + pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, cache_salt=None, data_parallel_rank=None) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index ea7ccfbb2b456..527aad97d4fae 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -61,14 +61,14 @@ def _run_incremental_decode(tokenizer, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens, ) - request = EngineCoreRequest("", - prompt_token_ids, - None, - params, - None, - None, - 0.0, - None, + request = EngineCoreRequest(request_id="", + prompt_token_ids=prompt_token_ids, + mm_features=None, + sampling_params=params, + pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, cache_salt=None, data_parallel_rank=None) From 3053a22b330cd7170dce6f33f3a2043c64a99599 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:27:11 -0700 Subject: [PATCH 330/584] fp8 kv cache support fix for torch.compile (#22758) Signed-off-by: Aleksandr Malyshev <maleksan@amd.com> Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Aleksandr Malyshev <maleksan@amd.com> Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> --- vllm/model_executor/layers/quantization/kv_cache.py | 4 +++- vllm/v1/attention/backends/triton_attn.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 4c6fcda893a03..275a1c43fdd2b 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -125,7 +125,9 @@ class BaseKVCacheMethod(QuantizeMethodBase): # These are used in the final Attention.forward() layer._q_scale.copy_(q_scale) - layer._q_scale_float = q_scale + layer._q_scale_float = q_scale.item() if isinstance( + q_scale, torch.Tensor) else q_scale + layer._prob_scale.copy_(prob_scale) if 
layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0): diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index c294a5a73cbdd..784912a122f68 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -361,7 +361,7 @@ class TritonAttentionImpl(AttentionImpl): key_cache = key_cache.view(self.fp8_dtype) value_cache = value_cache.view(self.fp8_dtype) num_tokens, num_heads, head_size = query.shape - assert layer._q_scale == 1.0, \ + assert layer._q_scale_float == 1.0, \ "A non 1.0 q_scale is not currently supported." if current_platform.is_cuda(): # Skip Q quantization on ROCm and XPU, enable this on cuda From dbebb7f812123b4a0efe5b085582d3345fe7f740 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 16 Sep 2025 17:45:10 -0400 Subject: [PATCH 331/584] [Perf] Reuse workspace for FP8+FP4 Marlin MoE (#20500) Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../compressed_tensors/compressed_tensors_moe.py | 6 ++++-- vllm/model_executor/layers/quantization/fp8.py | 3 ++- vllm/model_executor/layers/quantization/modelopt.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c2b884c058d3a..5470deb768450 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -398,7 +398,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): quant_type_id=scalar_types.float4_e2m1f.id, apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, - expert_map=expert_map) + expert_map=expert_map, + workspace=layer.workspace) # FlashInfer fused experts path if self.fused_experts is not None: @@ -940,7 +941,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): quant_type_id=scalar_types.float8_e4m3fn.id, apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, - expert_map=expert_map) + expert_map=expert_map, + workspace=layer.workspace) assert self.fused_experts_func is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 49ff87df93c31..254cc2be05ee6 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1103,7 +1103,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_type_id=scalar_types.float8_e4m3fn.id, apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, - expert_map=expert_map) + expert_map=expert_map, + workspace=layer.workspace) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: assert self.block_quant is None assert (not renormalize and custom_routing_function is not None) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 9b99931e7b43f..60a79e53e8141 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1474,7 +1474,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): 
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
+                expert_map=expert_map,
+                workspace=layer.workspace)
 
         if self.fused_experts is not None:
             assert self.allow_flashinfer and \

From d119fc86140785e7efc8f125c17153544d1e0f20 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni001@gmail.com>
Date: Tue, 16 Sep 2025 18:55:02 -0400
Subject: [PATCH 332/584] [CI][Bugfix] Fix failing Blackwell test (#24993)

Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
---
 vllm/model_executor/layers/fused_moe/modular_kernel.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 33799b58d1998..efaa9cc058e41 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -506,12 +506,9 @@ class SharedResizableBuffer:
 
     def get(self, shape: tuple[int, ...], device: torch.device,
             dtype: torch.dtype):
         shape_numel = prod(shape)
-        if self.buffer is None or self.buffer.numel() < shape_numel:
+        if (self.buffer is None or self.buffer.numel() < shape_numel
+                or self.buffer.device != device or self.buffer.dtype != dtype):
             self.buffer = torch.empty(shape_numel, device=device, dtype=dtype)
-        assert self.buffer.device == device, \
-            f"Buffer device mismatch: {self.buffer.device} != {device}"
-        assert self.buffer.dtype == dtype, \
-            f"Buffer dtype mismatch: {self.buffer.dtype} != {dtype}"
         return self.buffer[:shape_numel].view(*shape)
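The change above turns a hard failure into a transparent reallocation. A toy illustration of the new behavior, under the assumption that `SharedResizableBuffer` can be constructed directly with no arguments (the class and import path come from the diff; everything else is hypothetical and needs a CUDA-capable PyTorch build):

```python
import torch
from vllm.model_executor.layers.fused_moe.modular_kernel import (
    SharedResizableBuffer)

buf = SharedResizableBuffer()
a = buf.get((4, 8), device=torch.device("cuda"), dtype=torch.bfloat16)
# Previously this call would trip the dtype assert; now the buffer is
# simply reallocated for the new dtype.
b = buf.get((4, 8), device=torch.device("cuda"), dtype=torch.float16)
assert b.dtype == torch.float16
```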
From 493b10f8bf38495654baa601e8ed8dc4ce1565b7 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Sep 2025 21:13:21 -0400
Subject: [PATCH 333/584] [CI] GPT-OSS GPQA eval test for Blackwell (#24920)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml                |  14 +++
 tests/evals/gpt_oss/__init__.py              |   2 +
 tests/evals/gpt_oss/conftest.py              |  18 ++++
 tests/evals/gpt_oss/test_gpqa_correctness.py | 102 +++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 tests/evals/gpt_oss/__init__.py
 create mode 100644 tests/evals/gpt_oss/conftest.py
 create mode 100644 tests/evals/gpt_oss/test_gpqa_correctness.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f0fd808fd6dce..6f06099edd53f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -821,6 +821,20 @@ steps:
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 
+- label: GPT-OSS Eval (Blackwell)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
 
diff --git a/tests/evals/gpt_oss/__init__.py b/tests/evals/gpt_oss/__init__.py
new file mode 100644
index 0000000000000..0fec1fe5bcdfd
--- /dev/null
+++ b/tests/evals/gpt_oss/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
diff --git a/tests/evals/gpt_oss/conftest.py b/tests/evals/gpt_oss/conftest.py
new file mode 100644
index 0000000000000..35528c0a6a36a
--- /dev/null
+++ b/tests/evals/gpt_oss/conftest.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Pytest configuration for GPT-OSS evaluation tests.
+"""
+
+
+def pytest_addoption(parser):
+    """Add command line options for pytest."""
+    parser.addoption("--model", action="store", help="Model name to evaluate")
+    parser.addoption("--metric",
+                     action="store",
+                     type=float,
+                     help="Expected metric threshold")
+    parser.addoption("--server-args",
+                     action="store",
+                     default="",
+                     help="Additional server arguments")
diff --git a/tests/evals/gpt_oss/test_gpqa_correctness.py b/tests/evals/gpt_oss/test_gpqa_correctness.py
new file mode 100644
index 0000000000000..4cc4041a60ce7
--- /dev/null
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+GPQA evaluation using vLLM server and GPT-OSS evaluation package.
+
+Usage:
+pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \
+    --model openai/gpt-oss-20b \
+    --metric 0.58 \
+    --server-args "--tensor-parallel-size 2"
+"""
+
+import subprocess
+import sys
+
+import regex as re
+
+from tests.utils import RemoteOpenAIServer
+
+TOL = 0.05  # Absolute tolerance for accuracy comparison
+
+
+def run_gpqa_eval(model_name: str, base_url: str) -> float:
+    """Run GPQA evaluation using the gpt-oss evaluation package."""
+
+    # Build the command to run the evaluation
+    cmd = [
+        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
+        model_name, "--reasoning-effort", "low", "--base-url", base_url
+    ]
+
+    try:
+        # Run the evaluation
+        result = subprocess.run(
+            cmd,
+            text=True,
+            capture_output=True,
+            timeout=1800,  # 30 minute timeout
+            env={"OPENAI_API_KEY": "dummy"})
+
+        print("Evaluation process output:\n", result.stdout)
+
+        # Parse the output to extract the score
+        match = re.search(r"'metric':\s*([\d.]+)", result.stdout)
+        if match:
+            return float(match.group(1))
+
+        # If we still can't find it, raise an error
+        raise ValueError(
+            f"Could not parse score from evaluation output:\n{result.stdout}")
+
+    except subprocess.TimeoutExpired as e:
+        raise RuntimeError("Evaluation timed out") from e
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(
+            f"Evaluation failed with exit code {e.returncode}:\n"
+            f"stdout: {e.stdout}\nstderr: {e.stderr}") from e
+
+
+def test_gpqa_correctness(request):
+    """Test GPQA correctness for GPT-OSS model."""
+
+    # Get command line arguments
+    model_name = request.config.getoption("--model")
+    expected_metric = request.config.getoption("--metric")
+    server_args_str = request.config.getoption("--server-args")
+
+    # Parse server arguments
+    server_args = []
+    if server_args_str:
+        server_args = server_args_str.split()
+
+    # Add standard server arguments
+    server_args.extend([
+        "--max-model-len",
+        "32768",
+        "--trust-remote-code",
+    ])
+
+    print(f"Starting GPQA evaluation for model: {model_name}")
+    print(f"Expected metric threshold: {expected_metric}")
+    print(f"Server args: {' '.join(server_args)}")
+
+    # Launch server and run evaluation
+    with RemoteOpenAIServer(model_name, server_args,
+                            max_wait_seconds=1800) as remote_server:
+        base_url = remote_server.url_for("v1")
+        print(f"Server started at: {base_url}")
+
+        measured_metric = run_gpqa_eval(model_name, base_url)
+
+        print(f"GPQA Results for {model_name}:")
+        print(f"  Measured metric: {measured_metric:.4f}")
+        print(f"  Expected metric: {expected_metric:.4f}")
+        print(f"  Tolerance: {TOL:.4f}")
+
+        # Verify metric is within tolerance
+        assert measured_metric >= expected_metric - TOL, (
+            f"GPQA metric too low: {measured_metric:.4f} < "
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")
+
+    print(f"✅ GPQA test passed for {model_name}")

From cef32104b4b411e5093581ad77d2e09a50c2837c Mon Sep 17 00:00:00 2001
From: Tahsin Tunan <tahsintunan@gmail.com>
Date: Wed, 17 Sep 2025 07:31:06 +0600
Subject: [PATCH 334/584] [FP8] Extend per-token-group quantization support to
 QuantFP8 (#24342)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
---
 .../kernels/bench_per_token_quant_fp8.py      | 263 ++++++++++++++----
 .../quantization/test_fp8_quant_group.py      | 150 ++++++++++
 .../layers/fused_moe/fused_moe.py             |   4 +-
 .../layers/quantization/input_quant_fp8.py    |  79 +++++-
 .../layers/quantization/utils/quant_utils.py  |   9 +
 5 files changed, 444 insertions(+), 61 deletions(-)
 create mode 100644 tests/kernels/quantization/test_fp8_quant_group.py

diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index 923d678f1f2db..9170361e974b6 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -2,14 +2,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from typing import Callable
+from unittest.mock import patch
 
+import pandas as pd
 import torch
 
-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+
+def with_triton_mode(fn):
+    """Temporarily force the Triton fallback path"""
+
+    def wrapped(*args, **kwargs):
+        with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+            return fn(*args, **kwargs)
+
+    return wrapped
 
 
 # TODO(luka): use standalone_compile utility
@@ -21,78 +32,236 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
     return inner
 
 
-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
+def bench_compile(fn: Callable):
+    # recompile for different shapes
+    fwd = torch.compile(fn, fullgraph=True, dynamic=False)
 
     # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
-def cuda_per_token_quant_fp8( - input: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor]: - return ops.scaled_fp8_quant(input) +torch._dynamo.config.recompile_limit = 8888 -def calculate_diff(batch_size: int, seq_len: int): - """Calculate difference between Triton and CUDA implementations.""" +def calculate_diff( + batch_size: int, + hidden_size: int, + group_shape: GroupShape, + dtype: torch.dtype, +): + """Calculate the difference between Inductor and CUDA implementations.""" device = torch.device("cuda") - x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device) + x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device) - torch_out, torch_scale = torch_per_token_quant_fp8(x) - cuda_out, cuda_scale = cuda_per_token_quant_fp8(x) + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False) - if torch.allclose( - cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5 - ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5): + torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x) + torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x) + cuda_out, cuda_scale = quant_fp8.forward_cuda(x) + + out_allclose = lambda o1, o2: torch.allclose( + o1.to(torch.float32), + o2.to(torch.float32), + rtol=1e-3, + atol=1e-5, + ) + scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5) + + if ( + out_allclose(cuda_out, torch_out) + and scale_allclose(cuda_scale, torch_scale) + and out_allclose(cuda_out, torch_eager_out) + and scale_allclose(cuda_scale, torch_eager_scale) + ): print("✅ All implementations match") else: print("❌ Implementations differ") -batch_size_range = [1, 16, 32, 64, 128] -seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] - -configs = list(itertools.product(batch_size_range, seq_len_range)) +configs = [] -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size", "seq_len"], - x_vals=configs, - line_arg="provider", - line_vals=["torch", "cuda"], - line_names=["Torch", "CUDA"], - styles=[("blue", "-"), ("green", "-")], - ylabel="us", - plot_name="per-token-dynamic-quant-fp8-performance", - args={}, - ) -) -def benchmark_quantization(batch_size, seq_len, provider): - dtype = torch.float16 +def benchmark_quantization( + batch_size, + hidden_size, + provider, + group_shape: GroupShape, + col_major: bool, + dtype: torch.dtype, +): device = torch.device("cuda") - x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype) + x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype) quantiles = [0.5, 0.2, 0.8] + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major) if provider == "torch": - fn = lambda: torch_per_token_quant_fp8(x.clone()) + fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone()) elif provider == "cuda": - fn = lambda: cuda_per_token_quant_fp8(x.clone()) + fn = lambda: quant_fp8.forward_cuda(x.clone()) + elif provider == "triton": + if not group_shape.is_per_group(): + # Triton only supported for per-group + return 0, 0, 0 + + fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone()) ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles) return 1000 * ms, 1000 * max_ms, 1000 * min_ms +# TODO(luka) extract to utils +def compute_geomean_speedups( + df: pd.DataFrame, + baseline_col: str, + speedup_cols: list[str], + groupby_cols: list[str] | None = None, +) -> pd.DataFrame: + """ + Compute geometric mean speedups over a 
baseline column. + + Args: + df: Input dataframe + baseline_col: Column to use as baseline + speedup_cols: Columns to compute speedups for + groupby_cols: Columns to group by. If None, compute over entire df. + + Returns: + pd.DataFrame with geometric mean speedups + """ + from scipy.stats import gmean + + def geo_speedup(group: pd.DataFrame) -> pd.Series: + ratios = { + col: (group[baseline_col] / group[col]).values for col in speedup_cols + } + return pd.Series({col: gmean(vals) for col, vals in ratios.items()}) + + if groupby_cols is None: + result = geo_speedup(df).to_frame().T + else: + result = ( + df.groupby(groupby_cols) + .apply(geo_speedup, include_groups=False) + .reset_index() + ) + + return result + + if __name__ == "__main__": - calculate_diff(batch_size=4, seq_len=4096) - benchmark_quantization.run(print_data=True) + parser = FlexibleArgumentParser( + description="Benchmark the various implementations of QuantFP8 (dynamic-only)" + ) + parser.add_argument("-c", "--check", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) + parser.add_argument( + "--hidden-sizes", + type=int, + nargs="+", + default=None, + help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)", + ) + parser.add_argument( + "--batch-sizes", + type=int, + nargs="+", + default=None, + help="Batch sizes to benchmark (default: 1,16,32,64,128)", + ) + parser.add_argument( + "--group-sizes", + type=int, + nargs="+", + default=None, + help="Group sizes for GroupShape(1,N) to benchmark. " + "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)", + ) + parser.add_argument( + "--no-column-major", + action="store_true", + help="Disable column-major scales testing", + ) + + args = parser.parse_args() + assert args + + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] + batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128] + + if args.group_sizes is not None: + group_shapes = [] + for size in args.group_sizes: + if size == 0: + group_shapes.append(GroupShape.PER_TENSOR) + elif size == -1: + group_shapes.append(GroupShape.PER_TOKEN) + else: + group_shapes.append(GroupShape(1, size)) + else: + group_shapes = [ + GroupShape.PER_TENSOR, + GroupShape.PER_TOKEN, + GroupShape(1, 64), + GroupShape(1, 128), + ] + + column_major_scales = [False] if args.no_column_major else [True, False] + + config_gen = itertools.product( + group_shapes, + column_major_scales, + batch_sizes, + hidden_sizes, + ) + + # filter out column-major scales for non-group, reverse order + configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1])) + + print(f"Running {len(configs)} configurations:") + print(f" Hidden sizes: {hidden_sizes}") + print(f" Batch sizes: {batch_sizes}") + print(f" Group shapes: {[str(g) for g in group_shapes]}") + print(f" Column major scales: {column_major_scales}") + print() + + if args.check: + for group_shape in group_shapes: + group_size = group_shape[1] + print(f"{group_size=}") + calculate_diff( + batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype + ) + + benchmark = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["hidden_size", "batch_size", "col_major", "group_shape"], + x_vals=configs, + line_arg="provider", + line_vals=["torch", "cuda", "triton"], + line_names=["Torch (Compiled)", "CUDA", "Triton"], + styles=[("blue", "-"), ("green", "-"), ("black", "-")], + ylabel="us", + 
plot_name="QuantFP8 performance", + args={}, + ) + )(benchmark_quantization) + + df = benchmark.run(print_data=True, dtype=dtype, return_df=True) + + # Print geomean speedups + geo_table_grouped = compute_geomean_speedups( + df, + baseline_col="Torch (Compiled)", + speedup_cols=["CUDA", "Triton"], + groupby_cols=["col_major", "group_shape"], + ) + + print("Speedup over Torch (Compiled)") + print(geo_table_grouped.to_string(index=False)) diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py new file mode 100644 index 0000000000000..720eee62760db --- /dev/null +++ b/tests/kernels/quantization/test_fp8_quant_group.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for QuantFP8 Group Quantization implementation.""" + +import pytest +import torch + +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape) +from vllm.platforms import current_platform + + +@pytest.mark.parametrize( + "batch_size,hidden_dim,group_size", + [ + (16, 256, 32), # Small + (64, 1024, 64), # Medium + (128, 2048, 128), # Large + (8, 513, 64), # Non-divisible (native only) + ]) +@pytest.mark.parametrize("seed", [42]) +@torch.inference_mode() +def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int, + group_size: int, seed: int) -> None: + """Test QuantFP8 group quantization with various configurations. + + Tests both CUDA and native implementations, column-major scales, + and verifies consistency between implementations. + """ + current_platform.seed_everything(seed) + + x = torch.randn( + (batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 + expected_num_groups = (hidden_dim + group_size - 1) // group_size + is_divisible = hidden_dim % group_size == 0 + + group_shape = GroupShape(1, group_size) + quant_op = QuantFP8(static=False, + group_shape=group_shape, + column_major_scales=False) + + # 1. Test native implementation (always available) + x_quant_native, scales_native = quant_op.forward_native(x.clone()) + assert x_quant_native.shape == x.shape + assert scales_native.shape == (batch_size, expected_num_groups) + + # 2. Test column-major scales configuration + quant_op_col = QuantFP8(static=False, + group_shape=group_shape, + column_major_scales=True) + _, scales_col = quant_op_col.forward_native(x.clone()) + assert scales_col.shape == (expected_num_groups, batch_size) + + # 3. 
Test CUDA implementation (only for divisible dimensions) + if is_divisible: + x_quant_cuda, scales_cuda = quant_op.forward_cuda(x.clone()) + assert x_quant_cuda.shape == x.shape + assert scales_cuda.shape == (batch_size, expected_num_groups) + + # Verify CUDA/native consistency + assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8) + + # Quantized values should mostly match + diff_count = (x_quant_cuda != x_quant_native).sum().item() + diff_ratio = diff_count / x_quant_cuda.numel() + assert diff_ratio < 0.002, f"Too many differences: {diff_ratio:.4%}" + + +@pytest.mark.parametrize("seed", [42]) +@torch.inference_mode() +def test_quantfp8_group_multidimensional(seed: int) -> None: + current_platform.seed_everything(seed) + + group_size = 64 + + # Test with 3D input + batch1, batch2, hidden_dim = 4, 8, 512 + x_3d = torch.randn( + (batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 + + group_shape = GroupShape(1, group_size) + quant_op = QuantFP8(static=False, + group_shape=group_shape, + column_major_scales=False) + + x_quant, scales = quant_op.forward_native(x_3d.clone()) + assert x_quant.shape == x_3d.shape + assert scales.shape == (batch1, batch2, hidden_dim // group_size) + + # Test column_major_scales with multi-dim + quant_op_col = QuantFP8(static=False, + group_shape=group_shape, + column_major_scales=True) + _, scales_col = quant_op_col.forward_native(x_3d.clone()) + assert scales_col.shape == (batch1, hidden_dim // group_size, batch2) + + # Test with 4D input + batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256 + x_4d = torch.randn((batch1, batch2, batch3, hidden_dim), + dtype=torch.bfloat16, + device="cuda") * 8 + + x_quant_4d, scales_4d = quant_op.forward_native(x_4d.clone()) + assert x_quant_4d.shape == x_4d.shape + assert scales_4d.shape == (batch1, batch2, batch3, + hidden_dim // group_size) + + _, scales_4d_col = quant_op_col.forward_native(x_4d.clone()) + assert scales_4d_col.shape == (batch1, batch2, hidden_dim // group_size, + batch3) + + +@pytest.mark.parametrize("seed", [42]) +@torch.inference_mode() +def test_quantfp8_group_edge_cases(seed: int) -> None: + current_platform.seed_everything(seed) + + batch_size = 16 + group_size = 64 + + # Test with single group (group_size >= hidden_dim) + x_small = torch.randn( + (batch_size, 32), dtype=torch.bfloat16, device="cuda") * 8 + group_shape = GroupShape(1, group_size) + quant_op = QuantFP8(static=False, + group_shape=group_shape, + column_major_scales=False) + + x_quant_small, scales_small = quant_op.forward_native(x_small.clone()) + assert x_quant_small.shape == x_small.shape + assert scales_small.shape == (batch_size, 1) + + # Test with zero inputs + x_zero = torch.zeros((batch_size, 256), + dtype=torch.bfloat16, + device="cuda") + x_quant_zero, scales_zero = quant_op.forward_native(x_zero.clone()) + assert x_quant_zero.shape == x_zero.shape + assert (scales_zero > 0).all(), "Scales should be clamped to minimum" + + # Test very large values + x_large = torch.full((batch_size, 256), + 1000.0, + dtype=torch.bfloat16, + device="cuda") + x_quant_large, scales_large = quant_op.forward_native(x_large.clone()) + assert x_quant_large.shape == x_large.shape + # FP8 max is typically 448 or 224, so scales should be > 1 + assert (scales_large > 1.0).all(), "Large values should have scales > 1" diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 30e46ffa7b176..36c2ab8b2d5f3 100644 --- 
a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -32,9 +32,11 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import ( - _resize_cache, moe_kernel_quantize_input, per_token_group_quant_fp8) + _resize_cache, moe_kernel_quantize_input) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( calculate_tile_tokens_dim) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( dequant_mxfp4) from vllm.platforms import current_platform diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index e1a9bdde9334d..31182f40b48f6 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -23,28 +23,39 @@ _FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0) @CustomOp.register("quant_fp8") class QuantFP8(CustomOp): """ - Quantize input tensor to per-tensor or per-token FP8. + Quantize input tensor to FP8 (per-tensor, per-token, or per-group). This CustomOp supports both static and dynamic quantization. """ def __init__(self, static: bool, group_shape: GroupShape, - num_token_padding: Optional[int] = None): + num_token_padding: Optional[int] = None, + column_major_scales: bool = False): """ - :param static: static or dynamic quantization - :param group_shape: quantization group shape (PER_TOKEN or PER_TENSOR) - :param num_token_padding: Pad the token dimension of output to this size + :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR, + or arbitrary block size) + :param num_token_padding: Pad the token dimension of output to this + size + :param column_major_scales: For group quantization, output scales in + column major format """ super().__init__() - self.num_token_padding = num_token_padding - assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR} - assert not static or group_shape == GroupShape.PER_TENSOR, \ - "Only per-tensor scales supported for static quantization." self.static = static self.group_shape = group_shape - self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN + self.num_token_padding = num_token_padding + self.column_major_scales = column_major_scales + + self.is_group_quant = group_shape.is_per_group() + if self.is_group_quant: + assert not static, "Group quantization only supports dynamic mode" + self.group_size = group_shape.col + else: + assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR} + assert not static or group_shape == GroupShape.PER_TENSOR, \ + "Only per-tensor scales supported for static quantization." 
+ self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN def forward_cuda( self, @@ -52,11 +63,19 @@ class QuantFP8(CustomOp): scale: Optional[torch.Tensor] = None, scale_ub: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if self.is_group_quant: + assert scale is None, "Group quantization is always dynamic" + from vllm.model_executor.layers.quantization.utils import fp8_utils + return fp8_utils.per_token_group_quant_fp8( + x, + group_size=self.group_size, + column_major_scales=self.column_major_scales, + dtype=_FP8_DTYPE) + assert (scale is not None) == self.static assert scale_ub is None or (not self.static and self.group_shape == GroupShape.PER_TOKEN and scale_ub.numel() == 1) - return ops.scaled_fp8_quant( x, scale, @@ -70,6 +89,10 @@ class QuantFP8(CustomOp): scale: Optional[torch.Tensor] = None, scale_ub: Optional[torch.Tensor] = None, ): + if self.is_group_quant: + assert scale is None, "Group quantization is always dynamic" + return self._quantize_group_native(x) + assert (scale is not None) == self.static assert scale_ub is None or (not self.static and self.group_shape == GroupShape.PER_TOKEN @@ -84,8 +107,7 @@ class QuantFP8(CustomOp): else: x_max = x.abs().max().unsqueeze(-1).to(torch.float32) - scale = x_max / _FP8_MAX - scale = scale.clamp(min=_FP8_MIN_SCALING_FACTOR) + scale = (x_max / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR) # Even for dynamic per-token scales, # reciprocal performs slightly better than division @@ -101,3 +123,34 @@ class QuantFP8(CustomOp): out = F.pad(out, (0, 0, 0, padding), "constant", 0.0) return out, scale + + def _quantize_group_native( + self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + orig_shape = x.shape + hidden_dim = x.shape[-1] + num_groups = (hidden_dim + self.group_size - 1) // self.group_size + padded_dim = num_groups * self.group_size + + if padded_dim != hidden_dim: + padding = padded_dim - hidden_dim + x = F.pad(x, (0, padding), mode='constant', value=0.0) + + x_grouped = x.view(-1, num_groups, self.group_size) + absmax = x_grouped.abs().max(dim=-1, keepdim=True)[0].float() + scales = (absmax / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR) + + x_scaled = x_grouped / scales + x_quant = x_scaled.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE) + + x_quant = x_quant.view(-1, padded_dim) + if padded_dim != hidden_dim: + x_quant = x_quant[..., :hidden_dim] + x_quant = x_quant.view(orig_shape) + + scales = scales.squeeze(-1) + scales = scales.reshape(orig_shape[:-1] + (num_groups, )) + + if self.column_major_scales: + scales = scales.transpose(-2, -1).contiguous() + + return x_quant, scales diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index f4ff875adb21c..5339c6043cc16 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -34,6 +34,15 @@ class GroupShape(_GroupShape): PER_TENSOR: ClassVar['GroupShape'] PER_TOKEN: ClassVar['GroupShape'] + def is_per_tensor(self) -> bool: + return self.row == -1 and self.col == -1 + + def is_per_token(self) -> bool: + return self.row == 1 and self.col == -1 + + def is_per_group(self) -> bool: + return self.row == 1 and self.col >= 1 + GroupShape.PER_TENSOR = GroupShape(-1, -1) GroupShape.PER_TOKEN = GroupShape(1, -1) From 64ad551878e9f0323407928fa747ce14c67c71f2 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels <benjamin@bartels.dev> Date: Wed, 17 Sep 2025 02:33:18 +0100 Subject: [PATCH 
335/584] Removes source compilation of nixl dependency (#24874) Signed-off-by: bbartels <benjamin@bartels.dev> Signed-off-by: Benjamin Bartels <benjamin@bartels.dev> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com> --- docker/Dockerfile | 20 +++- docs/serving/expert_parallel_deployment.md | 4 +- requirements/kv_connectors.txt | 3 +- tools/install_gdrcopy.sh | 57 +++++++++++ tools/install_nixl.sh | 109 --------------------- 5 files changed, 77 insertions(+), 116 deletions(-) create mode 100755 tools/install_gdrcopy.sh delete mode 100644 tools/install_nixl.sh diff --git a/docker/Dockerfile b/docker/Dockerfile index 17f8e6043f895..034f73736ca72 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -283,6 +283,10 @@ WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM +ARG GDRCOPY_CUDA_VERSION=12.8 +# Keep in line with FINAL_BASE_IMAGE +ARG GDRCOPY_OS_VERSION=Ubuntu22_04 + SHELL ["/bin/bash", "-c"] ARG DEADSNAKES_MIRROR_URL @@ -441,13 +445,21 @@ COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh RUN --mount=type=cache,target=/root/.cache/uv \ VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} -# Install EP kernels(pplx-kernels and DeepEP), NixL +COPY tools/install_gdrcopy.sh install_gdrcopy.sh +RUN set -eux; \ + case "${TARGETPLATFORM}" in \ + linux/arm64) UUARCH="aarch64" ;; \ + linux/amd64) UUARCH="x64" ;; \ + *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \ + esac; \ + ./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \ + rm ./install_gdrcopy.sh + +# Install EP kernels(pplx-kernels and DeepEP) COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh -COPY tools/install_nixl.sh install_nixl.sh ENV CUDA_HOME=/usr/local/cuda RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \ - && bash install_python_libraries.sh \ - && bash install_nixl.sh --force + && bash install_python_libraries.sh #################### vLLM installation IMAGE #################### diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 494d2ad021e71..7489fc2609831 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -10,7 +10,7 @@ Before using EP, you need to install the necessary dependencies. We are actively 1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](gh-file:tools/ep_kernels). 2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation). -3. **For disaggregated serving**: Install UCX and NIXL following the [script](gh-file:tools/install_nixl.sh). +3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](gh-file:tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). ### Backend Selection Guide @@ -191,7 +191,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok ### Setup Steps -1. **Install KV Connector**: Install NIXL using the [installation script](gh-file:tools/install_nixl.sh) +1. 
**Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, NIXL still works with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
 
 2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`
 
diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt
index 262675a231206..3b610e0d97363 100644
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@@ -1 +1,2 @@
-lmcache
\ No newline at end of file
+lmcache
+nixl >= 0.5.1 # Required for disaggregated prefill
diff --git a/tools/install_gdrcopy.sh b/tools/install_gdrcopy.sh
new file mode 100755
index 0000000000000..481723320c63b
--- /dev/null
+++ b/tools/install_gdrcopy.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Usage: install_gdrcopy.sh <GDRCOPY_OS_VERSION> <GDRCOPY_CUDA_VERSION> <uuarch>
+# uuarch must be "x64" or "aarch64"
+# Optional: set GDRCOPY_VERSION to override the libgdrapi package version (default: 2.5.1-1)
+# Requires: curl, apt-get, root privileges
+if [[ $(id -u) -ne 0 ]]; then
+  echo "Must be run as root" >&2
+
+  exit 1
+fi
+if [[ $# -ne 3 ]]; then
+  echo "Usage: $0 <GDRCOPY_OS_VERSION> <GDRCOPY_CUDA_VERSION> <uuarch(x64|aarch64)>" >&2
+  exit 1
+fi
+
+OS_VER="$1"
+CUDA_VER="$2"
+UUARCH_RAW="$3"
+
+# Normalize/validate arch
+case "${UUARCH_RAW,,}" in
+  aarch64|arm64)
+    URL_ARCH="aarch64"
+    DEB_ARCH="arm64"
+    ;;
+  x64|x86_64|amd64)
+    URL_ARCH="x64"
+    DEB_ARCH="amd64"
+    ;;
+  *)
+    echo "Unsupported uuarch: ${UUARCH_RAW}. Use 'x64' or 'aarch64'." >&2
+    exit 1
+    ;;
+esac
+
+OS_VER_LOWER="$(tr '[:upper:]' '[:lower:]' <<<"$OS_VER")"
+GDRCOPY_PKG_VER="${GDRCOPY_VERSION:-2.5.1-1}"
+
+DEB_NAME="libgdrapi_${GDRCOPY_PKG_VER}_${DEB_ARCH}.${OS_VER}.deb"
+BASE_URL="https://developer.download.nvidia.com/compute/redist/gdrcopy"
+URL="${BASE_URL}/CUDA%20${CUDA_VER}/${OS_VER_LOWER}/${URL_ARCH}/${DEB_NAME}"
+
+echo "Downloading: ${URL}"
+TMPDIR="$(mktemp -d)"
+trap 'rm -rf "${TMPDIR}"' EXIT
+
+curl -fSL "${URL}" -o "${TMPDIR}/${DEB_NAME}"
+
+export DEBIAN_FRONTEND=noninteractive
+apt-get update
+apt-get install -y "${TMPDIR}/${DEB_NAME}"
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+echo "Installed ${DEB_NAME}"
diff --git a/tools/install_nixl.sh b/tools/install_nixl.sh
deleted file mode 100644
index 56717cfb77f7b..0000000000000
--- a/tools/install_nixl.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/bash
-# Usage: ./install_nixl.sh [--force]
-
-FORCE=false
-if [ "$1" == "--force" ]; then
-    FORCE=true
-fi
-
-SUDO=false
-if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then
-    SUDO=true
-fi
-
-ARCH=$(uname -m)
-
-ROOT_DIR="/usr/local"
-mkdir -p "$ROOT_DIR"
-GDR_HOME="$ROOT_DIR/gdrcopy"
-UCX_HOME="$ROOT_DIR/ucx"
-NIXL_HOME="$ROOT_DIR/nixl"
-CUDA_HOME=/usr/local/cuda
-
-export PATH="$GDR_HOME/bin:$UCX_HOME/bin:$NIXL_HOME/bin:$PATH"
-export LD_LIBRARY_PATH="$GDR_HOME/lib:$UCX_HOME/lib:$NIXL_HOME/lib/$ARCH-linux-gnu:$LD_LIBRARY_PATH"
-
-TEMP_DIR="nixl_installer"
-mkdir -p "$TEMP_DIR"
-cd "$TEMP_DIR"
-
-pip install meson ninja pybind11
-
-if [ ! -e "/dev/gdrdrv" ] || [ "$FORCE" = true ]; then
-    echo "Installing gdrcopy\n"
-    wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz
-    tar xzf v2.5.tar.gz; rm v2.5.tar.gz
-    cd gdrcopy-2.5
-    make prefix=$GDR_HOME CUDA=$CUDA_HOME all install
-
-    if $SUDO; then
-        echo "Running insmod.sh with sudo"
-        sudo ./insmod.sh
-    else
-        echo "Skipping insmod.sh - sudo not available"
-        echo "Please run 'sudo ./gdrcopy-2.5/insmod.sh' manually if needed"
-    fi
-
-    cd ..
-else
-    echo "Found /dev/gdrdrv. Skipping gdrcopy installation"
-fi
-
-if ! command -v ucx_info &> /dev/null || [ "$FORCE" = true ]; then
-    echo "Installing UCX"
-    wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz
-    tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz
-    cd ucx-1.18.0
-
-    # Checking Mellanox NICs
-    MLX_OPTS=""
-    if lspci | grep -i mellanox > /dev/null || command -v ibstat > /dev/null; then
-        echo "Mellanox NIC detected, adding Mellanox-specific options"
-        MLX_OPTS="--with-rdmacm \
-                  --with-mlx5-dv \
-                  --with-ib-hw-tm"
-    fi
-
-    ./configure --prefix=$UCX_HOME \
-        --enable-shared \
-        --disable-static \
-        --disable-doxygen-doc \
-        --enable-optimizations \
-        --enable-cma \
-        --enable-devel-headers \
-        --with-cuda=$CUDA_HOME \
-        --with-dm \
-        --with-gdrcopy=$GDR_HOME \
-        --with-verbs \
-        --enable-mt \
-        $MLX_OPTS
-    make -j
-    make -j install-strip
-
-    if $SUDO; then
-        echo "Running ldconfig with sudo"
-        sudo ldconfig
-    else
-        echo "Skipping ldconfig - sudo not available"
-        echo "Please run 'sudo ldconfig' manually if needed"
-    fi
-
-    cd ..
-else
-    echo "Found existing UCX. Skipping UCX installation"
-fi
-
-if ! command -v nixl_test &> /dev/null || [ "$FORCE" = true ]; then
-    echo "Installing NIXL"
-    wget https://github.com/ai-dynamo/nixl/archive/refs/tags/0.2.0.tar.gz
-    tar xzf 0.2.0.tar.gz; rm 0.2.0.tar.gz
-    cd nixl-0.2.0
-    meson setup build --prefix=$NIXL_HOME -Ducx_path=$UCX_HOME
-    cd build
-    ninja
-    ninja install
-
-    cd ../..
-else
-    echo "Found existing NIXL. Skipping NIXL installation"
-fi
From 3059b9cc6bf7772ac53389e01c53e583e4dea0d0 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Wed, 17 Sep 2025 09:45:29 +0800
Subject: [PATCH 336/584] [Doc] Add --force-overwrite option to
 generate_cmake_presets.py (#24375)

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docs/contributing/incremental_build.md | 10 +++++++++
 tools/generate_cmake_presets.py        | 29 +++++++++++++++++++-------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/docs/contributing/incremental_build.md b/docs/contributing/incremental_build.md
index 0e34e69245afb..cc01a60ce1e7f 100644
--- a/docs/contributing/incremental_build.md
+++ b/docs/contributing/incremental_build.md
@@ -40,6 +40,16 @@ python tools/generate_cmake_presets.py
 ```
 
 The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
 
+**Force overwrite existing file:**
+
+To automatically overwrite an existing `CMakeUserPresets.json` without prompting, use the `--force-overwrite` flag:
+
+```console
+python tools/generate_cmake_presets.py --force-overwrite
+```
+
+This is particularly useful in automated scripts or CI/CD environments where interactive prompts are not desired.
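To make the CI use case concrete, the following is a minimal sketch of driving the generator from an automation script rather than an interactive shell; it assumes the snippet is executed from the vLLM repository root with `python` on `PATH`:

```python
# Minimal sketch: regenerate CMake presets non-interactively (e.g., in CI).
# Assumes the working directory is the vLLM repository root.
import subprocess


def regenerate_cmake_presets() -> None:
    # --force-overwrite suppresses the interactive y/N prompt that is
    # otherwise shown when CMakeUserPresets.json already exists.
    subprocess.run(
        ["python", "tools/generate_cmake_presets.py", "--force-overwrite"],
        check=True,  # propagate a non-zero exit code as a failed CI step
    )


if __name__ == "__main__":
    regenerate_cmake_presets()
```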
+ After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository. ### Example `CMakeUserPresets.json` diff --git a/tools/generate_cmake_presets.py b/tools/generate_cmake_presets.py index 5f92f2f5848fa..4869a71307e4e 100644 --- a/tools/generate_cmake_presets.py +++ b/tools/generate_cmake_presets.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse import json import multiprocessing import os @@ -26,7 +27,8 @@ def get_cpu_cores(): return multiprocessing.cpu_count() -def generate_presets(output_path="CMakeUserPresets.json"): +def generate_presets(output_path="CMakeUserPresets.json", + force_overwrite=False): """Generates the CMakeUserPresets.json file.""" print("Attempting to detect your system configuration...") @@ -143,12 +145,15 @@ def generate_presets(output_path="CMakeUserPresets.json"): output_file_path = os.path.join(project_root, output_path) if os.path.exists(output_file_path): - overwrite = input( - f"'{output_file_path}' already exists. Overwrite? (y/N): ").strip( - ).lower() - if overwrite != 'y': - print("Generation cancelled.") - return + if force_overwrite: + print(f"Overwriting existing file '{output_file_path}'") + else: + overwrite = input( + f"'{output_file_path}' already exists. Overwrite? (y/N): " + ).strip().lower() + if overwrite != 'y': + print("Generation cancelled.") + return try: with open(output_file_path, "w") as f: @@ -166,4 +171,12 @@ def generate_presets(output_path="CMakeUserPresets.json"): if __name__ == "__main__": - generate_presets() + parser = argparse.ArgumentParser() + parser.add_argument( + "--force-overwrite", + action="store_true", + help="Force overwrite existing CMakeUserPresets.json without prompting" + ) + + args = parser.parse_args() + generate_presets(force_overwrite=args.force_overwrite) From eeb135eb87277b388e1a2c97ff9c13fcd94fc6ae Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Tue, 16 Sep 2025 19:18:06 -0700 Subject: [PATCH 337/584] [Core] Use `CpuGpuBuffer` for block table tensors (#24795) Signed-off-by: Nick Hill <nhill@redhat.com> --- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 +- tests/v1/worker/test_gpu_input_batch.py | 5 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- vllm/v1/worker/block_table.py | 75 +++++++++----------- vllm/v1/worker/cpu_model_runner.py | 8 +-- vllm/v1/worker/gpu_model_runner.py | 24 +++---- 6 files changed, 53 insertions(+), 63 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index bd9b6131c2222..4f4a9c7db88a3 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -125,7 +125,7 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: return False num_blocks = block_table.num_blocks_per_row[req_index] - block_table_values = block_table.block_table_np[req_index, :num_blocks] + block_table_values = block_table.block_table.np[req_index, :num_blocks] return (block_table_values == req_block_ids).all() diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 38f543c784866..98700ff73fd1f 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -15,6 +15,7 @@ from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import 
LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -45,7 +46,7 @@ def _compare_objs(obj1, is_same = False if isinstance(a, torch.Tensor): - if (a.numel() == 0 or b.numel() == 0): + if a.numel() == 0 or b.numel() == 0: is_same = (a.numel() == 0 and b.numel() == 0) elif torch.allclose(a, b): is_same = True @@ -61,6 +62,8 @@ def _compare_objs(obj1, is_same = True # if we make it here must be same elif a == b: is_same = True + elif isinstance(a, CpuGpuBuffer): + is_same = np.allclose(a.np, b.np) and torch.allclose(a.gpu, b.gpu) assert is_same, f"Attribute {attr_name} is different"\ f" in {obj1} and {obj2}: {a} != {b}" diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 4ad8df1ce3868..8b571f95c5ecf 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -165,7 +165,7 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_state.block_ids[0]): return False num_blocks = block_table.num_blocks_per_row[req_index] - return (block_table.block_table_np[req_index, :num_blocks] == + return (block_table.block_table.np[req_index, :num_blocks] == req_state.block_ids[0]).all() diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 194984bf50536..82b6d1b514d50 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Union import numpy as np import torch @@ -7,6 +8,7 @@ import torch from vllm.distributed import get_dcp_group from vllm.logger import init_logger from vllm.utils import cdiv +from vllm.v1.utils import CpuGpuBuffer logger = init_logger(__name__) @@ -29,28 +31,13 @@ class BlockTable: self.pin_memory = pin_memory self.device = device - self.block_table = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device=self.device, - dtype=torch.int32, - ) - self.block_table_cpu = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device="cpu", - dtype=torch.int32, - pin_memory=pin_memory, - ) - self.block_table_np = self.block_table_cpu.numpy() + self.block_table = self._make_buffer(max_num_reqs, + max_num_blocks_per_req, + dtype=torch.int32) self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) - self.slot_mapping_cpu = torch.zeros(self.max_num_batched_tokens, - dtype=torch.int64, - device="cpu", - pin_memory=self.pin_memory) - self.slot_mapping_np = self.slot_mapping_cpu.numpy() - self.slot_mapping = torch.zeros(self.max_num_batched_tokens, - dtype=torch.int64, - device=self.device) + self.slot_mapping = self._make_buffer(self.max_num_batched_tokens, + dtype=torch.int64) try: self.dcp_world_size = get_dcp_group().world_size self.dcp_rank = get_dcp_group().rank_in_group @@ -69,7 +56,7 @@ class BlockTable: num_blocks = len(block_ids) start = self.num_blocks_per_row[row_idx] self.num_blocks_per_row[row_idx] += num_blocks - self.block_table_np[row_idx, start:start + num_blocks] = block_ids + self.block_table.np[row_idx, start:start + num_blocks] = block_ids def add_row(self, block_ids: list[int], row_idx: int) -> None: self.num_blocks_per_row[row_idx] = 0 @@ -77,17 +64,14 @@ class BlockTable: def move_row(self, src: int, tgt: int) -> None: num_blocks = 
self.num_blocks_per_row[src] - self.block_table_np[tgt, :num_blocks] = self.block_table_np[ - src, :num_blocks] + block_table_np = self.block_table.np + block_table_np[tgt, :num_blocks] = block_table_np[src, :num_blocks] self.num_blocks_per_row[tgt] = num_blocks def swap_row(self, src: int, tgt: int) -> None: - num_blocks_src = self.num_blocks_per_row[src] - num_blocks_tgt = self.num_blocks_per_row[tgt] - self.num_blocks_per_row[src] = num_blocks_tgt - self.num_blocks_per_row[tgt] = num_blocks_src - - self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]] + src_tgt, tgt_src = [src, tgt], [tgt, src] + self.num_blocks_per_row[src_tgt] = self.num_blocks_per_row[tgt_src] + self.block_table.np[src_tgt] = self.block_table.np[tgt_src] def compute_slot_mapping(self, req_indices: np.ndarray, positions: np.ndarray) -> None: @@ -107,7 +91,7 @@ class BlockTable: virtual_block_size = self.block_size * self.dcp_world_size block_table_indices = (req_indices * self.max_num_blocks_per_req + positions // virtual_block_size) - block_numbers = self.block_table_np.ravel()[block_table_indices] + block_numbers = self.block_table.np.ravel()[block_table_indices] # Use virtual_block_size for mask calculation, which marks local # tokens. virtual_block_offsets = positions % virtual_block_size @@ -117,40 +101,45 @@ class BlockTable: # Calculate slot_mapping slot_mapping = block_numbers * self.block_size + block_offsets # Write final slots, use -1 for not-local - self.slot_mapping_np[:req_indices.shape[0]] = np.where( + self.slot_mapping.np[:req_indices.shape[0]] = np.where( mask, slot_mapping, -1) else: block_table_indices = (req_indices * self.max_num_blocks_per_req + positions // self.block_size) - block_numbers = self.block_table_np.ravel()[block_table_indices] + block_numbers = self.block_table.np.ravel()[block_table_indices] block_offsets = positions % self.block_size np.add(block_numbers * self.block_size, block_offsets, - out=self.slot_mapping_np[:req_indices.shape[0]]) + out=self.slot_mapping.np[:req_indices.shape[0]]) def commit_block_table(self, num_reqs: int) -> None: - self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs], - non_blocking=True) + self.block_table.copy_to_gpu(num_reqs) def commit_slot_mapping(self, num_tokens: int) -> None: - self.slot_mapping[:num_tokens].copy_( - self.slot_mapping_cpu[:num_tokens], non_blocking=True) + self.slot_mapping.copy_to_gpu(num_tokens) def clear(self) -> None: - self.block_table.fill_(0) - self.block_table_cpu.fill_(0) + self.block_table.gpu.fill_(0) + self.block_table.cpu.fill_(0) - def get_device_tensor(self) -> torch.Tensor: + def get_device_tensor(self, num_reqs: int) -> torch.Tensor: """Returns the device tensor of the block table.""" - return self.block_table + return self.block_table.gpu[:num_reqs] def get_cpu_tensor(self) -> torch.Tensor: """Returns the CPU tensor of the block table.""" - return self.block_table_cpu + return self.block_table.cpu def get_numpy_array(self) -> np.ndarray: """Returns the numpy array of the block table.""" - return self.block_table_np + return self.block_table.np + + def _make_buffer(self, *size: Union[int, torch.SymInt], + dtype: torch.dtype) -> CpuGpuBuffer: + return CpuGpuBuffer(*size, + dtype=dtype, + device=self.device, + pin_memory=self.pin_memory) class MultiGroupBlockTable: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 619ed88ab5b27..ccdbeac64bce0 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -89,7 +89,7 @@ 
class CPUModelRunner(GPUModelRunner): assert isinstance(device_tensor, torch.Tensor) setattr(obj, device_attr_name, cpu_tensor) - for k, v in vars(self).items(): + for v in vars(self).values(): if isinstance(v, CpuGpuBuffer): v.gpu = v.cpu @@ -98,9 +98,9 @@ class CPUModelRunner(GPUModelRunner): replace_tensor(self.input_batch, k, k[:-11]) for block_table in self.input_batch.block_table.block_tables: - for k, v in vars(block_table).items(): - if k.endswith("_cpu") and isinstance(v, torch.Tensor): - replace_tensor(block_table, k, k[:-4]) + for v in vars(block_table).values(): + if isinstance(v, CpuGpuBuffer): + v.gpu = v.cpu def load_model(self, eep_scale_up: bool = False) -> None: logger.info("Starting to load model %s...", self.model_config.model) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2ae748dee43c9..e23115e177e69 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -427,9 +427,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): *size: Union[int, torch.SymInt], dtype: torch.dtype, numpy: bool = True) -> CpuGpuBuffer: - # Bfloat16 torch tensors cannot be directly cast to a numpy array, so - # if a bfloat16 buffer is needed without a corresponding numpy array, - # don't bother instantiating the numpy array. return CpuGpuBuffer(*size, dtype=dtype, device=self.device, @@ -1062,13 +1059,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_common_prefix_blocks = 0 else: blk_table = self.input_batch.block_table[kv_cache_group_id] - blk_table_tensor = blk_table.get_device_tensor()[:num_reqs] - slot_mapping = blk_table.slot_mapping[: - total_num_scheduled_tokens] + blk_table_tensor = blk_table.get_device_tensor(num_reqs) + slot_mapping = blk_table.slot_mapping.gpu[: + total_num_scheduled_tokens] # Fill unused with -1. Needed for reshape_and_cache in full cuda # graph mode. - blk_table.slot_mapping[total_num_scheduled_tokens:].fill_(-1) + blk_table.slot_mapping.gpu[total_num_scheduled_tokens:].fill_( + -1) num_common_prefix_blocks = ( scheduler_output. num_common_prefix_blocks[kv_cache_group_id]) @@ -2903,10 +2901,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_actual_tokens=num_tokens, max_query_len=max_query_len, max_seq_len=self.max_model_len, - block_table_tensor=self.input_batch.block_table[ - kv_cache_group_id].get_device_tensor()[:num_reqs], - slot_mapping=self.input_batch. - block_table[kv_cache_group_id].slot_mapping[:num_tokens], + block_table_tensor=self.input_batch. + block_table[kv_cache_group_id].get_device_tensor(num_reqs), + slot_mapping=self.input_batch.block_table[ + kv_cache_group_id].slot_mapping.gpu[:num_tokens], causal=True) for attn_group in self.attn_groups[kv_cache_group_id]: if ubatch_slices is not None: @@ -3265,8 +3263,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode=cudagraph_runtime_mode, uniform_decode=False) - # Capture full cudagraph for uniform decode batches if we have - # dont already have full mixed prefill-decode cudagraphs + # Capture full cudagraph for uniform decode batches if we + # don't already have full mixed prefill-decode cudagraphs. 
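Several hunks in this commit replace paired CPU/NumPy/device tensors (e.g., `block_table_cpu`, `block_table_np`, `block_table`) with a single `CpuGpuBuffer`. As a reading aid, here is a minimal illustrative sketch of the contract those call sites assume; this is a reconstruction for exposition, not the actual `vllm/v1/utils.py` implementation:

```python
# Illustrative sketch of the CpuGpuBuffer contract used by this commit:
# one pinned CPU tensor, a NumPy view aliasing its storage, and a device
# tensor, with an explicit partial, non-blocking host-to-device copy.
import torch


class CpuGpuBufferSketch:

    def __init__(self, *size: int, dtype: torch.dtype,
                 device: torch.device, pin_memory: bool) -> None:
        self.cpu = torch.zeros(*size,
                               dtype=dtype,
                               device="cpu",
                               pin_memory=pin_memory)
        # .numpy() shares storage with self.cpu, so writes through .np
        # are visible to .cpu (and vice versa) without any copying.
        self.np = self.cpu.numpy()
        self.gpu = torch.zeros(*size, dtype=dtype, device=device)

    def copy_to_gpu(self, n: int) -> torch.Tensor:
        # Copy only the first n rows/elements; non_blocking is effective
        # because the source lives in pinned host memory.
        self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
        return self.gpu[:n]
```

Under this contract, writes through `block_table.np` stage updates in host memory, and `commit_block_table` / `commit_slot_mapping` publish only the live prefix to the GPU with a single non-blocking copy.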
if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \ cudagraph_mode.separate_routine(): max_num_tokens = self.scheduler_config.max_num_seqs * \ From 5a411ef6c446f8fb08311a385e16e13bacf44bc5 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 17 Sep 2025 11:29:43 +0800 Subject: [PATCH 338/584] [Benchmarks] Add MMVU video dataset support and clean up deprecated datasets (#24719) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- benchmarks/benchmark_dataset.py | 1288 ------------------------------- docs/contributing/benchmarks.md | 1 + vllm/benchmarks/datasets.py | 66 +- 3 files changed, 65 insertions(+), 1290 deletions(-) delete mode 100644 benchmarks/benchmark_dataset.py diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py deleted file mode 100644 index 64ffa62c04d85..0000000000000 --- a/benchmarks/benchmark_dataset.py +++ /dev/null @@ -1,1288 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This module defines a framework for sampling benchmark requests from various -datasets. Each dataset subclass of BenchmarkDataset must implement sample -generation. Supported dataset types include: - - ShareGPT - - Random (synthetic) - - Sonnet - - BurstGPT - - HuggingFace - - VisionArena -""" - -import base64 -import io -import json -import logging -import random -from abc import ABC, abstractmethod -from collections.abc import Mapping -from copy import deepcopy -from dataclasses import dataclass -from functools import cache -from io import BytesIO -from typing import Any, Callable, Optional, Union - -import numpy as np -import pandas as pd -from datasets import load_dataset -from PIL import Image -from transformers import PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer - -logger = logging.getLogger(__name__) - -# ----------------------------------------------------------------------------- -# Data Classes -# ----------------------------------------------------------------------------- - - -@dataclass -class SampleRequest: - """ - Represents a single inference request for benchmarking. - """ - - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None - lora_request: Optional[LoRARequest] = None - request_id: Optional[str] = None - - -# ----------------------------------------------------------------------------- -# Benchmark Dataset Base Class -# ----------------------------------------------------------------------------- - - -class BenchmarkDataset(ABC): - DEFAULT_SEED = 0 - IS_MULTIMODAL = False - - def __init__( - self, - dataset_path: Optional[str] = None, - random_seed: int = DEFAULT_SEED, - ) -> None: - """ - Initialize the BenchmarkDataset with an optional dataset path and random - seed. Args: - dataset_path (Optional[str]): Path to the dataset. If None, it - indicates that a default or random dataset might be used. - random_seed (int): Seed value for reproducible shuffling or - sampling. Defaults to DEFAULT_SEED. - """ - self.dataset_path = dataset_path - # Set the random seed, ensuring that a None value is replaced with the - # default seed. 
- self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED - self.data = None - - def apply_multimodal_chat_transformation( - self, prompt: str, mm_content: Optional[MultiModalDataDict] = None - ) -> list[dict]: - """ - Transform a prompt and optional multimodal content into a chat format. - This method is used for chat models that expect a specific conversation - format. - """ - content = [{"text": prompt, "type": "text"}] - if mm_content is not None: - content.append(mm_content) - return [{"role": "user", "content": content}] - - def load_data(self) -> None: - """ - Load data from the dataset path into self.data. - - This method must be overridden by subclasses since the method to load - data will vary depending on the dataset format and source. - - Raises: - NotImplementedError: If a subclass does not implement this method. - """ - # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError("load_data must be implemented in subclasses.") - - def get_random_lora_request( - self, - tokenizer: PreTrainedTokenizerBase, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - ) -> tuple[Optional[LoRARequest], AnyTokenizer]: - """ - Optionally select a random LoRA request and return its associated - tokenizer. - - This method is used when LoRA parameters are provided. It randomly - selects a LoRA based on max_loras and retrieves a cached tokenizer for - that LoRA if available. Otherwise, it returns the base tokenizer. - - Args: - tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. - - Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). - """ - if max_loras is None or lora_path is None: - return None, tokenizer - - # Generate a random LoRA ID in the range [1, max_loras]. - lora_id = random.randint(1, max_loras) - lora_request = LoRARequest( - lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(lora_path), - ) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - # Return lora_request and the cached tokenizer if available; otherwise, - # return the base tokenizer - return lora_request, lora_tokenizer_cache[lora_id] or tokenizer - - @abstractmethod - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - request_id_prefix: str = "", - ) -> list[SampleRequest]: - """ - Abstract method to generate sample requests from the dataset. - - Subclasses must override this method to implement dataset-specific logic - for generating a list of SampleRequest objects. - - Args: - tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. - num_requests (int): The number of sample requests to generate. - request_id_prefix (str) The prefix of request_id. - - Returns: - list[SampleRequest]: A list of sample requests generated from the - dataset. 
- """ - raise NotImplementedError("sample must be implemented in subclasses.") - - def maybe_oversample_requests( - self, - requests: list[SampleRequest], - num_requests: int, - request_id_prefix: str = "", - ) -> None: - """ - Oversamples the list of requests if its size is less than the desired - number. - - Args: - requests (List[SampleRequest]): The current list of sampled - requests. - num_requests (int): The target number of requests. - request_id_prefix (str) The prefix of the request ids. - """ - if len(requests) < num_requests: - random.seed(self.random_seed) - additional = deepcopy( - random.choices(requests, k=num_requests - len(requests)) - ) - for i in range(len(additional)): - req = additional[i] - req.request_id = request_id_prefix + str(len(requests) + i) - requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", num_requests) - - -# ----------------------------------------------------------------------------- -# Utility Functions and Global Caches -# ----------------------------------------------------------------------------- - - -def is_valid_sequence( - prompt_len: int, - output_len: int, - min_len: int = 4, - max_prompt_len: int = 1024, - max_total_len: int = 2048, - skip_min_output_len_check: bool = False, -) -> bool: - """ - Validate a sequence based on prompt and output lengths. - - Default pruning criteria are copied from the original `sample_hf_requests` - and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as - from `sample_requests` in benchmark_throughput.py. - """ - # Check for invalid conditions - prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len < min_len) - prompt_too_long = prompt_len > max_prompt_len - combined_too_long = (prompt_len + output_len) > max_total_len - - # Return True if none of the invalid conditions are met - return not ( - prompt_too_short or output_too_short or prompt_too_long or combined_too_long - ) - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -# Global cache for LoRA tokenizers. -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def process_image(image: Any) -> Mapping[str, Any]: - """ - Process a single image input and return a multimedia content dictionary. - - Supports three input types: - - 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key - containing raw image data. - Loads the bytes as a PIL.Image.Image. - - 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as - a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - a dictionary with the image as a base64 data URL. - - 3. String input: - Treats the string as a URL or local file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - Returns a dictionary with the image URL. - - Raises: - ValueError: If the input is not a supported type. 
- """ - if isinstance(image, dict) and "bytes" in image: - image = Image.open(BytesIO(image["bytes"])) - if isinstance(image, Image.Image): - image = convert_image_mode(image, "RGB") - with io.BytesIO() as image_data: - image.save(image_data, format="JPEG") - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - return { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, - } - - if isinstance(image, str): - image_url = ( - image if image.startswith(("http://", "file://")) else f"file://{image}" - ) - return {"type": "image_url", "image_url": {"url": image_url}} - - raise ValueError( - f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes." - ) - - -def process_video(video: Any) -> Mapping[str, Any]: - """ - Process a single video input and return a multimedia content dictionary. - - Supports the following input types: - - 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key - containing raw video data. - - 2. String input: - Treats the string as a URL or local file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - Returns a dictionary with the image URL. - - Raises: - ValueError: If the input is not a supported type. - """ - if isinstance(video, dict) and "bytes" in video: - video_bytes = video["bytes"] - video_base64 = base64.b64encode(video_bytes).decode("utf-8") - return { - "type": "video_url", - "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, - } - - if isinstance(video, str): - video_url = ( - video if video.startswith(("http://", "file://")) else f"file://{video}" - ) - return {"type": "video_url", "video_url": {"url": video_url}} - - raise ValueError( - f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 - ) - - -# ----------------------------------------------------------------------------- -# Random Dataset Implementation (Synthetic Data) -# ----------------------------------------------------------------------------- - - -class RandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the random dataset. - DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 0.0 - DEFAULT_INPUT_LEN = 1024 - DEFAULT_OUTPUT_LEN = 128 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - range_ratio: float = DEFAULT_RANGE_RATIO, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - request_id_prefix: str = "", - **kwargs, - ) -> list[SampleRequest]: - # Enforce range_ratio < 1 - assert range_ratio < 1.0, ( - "random_range_ratio must be < 1.0 to ensure a valid sampling range" - ) - - vocab_size = tokenizer.vocab_size - num_special_tokens = tokenizer.num_special_tokens_to_add() - real_input_len = input_len - num_special_tokens - - prefix_token_ids = ( - np.random.randint(0, vocab_size, size=prefix_len).tolist() - if prefix_len > 0 - else [] - ) - - # New sampling logic: [X * (1 - b), X * (1 + b)] - input_low = int(real_input_len * (1 - range_ratio)) - input_high = int(real_input_len * (1 + range_ratio)) - output_low = int(output_len * (1 - range_ratio)) - # Ensure the lower bound for output length is at least 1 to prevent - # sampling 0 tokens, which can cause request failures. 
- output_low = max(output_low, 1) - output_high = int(output_len * (1 + range_ratio)) - - # Add logging for debugging - logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, output_high) - - input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) - output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) - offsets = np.random.randint(0, vocab_size, size=num_requests) - - requests = [] - for i in range(num_requests): - inner_seq = ( - (offsets[i] + i + np.arange(input_lens[i])) % vocab_size - ).tolist() - token_sequence = prefix_token_ids + inner_seq - prompt = tokenizer.decode(token_sequence) - # After decoding the prompt we have to encode and decode it again. - # This is done because in some cases N consecutive tokens - # give a string tokenized into != N number of tokens. - # For example for GPT2Tokenizer: - # [6880, 6881] -> ['Ġcalls', 'here'] -> - # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] - # To avoid uncontrolled change of the prompt length, - # the encoded sequence is truncated before being decoded again. - total_input_len = prefix_len + int(input_lens[i]) - re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ - :total_input_len - ] - prompt = tokenizer.decode(re_encoded_sequence) - total_input_len = len(re_encoded_sequence) - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=total_input_len, - expected_output_len=int(output_lens[i]), - request_id=request_id_prefix + str(i), - ) - ) - - return requests - - -# ----------------------------------------------------------------------------- -# ShareGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ShareGPTDataset(BenchmarkDataset): - """ - Implements the ShareGPT dataset. Loads data from a JSON file and generates - sample requests based on conversation turns. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - with open(self.dataset_path, encoding="utf-8") as f: - self.data = json.load(f) - # Filter entries with at least two conversation turns. 
- self.data = [ - entry - for entry in self.data - if "conversations" in entry and len(entry["conversations"]) >= 2 - ] - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - samples: list = [] - ind = 0 - for entry in self.data: - if len(samples) >= num_requests: - break - prompt, completion = ( - entry["conversations"][0]["value"], - entry["conversations"][1]["value"], - ) - - lora_request, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - new_output_len = len(completion_ids) if output_len is None else output_len - if not is_valid_sequence( - prompt_len, - new_output_len, - skip_min_output_len_check=output_len is not None, - ): - continue - if image_path := entry.get("image"): - mm_content = process_image(image_path) - elif video_path := entry.get("video"): - mm_content = process_video(video_path) - else: - mm_content = None - if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=new_output_len, - lora_request=lora_request, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - self.maybe_oversample_requests(samples, num_requests, request_id_prefix) - return samples - - -# ----------------------------------------------------------------------------- -# Custom Dataset Implementation -# ----------------------------------------------------------------------------- - - -class CustomDataset(BenchmarkDataset): - """ - Implements the Custom dataset. Loads data from a JSONL file and generates - sample requests based on conversation turns. E.g., - ``` - {"prompt": "What is the capital of India?"} - {"prompt": "What is the capital of Iran?"} - {"prompt": "What is the capital of China?"} - ``` - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - # self.data will be a list of dictionaries - # e.g., [{"prompt": "What is the capital of India?"}, ...] - # This will be the standardized format which load_data() - # has to convert into depending on the filetype of dataset_path. - # sample() will assume this standardized format of self.data - self.data = [] - - # Load the JSONL file - if self.dataset_path.endswith(".jsonl"): - jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) - - # check if the JSONL file has a 'prompt' column - if "prompt" not in jsonl_data.columns: - raise ValueError("JSONL file must contain a 'prompt' column.") - - # Convert each row to a dictionary and append to self.data - # This will convert the DataFrame to a list of dictionaries - # where each dictionary corresponds to a row in the DataFrame. - # This is the standardized format we want for self.data - for _, row in jsonl_data.iterrows(): - self.data.append(row.to_dict()) - else: - raise NotImplementedError( - "Only JSONL format is supported for CustomDataset." 
- ) - - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - skip_chat_template: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = item["prompt"] - - # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - ) - ) - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Sonnet Dataset Implementation -# ----------------------------------------------------------------------------- - - -class SonnetDataset(BenchmarkDataset): - """ - Simplified implementation of the Sonnet dataset. Loads poem lines from a - text file and generates sample requests. Default values here copied from - `benchmark_serving.py` for the sonnet dataset. - """ - - DEFAULT_PREFIX_LEN = 200 - DEFAULT_INPUT_LEN = 550 - DEFAULT_OUTPUT_LEN = 150 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided.") - with open(self.dataset_path, encoding="utf-8") as f: - self.data = f.readlines() - - def sample( - self, - tokenizer, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - return_prompt_formatted: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - # Calculate average token length for a poem line. - tokenized_lines = [tokenizer(line).input_ids for line in self.data] - avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) - - # Build the base prompt. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_msg = [{"role": "user", "content": base_prompt}] - base_fmt = tokenizer.apply_chat_template( - base_msg, add_generation_prompt=True, tokenize=False - ) - base_offset = len(tokenizer(base_fmt).input_ids) - if input_len <= base_offset: - raise ValueError( - f"'input_len' must be higher than the base prompt length " - f"({base_offset})." - ) - - # Determine how many poem lines to use. 
- num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) - prefix_lines = self.data[:num_prefix_lines] - - samples = [] - ind = 0 - while len(samples) < num_requests: - extra_lines = random.choices( - self.data, k=num_input_lines - num_prefix_lines - ) - prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" - msg = [{"role": "user", "content": prompt}] - prompt_formatted = tokenizer.apply_chat_template( - msg, add_generation_prompt=True, tokenize=False - ) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - - if prompt_len <= input_len: - samples.append( - SampleRequest( - prompt=prompt_formatted if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - return samples - - -# ----------------------------------------------------------------------------- -# BurstGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class BurstGPTDataset(BenchmarkDataset): - """ - Implements the BurstGPT dataset. Loads data from a CSV file and generates - sample requests based on synthetic prompt generation. Only rows with Model - "GPT-4" and positive response tokens are used. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data( - self, - ): - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - df = pd.read_csv(self.dataset_path) - # Filter to keep only GPT-4 rows. - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove failed requests (where Response tokens is 0 or less). - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Sample the desired number of rows. - self.data = gpt4_df - - def _sample_loaded_data(self, num_requests: int) -> list: - if num_requests <= len(self.data): - data = self.data.sample(n=num_requests, random_state=self.random_seed) - else: - data = self.data.sample( - n=num_requests, - random_state=self.random_seed, - replace=True, - ) - # Convert the dataframe to a list of lists. - return data.values.tolist() - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - request_id_prefix: str = "", - **kwargs, - ) -> list[SampleRequest]: - samples = [] - data = self._sample_loaded_data(num_requests=num_requests) - for i in range(num_requests): - input_len = int(data[i][2]) - output_len = int(data[i][3]) - lora_req, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - vocab_size = tokenizer.vocab_size - # Generate a synthetic prompt: a list of token IDs computed as (i + - # j) modulo vocab_size. 
- token_ids = [(i + j) % vocab_size for j in range(input_len)] - prompt = tokenizer.decode(token_ids) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=output_len, - lora_request=lora_req, - request_id=request_id_prefix + str(i), - ) - ) - return samples - - -# ----------------------------------------------------------------------------- -# HuggingFace Dataset Base Implementation -# ----------------------------------------------------------------------------- -class HuggingFaceDataset(BenchmarkDataset): - """Base class for datasets hosted on HuggingFace.""" - - SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() - - def __init__( - self, - dataset_path: str, - dataset_split: str, - no_stream: bool = False, - dataset_subset: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(dataset_path=dataset_path, **kwargs) - - self.dataset_split = dataset_split - self.dataset_subset = dataset_subset - self.load_stream = not no_stream - self.load_data() - - def load_data(self) -> None: - """Load data from HuggingFace datasets.""" - self.data = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=self.load_stream, - ) - self.data = self.data.shuffle(seed=self.random_seed) - - -# ----------------------------------------------------------------------------- -# Conversation Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ConversationDataset(HuggingFaceDataset): - """Dataset for conversation data with multimodal support.""" - - SUPPORTED_DATASET_PATHS = { - "lmms-lab/LLaVA-OneVision-Data", - "Aeala/ShareGPT_Vicuna_unfiltered", - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - # Filter examples with at least 2 conversations - filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] - dynamic_output = output_len is None - ind = 0 - - for item in filtered_data: - if len(sampled_requests) >= num_requests: - break - conv = item["conversations"] - prompt, completion = conv[0]["value"], conv[1]["value"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence(prompt_len, completion_len): - continue - mm_content = process_image(item["image"]) if "image" in item else None - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len and output len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Vision Arena Dataset Implementation -# 
----------------------------------------------------------------------------- - - -class VisionArenaDataset(HuggingFaceDataset): - """ - Vision Arena Dataset. - """ - - DEFAULT_OUTPUT_LEN = 128 - SUPPORTED_DATASET_PATHS = { - "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], - "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) - if parser_fn is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - prompt = parser_fn(item) - mm_content = process_image(item["images"][0]) - prompt_len = len(tokenizer(prompt).input_ids) - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(i), - ) - ) - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Instruct Coder Dataset Implementation -# ----------------------------------------------------------------------------- - - -class InstructCoderDataset(HuggingFaceDataset): - """ - InstructCoder Dataset. - https://huggingface.co/datasets/likaixin/InstructCoder - - InstructCoder is the dataset designed for general code editing. It consists - of 114,239 instruction-input-output triplets, and covers multiple distinct - code editing scenario. - """ - - DEFAULT_OUTPUT_LEN = 200 # this is the average default output length - SUPPORTED_DATASET_PATHS = { - "likaixin/InstructCoder", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = ( - f"{item['input']}\n\n{item['instruction']} Just output " - "the code, do not include any explanation." 
- ) - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - ) - ) - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# MT-Bench Dataset Implementation -# ----------------------------------------------------------------------------- - - -class MTBenchDataset(HuggingFaceDataset): - """ - MT-Bench Dataset. - https://huggingface.co/datasets/philschmid/mt-bench - - We create a single turn dataset for MT-Bench. - This is similar to Spec decoding benchmark setup in vLLM - https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 - """ # noqa: E501 - - DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM - SUPPORTED_DATASET_PATHS = { - "philschmid/mt-bench", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - request_id_prefix: str = "", - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - - for i, item in enumerate(self.data): - if len(sampled_requests) >= num_requests: - break - prompt = item["turns"][0] - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - request_id=request_id_prefix + str(i), - ) - ) - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# AIMO Dataset Implementation -# ----------------------------------------------------------------------------- - - -class AIMODataset(HuggingFaceDataset): - """ - Dataset class for processing a AIMO dataset with reasoning questions. 
- """ - - SUPPORTED_DATASET_PATHS = { - "AI-MO/aimo-validation-aime", - "AI-MO/NuminaMath-1.5", - "AI-MO/NuminaMath-CoT", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - **kwargs, - ) -> list: - sampled_requests = [] - dynamic_output = output_len is None - ind = 0 - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt, completion = item["problem"], item["solution"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence( - prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 - ): - continue - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=None, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Next Edit Prediction Dataset Implementation -# ----------------------------------------------------------------------------- - - -zeta_prompt = """### Instruction: -You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. - -### User Edits: - -{} - -### User Excerpt: - -{} - -### Response: - -""" # noqa: E501 - - -def _format_zeta_prompt( - sample: dict, original_start_marker: str = "<|editable_region_start|>" -) -> dict: - """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be - further extended to support more NEP datasets. - - Args: - sample: The dataset sample containing events, - inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. Defaults to - "<|editable_region_start|>". - - Returns: - A dictionary with the formatted prompts and expected outputs. - """ - events = sample["events"] - input = sample["input"] - output = sample["output"] - prompt = zeta_prompt.format(events, input) - - # following the original implementation, extract the focused region - # from the raw output - output_start_index = output.find(original_start_marker) - output_focused_region = output[output_start_index:] - expected_output = output_focused_region - - return {"prompt": prompt, "expected_output": expected_output} - - -class NextEditPredictionDataset(HuggingFaceDataset): - """ - Dataset class for processing a Next Edit Prediction dataset. 
- """ - - SUPPORTED_DATASET_PATHS = { - "zed-industries/zeta", - } - MAPPING_PROMPT_FUNCS = { - "zed-industries/zeta": _format_zeta_prompt, - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - request_id_prefix: str = "", - **kwargs, - ): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) - if formatting_prompt_func is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - samples = [] - for i, sample in enumerate(self.data): - sample = formatting_prompt_func(sample) - samples.append( - SampleRequest( - prompt=sample["prompt"], - prompt_len=len(tokenizer(sample["prompt"]).input_ids), - expected_output_len=len( - tokenizer(sample["expected_output"]).input_ids - ), - request_id=request_id_prefix + str(i), - ) - ) - if len(samples) >= num_requests: - break - self.maybe_oversample_requests(samples, num_requests, request_id_prefix) - return samples - - -# ----------------------------------------------------------------------------- -# ASR Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ASRDataset(HuggingFaceDataset): - """ - Dataset class for processing a ASR dataset for transcription. - Tested on the following set: - - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | Dataset | Domain | Speaking Style | hf-subset | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | TED-LIUM | TED talks | Oratory | release1, release2, release3| - | | | | release3-speaker-adaptation | - | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | - | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | - | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | - | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | - | AMI | Meetings | Spontaneous | ihm, sdm | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - - """ # noqa: E501 - - SUPPORTED_DATASET_PATHS = { - "openslr/librispeech_asr", - "facebook/voxpopuli", - "LIUM/tedlium", - "edinburghcstr/ami", - "speechcolab/gigaspeech", - "kensho/spgispeech", - } - - DEFAULT_OUTPUT_LEN = 128 - IS_MULTIMODAL = True - - # TODO Whisper-specific. Abstract interface when more models are supported. 
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" - skip_long_audios: bool = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - request_id_prefix: str = "", - **kwargs, - ) -> list: - import librosa - - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - prompt = ASRDataset.TRANSCRIPTION_PREAMBLE - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests = [] - skipped = 0 - ind = 0 - for item in self.data: - if len(sampled_requests) >= num_requests: - break - audio = item["audio"] - y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) - # Whisper max supported duration - if self.skip_long_audios and duration_s > 30: - skipped += 1 - continue - - mm_content = {"audio": (y, sr)} - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - request_id=request_id_prefix + str(ind), - ) - ) - ind += 1 - if skipped: - logger.warning( - "%d samples discarded from dataset due to" - " their length being greater than" - " what Whisper supports.", - skipped, - ) - self.maybe_oversample_requests( - sampled_requests, num_requests, request_id_prefix - ) - return sampled_requests diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 13582dadb46e0..d04b1d1136a1c 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -37,6 +37,7 @@ th { | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | +| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | | HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` | | HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` | | HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` | diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 0a297479bcc00..8d11b19066bba 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -335,7 +335,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, str): image_url = (image if image.startswith( - ("http://", "file://")) else f"file://{image}") + ("http://", "https://", "file://")) else f"file://{image}") return {"type": "image_url", "image_url": {"url": image_url}} raise ValueError(f"Invalid image input {image}. 
Must be a PIL.Image.Image"
@@ -370,7 +370,7 @@ def process_video(video: Any) -> Mapping[str, Any]:
 
     if isinstance(video, str):
         video_url = (video if video.startswith(
-            ("http://", "file://")) else f"file://{video}")
+            ("http://", "https://", "file://")) else f"file://{video}")
         return {"type": "video_url", "video_url": {"url": video_url}}
 
     raise ValueError(
@@ -1405,6 +1405,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         dataset_class = VisionArenaDataset
         args.hf_split = "train"
         args.hf_subset = None
+    elif (
+        args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
+        or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
+    ):
+        dataset_class = MMVUDataset
+        args.hf_split = "validation"
+        args.hf_subset = None
     elif (
         args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
         or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
@@ -2053,6 +2060,61 @@ class VisionArenaDataset(HuggingFaceDataset):
         return sampled_requests
 
 
+class MMVUDataset(HuggingFaceDataset):
+    """
+    MMVU Dataset.
+    https://huggingface.co/datasets/yale-nlp/MMVU
+    """
+
+    DEFAULT_OUTPUT_LEN = 128
+    SUPPORTED_DATASET_PATHS = {
+        "yale-nlp/MMVU":
+        lambda x: x["question"] + " " + (
+            " ".join(f"{k}.{v}" for k, v in x["choices"].items())
+        ),
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+        for i, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
+            if parser_fn is None:
+                raise ValueError(f"Unsupported dataset path: {self.hf_name}")
+            prompt = parser_fn(item)
+            mm_content = process_video(item["video"])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled, the request prompt_len is no
+                # longer accurate and we will be using the request output to
+                # count the actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(i),
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix, no_oversample)
+        return sampled_requests
+
+
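With the class above in place, MMVU samples can be drawn directly for a quick smoke test. A rough sketch, assuming the keyword arguments accepted by the `HuggingFaceDataset` base constructor (`dataset_path`, `dataset_split`) and a hypothetical tokenizer choice:

```python
from transformers import AutoTokenizer

from vllm.benchmarks.datasets import MMVUDataset

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
dataset = MMVUDataset(dataset_path="yale-nlp/MMVU",
                      dataset_split="validation")
# Each SampleRequest carries the question-plus-choices text and the video
# as multi-modal data; expected_output_len defaults to 128.
requests = dataset.sample(tokenizer=tokenizer, num_requests=4)
for req in requests:
    print(req.request_id, req.prompt_len, req.expected_output_len)
```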
 # -----------------------------------------------------------------------------
 # Instruct Coder Dataset Implementation
 # -----------------------------------------------------------------------------

From dd83a157f12c0ba7f1357f7954cf85aff2c3b882 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Sep 2025 23:42:23 -0400
Subject: [PATCH 339/584] [UX] Enforce valid choices for envs like VLLM_ATTENTION_BACKEND, etc (#24761)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
---
 vllm/envs.py | 101 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 77 insertions(+), 24 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index d2006979ea81c..385d2a7c51f26 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -6,7 +6,7 @@ import json
 import os
 import sys
 import tempfile
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
@@ -56,11 +56,12 @@ if TYPE_CHECKING:
     VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
+    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl",
+                                                    "shm"] = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
     VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
     VLLM_XLA_USE_SPMD: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
@@ -77,7 +78,8 @@ if TYPE_CHECKING:
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
-    CMAKE_BUILD_TYPE: Optional[str] = None
+    CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release",
+                                       "RelWithDebInfo"]] = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
@@ -140,22 +142,25 @@ if TYPE_CHECKING:
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
-    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
+    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput",
+                                         "latency"] = "throughput"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
-    VLLM_ALL2ALL_BACKEND: str = "naive"
+    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput",
+                                  "deepep_low_latency"] = "naive"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
-    VLLM_KV_CACHE_LAYOUT: Optional[str] = None
+    VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
     VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
-    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal["FP", "INT8", "INT6", "INT4",
+                                                 "NONE"] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
@@ -207,6 +212,48 @@ def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
     return bool(int(value))
+
+def env_with_choices(
+        env_name: str,
+        default: Optional[str],
+        choices: Union[list[str], Callable[[], list[str]]],
+        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
+    """
+    Create a lambda that validates an environment variable against
+    allowed choices.
+
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options, or a callable that returns one
+        case_sensitive: Whether validation should be case sensitive
+
+    Returns:
+        Lambda function for the environment_variables dict
+    """
+
+    def _get_validated_env() -> Optional[str]:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
check_value = value + check_choices = actual_choices + + if check_value not in check_choices: + raise ValueError(f"Invalid value '{value}' for {env_name}. " + f"Valid options: {actual_choices}.") + + return value + + return _get_validated_env + + def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. @@ -287,7 +334,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # If not set, defaults to "Debug" or "RelWithDebInfo" # Available options: "Debug", "Release", "RelWithDebInfo" "CMAKE_BUILD_TYPE": - lambda: os.getenv("CMAKE_BUILD_TYPE"), + env_with_choices("CMAKE_BUILD_TYPE", None, + ["Debug", "Release", "RelWithDebInfo"]), # If set, vllm will print verbose logs during installation "VERBOSE": @@ -476,7 +524,7 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), # Backend for attention computation - # Available options: + # Example options: # - "TORCH_SDPA": use torch.nn.MultiheadAttention # - "FLASH_ATTN": use FlashAttention # - "XFORMERS": use XFormers @@ -486,8 +534,11 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "FLASH_ATTN_MLA": use FlashAttention for MLA # - "FLASHINFER_MLA": use FlashInfer for MLA # - "CUTLASS_MLA": use CUTLASS for MLA + # All possible options loaded dynamically from _Backend enum "VLLM_ATTENTION_BACKEND": - lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), + env_with_choices("VLLM_ATTENTION_BACKEND", None, + lambda: list(__import__('vllm.platforms.interface', \ + fromlist=['_Backend'])._Backend.__members__.keys())), # If set, vllm will use flashinfer sampler "VLLM_USE_FLASHINFER_SAMPLER": @@ -550,7 +601,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "shm": use shared memory and gRPC for communication # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": - lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"), + env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto", + ["auto", "nccl", "shm"]), # If the env var is set, it enables GPU communication overlap # (experimental feature) in Ray's Compiled Graph. This flag is ignored if @@ -569,7 +621,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Use dedicated multiprocess context for workers. 
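For reference, a minimal sketch of how a getter produced by `env_with_choices` behaves at lookup time; the variable name and choices mirror the worker multiproc entry just below, and the snippet assumes this patch is applied and `vllm.envs` is importable:

```python
import os

from vllm.envs import env_with_choices

# Build a validated getter, just as the environment_variables entries do.
getter = env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "fork",
                          ["spawn", "fork"])

os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
assert getter() == "fork"    # unset -> falls back to the default

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
assert getter() == "spawn"   # a valid choice passes through unchanged

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "thread"
try:
    getter()                 # invalid choice -> ValueError
except ValueError as exc:
    print(exc)  # Invalid value 'thread' for VLLM_WORKER_MULTIPROC_METHOD. ...
```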
# Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": - lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"), + env_with_choices("VLLM_WORKER_MULTIPROC_METHOD", "fork", + ["spawn", "fork"]), # Path to the cache for storing downloaded assets "VLLM_ASSETS_CACHE": @@ -833,7 +886,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Choice of quantization level: FP, INT8, INT6, INT4 or NONE # Recommended for large models to get allreduce "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": - lambda: os.getenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE").upper(), + env_with_choices("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "NONE", + ["FP", "INT8", "INT6", "INT4", "NONE"]), # Custom quick allreduce kernel for MI3* cards # Due to the lack of the bfloat16 asm instruction, bfloat16 @@ -1075,21 +1129,20 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "deepep_high_throughput", use deepep high-throughput kernels # - "deepep_low_latency", use deepep low-latency kernels "VLLM_ALL2ALL_BACKEND": - lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), + env_with_choices("VLLM_ALL2ALL_BACKEND", "naive", + ["naive", "pplx", + "deepep_high_throughput", "deepep_low_latency"]), - # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both - # require compute capability 10.0 or above. + # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. + # Both require compute capability 10.0 or above. # Available options: # - "throughput": [default] # Uses CUTLASS kernels optimized for high-throughput batch inference. # - "latency": # Uses TensorRT-LLM kernels optimized for low-latency inference. - # To set this backend, define the environment variable: - # export VLLM_FLASHINFER_MOE_BACKEND=latency. - # If not set, defaults to "throughput". - "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv( - "VLLM_FLASHINFER_MOE_BACKEND", "throughput" - ), + "VLLM_FLASHINFER_MOE_BACKEND": + env_with_choices("VLLM_FLASHINFER_MOE_BACKEND", "throughput", + ["throughput", "latency"]), # Control the maximum number of tokens per expert supported by the # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for @@ -1145,7 +1198,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # leave the layout choice to the backend. Mind that backends may only # implement and support a subset of all possible layouts. "VLLM_KV_CACHE_LAYOUT": - lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None), + env_with_choices("VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]), # Enable checking whether the generated logits contain NaNs, # indicating corrupted output. Useful for debugging low level bugs From 5672ba90bd18129946437266a12c5e619baca488 Mon Sep 17 00:00:00 2001 From: yyzxw <34639446+yyzxw@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:53:23 +0800 Subject: [PATCH 340/584] [Docs] fix invalid doc link (#25017) Signed-off-by: zxw <1020938856@qq.com> --- docs/contributing/model/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index 6c013738ac1ec..36068bc14876b 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -3,7 +3,7 @@ !!! important Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first! 
-vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/compatibility_matrix.md) to optimize their performance. +vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance. The complexity of integrating a model into vLLM depends heavily on the model's architecture. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. From 67532a1a6855e8262b3e1c9512c85e2fc934b3c0 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 16 Sep 2025 23:57:51 -0400 Subject: [PATCH 341/584] [UX] Remove "quantization is not fully optimized yet" log (#25012) Signed-off-by: mgoin <mgoin64@gmail.com> --- vllm/config/__init__.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 535802585d18b..5f30576099714 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1086,22 +1086,6 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS - optimized_quantization_methods = [ - "fp8", - "modelopt", - "gptq_marlin_24", - "gptq_marlin", - "awq_marlin", - "fbgemm_fp8", - "compressed-tensors", - "experts_int8", - "quark", - "modelopt_fp4", - "bitblas", - "gptq_bitblas", - "inc", - "petit_nvfp4", - ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, self.quantization) @@ -1183,11 +1167,6 @@ class ModelConfig: f"be one of {supported_quantization}.") from vllm.platforms import current_platform current_platform.verify_quantization(self.quantization) - if self.quantization not in optimized_quantization_methods: - logger.warning( - "%s quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: # The `max_seq_len_to_capture` was incorrectly From ea3de5ef0d8cbf0e61ee27647954e5a867fae020 Mon Sep 17 00:00:00 2001 From: Prashant Gupta <prashantgupta@us.ibm.com> Date: Tue, 16 Sep 2025 20:58:38 -0700 Subject: [PATCH 342/584] [misc] fix typo in value error (#24995) Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com> --- vllm/entrypoints/renderer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index f0798afbcf212..fb859d57be9fe 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -383,7 +383,7 @@ class CompletionRenderer(BaseRenderer): """Create validated EngineTokensPrompt.""" if max_length is not None and len(token_ids) > max_length: raise ValueError( - f"This maximum context length is {max_length} tokens. " + f"This model's maximum context length is {max_length} tokens. " f"However, your request has {len(token_ids)} input tokens. 
" "Please reduce the length of the input messages.") From 58d4c705a88adc3187591b4b2e651eae3b190061 Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Tue, 16 Sep 2025 23:59:07 -0400 Subject: [PATCH 343/584] [Core] Get num_encoder_tokens from scheduler config (#24989) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- vllm/v1/core/sched/scheduler.py | 5 ++--- vllm/v1/kv_cache_interface.py | 5 ++--- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c1e59423e9a18..85ca858ad7bd6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -465,9 +465,8 @@ class Scheduler(SchedulerInterface): in self.vllm_config.model_config.model.lower()), ( "Whisper is the only supported " "encoder-decoder model.") - num_encoder_tokens = MULTIMODAL_REGISTRY.\ - get_encdec_max_encoder_len( - self.vllm_config.model_config) + num_encoder_tokens =\ + self.scheduler_config.max_num_encoder_input_tokens else: num_encoder_tokens = 0 diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 6e8f569fff0e3..0cf92a680a689 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -11,7 +11,6 @@ from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.utils import cdiv, get_dtype_size logger = init_logger(__name__) @@ -230,8 +229,8 @@ class CrossAttentionSpec(AttentionSpec): def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: # For cross-attention, we need to cache encoder states # Get encoder length (e.g., 1500 for Whisper). - max_encoder_len = MULTIMODAL_REGISTRY.\ - get_encdec_max_encoder_len(vllm_config.model_config) + max_encoder_len = vllm_config.scheduler_config.\ + max_num_encoder_input_tokens return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e23115e177e69..f256dc160a6b5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -234,8 +234,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if self.model_config.is_encoder_decoder: # Maximum length of the encoder input, only for encoder-decoder # models. 
- self.max_encoder_len = self.mm_registry.\ - get_encdec_max_encoder_len(model_config) + self.max_encoder_len = scheduler_config.\ + max_num_encoder_input_tokens else: self.max_encoder_len = 0 From 5801e4977679895d89493e84088e7936f528285f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Tue, 16 Sep 2025 21:29:27 -0700 Subject: [PATCH 344/584] [V0 Deprecation] Remove MQLLMEngine (#25019) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- .buildkite/test-pipeline.yaml | 2 - .../entrypoints/openai/test_lora_resolvers.py | 4 +- tests/entrypoints/openai/test_serving_chat.py | 14 +- tests/mq_llm_engine/__init__.py | 0 tests/mq_llm_engine/conftest.py | 12 - tests/mq_llm_engine/test_abort.py | 69 -- tests/mq_llm_engine/test_error_handling.py | 376 ---------- tests/mq_llm_engine/test_load.py | 59 -- tests/mq_llm_engine/utils.py | 81 --- vllm/engine/multiprocessing/__init__.py | 145 ---- vllm/engine/multiprocessing/client.py | 643 ------------------ vllm/engine/multiprocessing/engine.py | 470 ------------- vllm/entrypoints/launcher.py | 2 - vllm/entrypoints/openai/api_server.py | 102 +-- vllm/platforms/rocm.py | 2 +- 15 files changed, 12 insertions(+), 1969 deletions(-) delete mode 100644 tests/mq_llm_engine/__init__.py delete mode 100644 tests/mq_llm_engine/conftest.py delete mode 100644 tests/mq_llm_engine/test_abort.py delete mode 100644 tests/mq_llm_engine/test_error_handling.py delete mode 100644 tests/mq_llm_engine/test_load.py delete mode 100644 tests/mq_llm_engine/utils.py delete mode 100644 vllm/engine/multiprocessing/__init__.py delete mode 100644 vllm/engine/multiprocessing/client.py delete mode 100644 vllm/engine/multiprocessing/engine.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f06099edd53f..b5ea4407ef5bd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,7 +46,6 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - - tests/mq_llm_engine - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py @@ -57,7 +56,6 @@ steps: - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 2bf29ecf087f3..e2c83b9c40045 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -10,7 +10,6 @@ from unittest.mock import MagicMock import pytest from vllm.config.multimodal import MultiModalConfig -from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import (BaseModelPath, @@ -18,6 +17,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM MODEL_NAME = "openai-community/gpt2" BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] @@ -82,7 +82,7 @@ def register_mock_resolver(): @pytest.fixture def 
mock_serving_setup(): """Provides a mocked engine and serving completion instance.""" - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 502704c9bbdff..de26fce854f5b 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -13,12 +13,12 @@ import pytest import pytest_asyncio from vllm.config.multimodal import MultiModalConfig -from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer @@ -276,7 +276,7 @@ def test_async_serving_chat_init(): @pytest.mark.asyncio async def test_serving_chat_returns_correct_model_name(): - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False @@ -312,7 +312,7 @@ async def test_serving_chat_returns_correct_model_name(): @pytest.mark.asyncio async def test_serving_chat_should_set_correct_max_tokens(): - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False @@ -355,7 +355,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): } # Reinitialize the engine with new settings - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False @@ -410,7 +410,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): } # Reinitialize the engine with new settings - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False @@ -467,7 +467,7 @@ async def test_serving_chat_could_load_correct_generation_config(): "repetition_penalty": 1.05 } - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False @@ -523,7 +523,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): mock_model_config = MockModelConfig() mock_model_config.hf_config.model_type = model_type - mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine = MagicMock(spec=AsyncLLM) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False diff --git a/tests/mq_llm_engine/__init__.py b/tests/mq_llm_engine/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/mq_llm_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) 
-def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py deleted file mode 100644 index 5ff08cbb32487..0000000000000 --- a/tests/mq_llm_engine/test_abort.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that aborting is handled properly.""" - -import asyncio -import tempfile -import uuid - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate -from vllm.engine.arg_utils import AsyncEngineArgs - -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL) -RAISED_ERROR = KeyError -RAISED_VALUE = "foo" -EXPECTED_TOKENS = 250 - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -@pytest.mark.asyncio -async def test_abort(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - request_id_to_be_aborted = "request-aborted" - request_ids_a = [f"request-a-{idx}" for idx in range(10)] - request_ids_b = [f"request-b-{idx}" for idx in range(10)] - - # Requests started before one to be aborted. - tasks = [] - for request_id in request_ids_a: - tasks.append( - asyncio.create_task( - generate(client, request_id, EXPECTED_TOKENS))) - - # Aborted. - task_aborted = asyncio.create_task( - generate(client, request_id_to_be_aborted, EXPECTED_TOKENS)) - - # Requests started after one to be aborted. - for request_id in request_ids_b: - tasks.append( - asyncio.create_task( - generate(client, request_id, EXPECTED_TOKENS))) - - # Actually abort. - await asyncio.sleep(0.5) - await client.abort(request_id_to_be_aborted) - - # Confirm that we got all the EXPECTED tokens from the requests. - for task in tasks: - count, request_id = await task - assert count == EXPECTED_TOKENS, ( - f"{request_id} generated only {count} tokens") - - # Cancel task (this will hang indefinitely if not). - task_aborted.cancel() - - # Shutdown. 
- client.close() diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py deleted file mode 100644 index 77e3732cd06c6..0000000000000 --- a/tests/mq_llm_engine/test_error_handling.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that various errors are handled properly.""" - -import asyncio -import tempfile -import time -import uuid -from unittest.mock import Mock - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine -from vllm import SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.multiprocessing import MQEngineDeadError -from vllm.engine.multiprocessing.engine import MQLLMEngine -from vllm.entrypoints.openai.api_server import build_async_engine_client -from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroupMetadata -from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser - -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True) -RAISED_ERROR = KeyError -RAISED_VALUE = "foo" - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -def run_with_evil_forward(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during first forward pass. - engine.engine.model_executor.execute_model = Mock( - side_effect=RAISED_ERROR(RAISED_VALUE)) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_evil_forward(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_forward) as engine: - - client = await engine.make_client() - - # Server should be healthy after initial probe. - await asyncio.sleep(2.0) - await client.check_health() - - # Throws an error that should get ENGINE_DEAD_ERROR. - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - assert client.errored - - await asyncio.sleep(1.0) - with pytest.raises(RAISED_ERROR): - await client.check_health() - assert client.errored - - # Shutdown. - client.close() - - -def run_with_evil_model_executor_health(engine_args: AsyncEngineArgs, - ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during first forward pass. - engine.engine.model_executor.check_health = Mock(side_effect=RAISED_ERROR) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_failed_health_check(tmp_socket): - with RemoteMQLLMEngine( - engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_model_executor_health) as engine: - - client = await engine.make_client() - assert client.is_running - - # Health probe should throw RAISED_ERROR. - await asyncio.sleep(15.) 
- - with pytest.raises(RAISED_ERROR): - await client.check_health() - assert client.errored - - # Generate call should throw ENGINE_DEAD_ERROR - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - - client.close() - - -def run_with_evil_abort(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during abort call. - engine.engine.abort_request = Mock(side_effect=RAISED_ERROR) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_failed_abort(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_abort) as engine: - - client = await engine.make_client() - assert client.is_running - - # First check health should work. - await client.check_health() - - # Trigger an abort on the client side. - # This request ID does not exist, and will cause the engine to error - await client.abort(request_id="foo") - - # Future generation requests will now fail - # with reference to the original KeyError("foo") - with pytest.raises(MQEngineDeadError) as execinfo: - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id=str(uuid.uuid4())): - pass - assert "KeyError" in repr(execinfo.value) - assert client.errored - - # This should raise the original error. - with pytest.raises(RAISED_ERROR): - await client.check_health() - - client.close() - - -@pytest.mark.asyncio -async def test_batch_error(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_abort) as engine: - - client = await engine.make_client() - assert client.is_running - - # First check health should work. - await client.check_health() - - # Batch of requests - async def do_generate(client): - # min_tokens=2048 to keep busy the engine busy - # to get enough time to get process a request - # that will crash the engine - params = SamplingParams(min_tokens=2048, max_tokens=2048) - async for _ in client.generate(prompt="Hello my name is", - sampling_params=params, - request_id=str(uuid.uuid4())): - pass - - tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)] - - # This request will force a processing batch to raise - # an exception and next the engine get errored - await client.abort(request_id="foo") - - # The batch of those request failed, then they - # should get the same exception as a MQEngineDeadError. - errors = await asyncio.gather(*tasks, return_exceptions=True) - for e in errors: - assert isinstance(e, MQEngineDeadError) - assert "KeyError" in repr(e) - - client.close() - - -@pytest.mark.asyncio -async def test_bad_request(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - # Invalid request should fail, but not crash the server. - with pytest.raises(ValueError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id="abcd-1", - lora_request=LoRARequest( - "invalid-lora", 1, - "invalid-path")): - pass - - # This request should be okay. - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id="abcd-2"): - pass - - # Shutdown. 
- client.close() - - -@pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - - parser = FlexibleArgumentParser( - description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) - - # When LLMEngine is loaded, it will crash. - def mock_init(): - raise ValueError - - m.setattr(LLMEngine, "__init__", mock_init) - - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() - - assert end - start < 100, ( - "Expected vLLM to gracefully shutdown in <100s " - "if there is an error in the startup.") - - -@pytest.mark.asyncio -async def test_mp_cuda_init(): - # it should not crash, when cuda is initialized - # in the API server process - import torch - torch.cuda.init() - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) - - async with build_async_engine_client(args): - pass - - -@pytest.mark.asyncio -async def test_engine_process_death(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - assert client.is_running - - # kill the engine process - engine.proc.kill() - - # Generate call should fail - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - - # And the health check should show the engine is dead - with pytest.raises(RuntimeError, match="Engine process .* died"): - await client.check_health() - - client.close() - - -def run_with_evil_input_processing(engine_args: AsyncEngineArgs, - ipc_path: str): - """Simulate an exception while preparing inputs for the model. - In the wild, this could be something like a multimodal input processor - failing on invalid image data.""" - - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - runner = engine.engine.model_executor.driver_worker.worker.model_runner - - # Raise error in the model runner when adding a sequence group. - # See class ModelInputForGPUBuilder - def raiser(_, seq_group_metadata: SequenceGroupMetadata): - if seq_group_metadata.request_id.startswith("evil"): - raise RAISED_ERROR(RAISED_VALUE) - - runner.builder.per_seq_group_compute_fns.append(raiser) - - # Run engine. 
- engine.start() - - -@pytest.mark.asyncio -async def test_failed_inputs(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_input_processing) as engine: - - client = await engine.make_client() - assert client.is_running - - # Engine should be healthy - await client.check_health() - - async def run_failing_request(): - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id="evil" + str(uuid.uuid4())): - pass - - async def run_passing_request(): - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id=str(uuid.uuid4())): - pass - - passing_tasks = [ - asyncio.create_task(run_passing_request()) for _ in range(10) - ] - failing_tasks = [ - asyncio.create_task(run_failing_request()) for _ in range(10) - ] - await asyncio.gather(*failing_tasks, return_exceptions=True) - await asyncio.gather(*passing_tasks) - - # All the bad inputs should have raised - for task in failing_tasks: - with pytest.raises(RAISED_ERROR): - task.result() - - # But all good inputs should have still succeeded - for task in passing_tasks: - task.result() - - # And the engine should remain healthy - assert not client.errored - await client.check_health() - - client.close() diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py deleted file mode 100644 index c934706611ae3..0000000000000 --- a/tests/mq_llm_engine/test_load.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that the MQLLMEngine is able to handle 10k concurrent requests.""" - -import asyncio -import tempfile -import uuid - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate -from vllm.engine.arg_utils import AsyncEngineArgs - -MODEL = "google/gemma-1.1-2b-it" -NUM_EXPECTED_TOKENS = 10 -NUM_REQUESTS = 10000 - -# Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL) - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -@pytest.mark.asyncio -async def test_load(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] - - # Create concurrent requests. - tasks = [] - for request_id in request_ids: - tasks.append( - asyncio.create_task( - generate(client, request_id, NUM_EXPECTED_TOKENS))) - - # Confirm that we got all the EXPECTED tokens from the requests. - failed_request_id = None - tokens = None - for task in tasks: - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") - - # Shutdown. 
- client.close() diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py deleted file mode 100644 index 7976d5031aea1..0000000000000 --- a/tests/mq_llm_engine/utils.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import multiprocessing -from typing import Callable, Union - -from vllm import SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import MQLLMEngine -from vllm.outputs import RequestOutput -from vllm.usage.usage_lib import UsageContext - - -async def generate( - client: MQLLMEngineClient, - request_id: str, - num_tokens: int, - return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]: - - final_output = None - count = 0 - async for out in client.generate( - request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams(max_tokens=num_tokens, - temperature=0)): - - count += 1 - final_output = out - await asyncio.sleep(0.) - - if return_output: - return final_output - - # Confirm we generated all the tokens we expected. - return count, request_id - - -def run_normal(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Run engine. - engine.start() - - -class RemoteMQLLMEngine: - - def __init__(self, - engine_args: AsyncEngineArgs, - ipc_path: str, - run_fn: Callable = run_normal) -> None: - - self.engine_args = engine_args - self.ipc_path = ipc_path - context = multiprocessing.get_context("spawn") - self.proc = context.Process(target=run_fn, - args=(engine_args, ipc_path)) - self.proc.start() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.proc.kill() - - async def make_client(self) -> MQLLMEngineClient: - engine_config = self.engine_args.create_engine_config() - client = MQLLMEngineClient(self.ipc_path, engine_config, self.proc.pid) - while True: - try: - await client.setup() - break - except TimeoutError: - assert self.proc.is_alive() - return client diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py deleted file mode 100644 index 9f64ee0808df2..0000000000000 --- a/vllm/engine/multiprocessing/__init__.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import uuid -from dataclasses import dataclass, field -from enum import Enum -from typing import List, Mapping, Optional, Union - -from vllm import PoolingParams -from vllm.inputs import PromptType -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.utils import Device - -VLLM_RPC_SUCCESS_STR = "SUCCESS" - -IPC_INPUT_EXT = "_input_socket" -IPC_OUTPUT_EXT = "_output_socket" -IPC_HEALTH_EXT = "_health_socket" -IPC_DATA_EXT = "_data_socket" - - -class MQEngineDeadError(RuntimeError): - pass - - -@dataclass -class RPCProcessRequest: - prompt: PromptType - params: Union[SamplingParams, PoolingParams] - request_id: str - lora_request: Optional[LoRARequest] = None - trace_headers: Optional[Mapping[str, str]] = None - priority: int = 0 - - def __init__( - self, - prompt: PromptType, - params: Union[SamplingParams, 
PoolingParams], - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - super().__init__() - - self.prompt = prompt - self.params = params - self.request_id = request_id - self.lora_request = lora_request - self.trace_headers = trace_headers - self.priority = priority - - -@dataclass -class RPCError: - request_id: Optional[str] - is_engine_errored: bool - exception: BaseException - - -@dataclass -class RPCAbortRequest: - request_id: str - - -class RPCStartupRequest(Enum): - IS_SERVER_READY = 1 - - -@dataclass -class RPCStartupResponse: - tracing_enabled: bool - - -class RPCUProfileRequest(Enum): - START_PROFILE = 1 - STOP_PROFILE = 2 - - -class RPCResetMultiModalCacheRequest(Enum): - RESET = 1 - - -@dataclass -class RPCResetPrefixCacheRequest: - device: Device - - -class RPCSleepRequest(Enum): - SLEEP_LEVEL_1 = 1 - SLEEP_LEVEL_2 = 2 - - -@dataclass -class RPCWakeUpRequest: - tags: Optional[list[str]] = None - - -@dataclass -class RPCIsSleepingRequest: - # Set the default value of request_id to a new UUID - request_id: str = field(default_factory=lambda: str(uuid.uuid4())) - - -@dataclass -class RPCIsSleepingResponse: - request_id: str - is_sleeping: bool - - -@dataclass -class RPCLoadAdapterRequest: - lora_request: LoRARequest - # Set the default value of request_id to a new UUID - request_id: str = field(default_factory=lambda: str(uuid.uuid4())) - - -@dataclass -class RPCAdapterLoadedResponse: - request_id: str - lora_loaded: bool - - -RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest, RPCLoadAdapterRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, RPCSleepRequest, - RPCWakeUpRequest, RPCIsSleepingRequest] - -REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, - RPCIsSleepingResponse, RPCError] - - -def ENGINE_DEAD_ERROR( - error: Optional[BaseException] = None) -> MQEngineDeadError: - if error is None: - return MQEngineDeadError( - "Engine loop is not running. Inspect the stacktrace to " - "find the original error") - - return MQEngineDeadError( - "Engine loop is not running. 
Inspect the stacktrace to " - f"find the original error: {repr(error)}.") diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py deleted file mode 100644 index 7d1f29a9824d7..0000000000000 --- a/vllm/engine/multiprocessing/client.py +++ /dev/null @@ -1,643 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import copy -import pickle -from contextlib import contextmanager, suppress -from typing import (Any, AsyncGenerator, Dict, Iterable, Iterator, List, - Mapping, Optional, Union) - -import cloudpickle -import psutil -import zmq -import zmq.asyncio -from zmq import Frame # type: ignore[attr-defined] -from zmq.asyncio import Socket - -from vllm import PoolingParams -from vllm.config import DecodingConfig, ModelConfig, VllmConfig -from vllm.core.scheduler import SchedulerOutputs -# yapf conflicts with isort for this block -# yapf: disable -from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, - IPC_HEALTH_EXT, IPC_INPUT_EXT, - IPC_OUTPUT_EXT, RPC_REQUEST_T, - VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCAdapterLoadedResponse, RPCError, - RPCIsSleepingRequest, - RPCIsSleepingResponse, - RPCLoadAdapterRequest, - RPCProcessRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, - RPCSleepRequest, RPCStartupRequest, - RPCStartupResponse, - RPCUProfileRequest, RPCWakeUpRequest) -from vllm.engine.protocol import EngineClient -# yapf: enable -from vllm.envs import VLLM_RPC_TIMEOUT -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.utils import Device - -logger = init_logger(__name__) - - -class MQClientClosedError(Exception): - """Exception class raised when the client is used post-close. - - The client can be closed, which closes the ZMQ context. This normally - happens on server shutdown. In some cases, methods like abort and - do_log_stats will still be called and then try to open a socket, which - causes a ZMQError and creates a huge stack trace. - So, we throw this error such that we can suppress it. - """ - - -class MQLLMEngineClient(EngineClient): - """A client wrapper for MQLLMEngine that conforms to the - EngineClient protocol. - - MQLLMEngine and MQLLMEngineClient are intended to run in separate - processes communicating via zeromq ipc sockets. - - The entrypoint to MQLLMEngineClient is through the generate() - method. On generate() MQLLMEngine does three things: - - Creates an asyncio output queue - - Sends a RPCGenerateRequest to the MQLLMEngine via zmq - - Pulls RequestOutputs from its queue and yields them - - MQLLMEngine runs two background loops: - - output_loop: the output loop pulls List[RequestOutput] - from the MQLLMEngine via zmq (each list is the output - of one engine_step in the LLMEngine). It then parses - the list and pushes individual request_outputs into - the corresponding output_queue such that they can be - consumed by the .generate() method. 
- - health_loop: the health loop queries the health socket - every N seconds, confirming the engine is healthy - """ - - def __init__(self, ipc_path: str, engine_config: VllmConfig, - engine_pid: int): - self.context = zmq.asyncio.Context() - self._errored_with: Optional[BaseException] = None - - # Get the configs. - self.vllm_config = engine_config - self.model_config = engine_config.model_config - self.decoding_config = engine_config.decoding_config - - if self.vllm_config.model_config.skip_tokenizer_init: - self.tokenizer = None - - else: - # Create the tokenizer group. - self.tokenizer = init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=engine_config.scheduler_config, - lora_config=engine_config.lora_config) - - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer) - - # Send RPCGenerateRequest to the MQLLMEngine. - self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) - self.input_socket.connect(f"{ipc_path}{IPC_INPUT_EXT}") - - # Receive streams of RequestOutput from the MQLLMEngine. - self.output_socket: Socket = self.context.socket(zmq.constants.PULL) - self.output_socket.connect(f"{ipc_path}{IPC_OUTPUT_EXT}") - - # IPC path for acking heartbeats. - self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL) - self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}") - - # IPC path for the data socket. - self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" - - # Stream for each individual request. - self.output_queues: Dict[str, asyncio.Queue] = {} - - # Loop to handle output of the LLMEngine periodically. - # Started after the MQLLMEngine is ready so that we can - # build the Client in an executor to enable clean shutdown. - self.output_loop: Optional[asyncio.Task] = None - - # Loop to check health of the LLMEngine periodically. - # Started after the MQLLMEngine is ready. - self.health_loop: Optional[asyncio.Task] = None - self._engine_process = psutil.Process(engine_pid) - - @staticmethod - def is_unsupported_config(vllm_config: VllmConfig): - # Pipeline parallel not yet supported - return vllm_config.parallel_config.pipeline_parallel_size > 1 - - @contextmanager - def get_data_socket(self) -> Iterator[Socket]: - socket = self.context.socket(zmq.constants.DEALER) - try: - socket.connect(self.data_ipc_path) - yield socket - finally: - socket.close(linger=0) - - async def run_heartbeat_loop(self, timeout: int): - """Background loop that continually checks to ensure the engine process - is still alive. 
- """ - try: - while True: - # Check if the engine process is running: - if not self._engine_process.is_running() or ( - self._engine_process.status() == psutil.STATUS_ZOMBIE): - # NB: is_running() returns True for zombies - self._set_errored( - RuntimeError( - f"Engine process (pid {self._engine_process.pid}) " - "died.")) - break - - if await self.heartbeat_socket.poll(timeout=timeout): - # Heartbeat received- check the message - await self._check_success( - error_message="Heartbeat failed.", - socket=self.heartbeat_socket) - - logger.debug("Heartbeat successful.") - - except asyncio.CancelledError: - logger.debug("Shutting down MQLLMEngineClient check health loop.") - - except psutil.NoSuchProcess: - self._set_errored( - RuntimeError( - f"Engine process (pid {self._engine_process.pid}) died.")) - - except Exception as e: - self._set_errored(e) - - async def run_output_handler_loop(self): - """Get RequestOutputs from Engine and stream to Request Queues""" - - try: - while True: - # Poll, checking for ENGINE_DEAD - while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT - ) == 0: - logger.debug("Waiting for output from MQLLMEngine.") - - # If errored, alert all running requests. - if self.errored: - for queue_j in tuple(self.output_queues.values()): - queue_j.put_nowait( - ENGINE_DEAD_ERROR(self._errored_with)) - return - - message: Frame = await self.output_socket.recv(copy=False) - request_outputs = pickle.loads(message.buffer) - - is_error = isinstance(request_outputs, - (BaseException, RPCError)) - if is_error: - if isinstance(request_outputs, RPCError): - rpc_error: RPCError = request_outputs - request_id = rpc_error.request_id - exception = rpc_error.exception - is_engine_errored = rpc_error.is_engine_errored - else: - # MPLLMEngine should always return an RPCError to - # the output_socket when an issue arises. - # If we are here, we are in a bad state and - # should shut down the server. - error: BaseException = request_outputs - logger.error( - "Received Exception %s rather than RPCError from " - "MPLLMEngine. This should never happen.", error) - request_id = None - exception = error - is_engine_errored = True - - # Set to error state only on engine critical error - # (and record only the first one) - if is_engine_errored and not self._errored_with: - self._errored_with = exception - # If engine is errored, no matter the type of exception - # it will no longer be able to receive new requests, - # therefore we have to inform that the current - # processed requests failed as well. Send back a dead - # engine error give this feedback and also give a - # 'hint' to the server to shut down next. - exception = self.dead_error - - if request_id is None: - # If request_id is None, then the engine raised an - # exception for a batch, and we may not know the - # request that caused it, neither if it was actually - # caused by any of them (e.g. CUDA OOM). Therefore we - # broadcast the same exception for all requests. - for queue_i in tuple(self.output_queues.values()): - queue_i.put_nowait(exception) - else: - queue = self.output_queues.get(request_id) - if queue is not None: - queue.put_nowait(exception) - # Put each output into the appropriate queue. 
- elif isinstance( - request_outputs, - (RPCAdapterLoadedResponse, RPCIsSleepingResponse)): - self._add_output(request_outputs) - else: - for request_output in request_outputs: - self._add_output(request_output) - - except asyncio.CancelledError: - logger.debug("Shutting down MQLLMEngineClient output handler.") - - def _add_output(self, request_output: Union[RequestOutput, - RPCAdapterLoadedResponse, - RPCIsSleepingResponse]): - queue = self.output_queues.get(request_output.request_id) - if queue is not None: - queue.put_nowait(request_output) - - async def setup(self): - """Set up the client before it starts sending server requests.""" - - # Start output_loop - if self.output_loop is None: - # only generate once to avoid multiple concurrent output_loops - # this will lead to race conditions and wrong orders of tokens - # returned by the engine - # setup will be called multiple times during the startup of - # the engine - self.output_loop = asyncio.create_task( - self.run_output_handler_loop()) - - with self.get_data_socket() as socket: - # Wait until server is ready. - response = await self._wait_for_server_rpc(socket) - - self.tracing_flag = response.tracing_enabled - - # Start health_loop. - if self.health_loop is None: - self.health_loop = asyncio.create_task( - self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) - - def close(self): - """Destroy the ZeroMQ Context.""" - # Close all sockets and terminate the context. - self.context.destroy(linger=0) - - # Cancel background tasks. - if self.health_loop is not None: - self.health_loop.cancel() - if self.output_loop is not None: - self.output_loop.cancel() - - def _set_errored(self, e: BaseException): - logger.exception(repr(e)) - if self._errored_with is None: - self._errored_with = e - - @staticmethod - async def _send_get_data_rpc_request(request: RPCStartupRequest, - expected_type: Any, - error_message: str, - socket: Socket) -> Any: - """Send an RPC request that is expecting data back.""" - - # Ping RPCServer with a request. - await socket.send_multipart((pickle.dumps(request), ), copy=False) - - # Make sure the server responds in time. - if await socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: - raise TimeoutError("RPCServer didn't reply within " - f"{VLLM_RPC_TIMEOUT} ms") - - # Await the data from the Server. 
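setup() above may run several times while the server boots, so it creates the output handler at most once: a second run_output_handler_loop would compete for the same PULL socket and interleave token streams between requests. The guard is the ordinary create-once idiom, sketched generically here (TaskOnce is not a vLLM class):

import asyncio
from typing import Callable, Coroutine, Optional


class TaskOnce:
    """Create a background task at most once, however often ensure() runs."""

    def __init__(self) -> None:
        self._task: Optional[asyncio.Task] = None

    def ensure(self, factory: Callable[[], Coroutine]) -> asyncio.Task:
        # Only the first call spawns the task; later calls return it as-is.
        if self._task is None:
            self._task = asyncio.create_task(factory())
        return self._task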
- frame = await socket.recv(copy=False) - data = pickle.loads(frame.buffer) - - if isinstance(data, BaseException): - raise data - elif not isinstance(data, expected_type): - raise ValueError(error_message) - - return data - - @staticmethod - async def _send_one_way_rpc_request(request: RPC_REQUEST_T, - socket: Socket): - """Send one-way RPC request to trigger an action.""" - - if socket.closed: - raise MQClientClosedError() - - await socket.send_multipart((pickle.dumps(request), )) - - async def _await_ack(self, error_message: str, socket: Socket): - """Await acknowledgement that a request succeeded.""" - - if socket.closed: - raise MQClientClosedError() - - if await socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: - raise TimeoutError("MQLLMEngine didn't reply within " - f"{VLLM_RPC_TIMEOUT}ms") - - await self._check_success(error_message, socket) - - @staticmethod - async def _check_success(error_message: str, socket: Socket): - """Confirm that socket has a VLLM_RPC_SUCCESS_STR message""" - - if socket.closed: - raise MQClientClosedError() - - frame = await socket.recv(copy=False) - response = pickle.loads(frame.buffer) - - # Raise error if unsuccessful - if isinstance(response, BaseException): - raise response - elif (not isinstance(response, str) - or response != VLLM_RPC_SUCCESS_STR): - raise ValueError(error_message) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.input_preprocessor - - async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): - if self.tokenizer is None: - return None - else: - return await self.tokenizer.get_lora_tokenizer_async(lora_request) - - async def get_vllm_config(self) -> VllmConfig: - return self.vllm_config - - async def get_decoding_config(self) -> DecodingConfig: - return self.decoding_config - - async def get_model_config(self) -> ModelConfig: - return self.model_config - - async def is_tracing_enabled(self) -> bool: - return self.tracing_flag - - async def _wait_for_server_rpc(self, socket: Socket) -> RPCStartupResponse: - """Wait for the RPCServer to start up.""" - - return await self._send_get_data_rpc_request( - request=RPCStartupRequest.IS_SERVER_READY, - expected_type=RPCStartupResponse, - error_message="Unable to start RPC Server", - socket=socket) - - async def abort(self, request_id: Union[str, Iterable[str]]): - """Send an ABORT_REQUEST signal to the RPC Server""" - - if not isinstance(request_id, str): - raise RuntimeError("Only single-request abort supported in" - " deprecated V0") - - with suppress(MQClientClosedError): - await self._send_one_way_rpc_request( - request=RPCAbortRequest(request_id), socket=self.input_socket) - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - ) -> None: - """ - Ignore do_log_stats (handled on MQLLMEngine polling) - """ - pass - - async def check_health(self): - """ - The check health loop probes the health status of the - Engine's health every N seconds and sets _errored_with - if the engine is unhealthy. 
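The helpers above share one guarded round-trip shape: pickle and send, poll against a deadline, then validate the unpickled reply before trusting it (a forwarded exception is re-raised on the client side). Condensed into a single illustrative coroutine (socket is a zmq.asyncio socket; the name rpc_call is made up):

import pickle


async def rpc_call(socket, request, expected_type, timeout_ms: int):
    await socket.send_multipart((pickle.dumps(request), ), copy=False)
    if await socket.poll(timeout=timeout_ms) == 0:
        raise TimeoutError(f"no reply within {timeout_ms} ms")
    frame = await socket.recv(copy=False)
    reply = pickle.loads(frame.buffer)
    if isinstance(reply, BaseException):
        raise reply  # the server forwarded its own failure
    if not isinstance(reply, expected_type):
        raise ValueError(f"unexpected reply type: {type(reply)}")
    return reply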
- """ - if self._errored_with is not None: - raise self._errored_with - - @property - def is_running(self) -> bool: - return not self.errored - - @property - def is_stopped(self) -> bool: - return self.errored - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return ENGINE_DEAD_ERROR(self._errored_with) - - def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: Priority of the request (lower means earlier handling). - Any priority other than 0 will lead to an error if the - scheduling policy is not "priority". - """ - return self._process_request(prompt, sampling_params, request_id, - lora_request, trace_headers, priority) - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - raise NotImplementedError( - "Pooling models are not supported in vLLM V0") - - async def _process_request( - self, - prompt: PromptType, - params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> AsyncGenerator[RequestOutput, None]: - """Send an RPCGenerateRequest to the RPCServer and stream responses.""" - - # If already dead, error out. - if self._errored_with is not None: - raise ENGINE_DEAD_ERROR(self._errored_with) - - # Ensure the request id is unique among running requests - if request_id in self.output_queues: - raise ValueError(f"Request {request_id} already exists") - - # 1) Create output queue for this request. - queue: asyncio.Queue[Union[RequestOutput, - BaseException]] = asyncio.Queue() - self.output_queues[request_id] = queue - - try: - # 2) Detach logits processors so that they can be pickled - # separately (may require cloudpickle which is slower) - if params.logits_processors: - # Defensive shallow copy - params = copy.copy(params) - logits_processors = params.logits_processors - params.logits_processors = None - lp_bytes = cloudpickle.dumps(logits_processors) - else: - lp_bytes = None - - request_bytes = pickle.dumps( - RPCProcessRequest( - prompt=prompt, - params=params, - request_id=request_id, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - )) - - # 3) Send the RPCGenerateRequest to the MQLLMEngine. - parts = (request_bytes, - lp_bytes) if lp_bytes else (request_bytes, ) - await self.input_socket.send_multipart(parts, copy=False) - - # 4) Stream the RequestOutputs from the output queue. 
Note - # that the output_loop pushes RequestOutput objects to this - # queue after pulling them from the zmq socket. - finished = False - try: - while not finished: - request_output = await queue.get() - - if isinstance(request_output, BaseException): - raise request_output - - finished = request_output.finished - yield request_output - finally: - # Request was canceled by the client. - if not finished and not self.errored: - await self.abort(request_id) - finally: - self.output_queues.pop(request_id) - - async def start_profile(self) -> None: - """Start profiling the engine""" - - await self._send_one_way_rpc_request( - request=RPCUProfileRequest.START_PROFILE, socket=self.input_socket) - - async def stop_profile(self) -> None: - """Stop profiling the engine""" - - await self._send_one_way_rpc_request( - request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) - - async def reset_mm_cache(self) -> None: - """Reset the multi-modal cache""" - - await self._send_one_way_rpc_request( - request=RPCResetMultiModalCacheRequest.RESET, - socket=self.input_socket) - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - """Reset the prefix cache""" - - await self._send_one_way_rpc_request( - request=RPCResetPrefixCacheRequest(device), - socket=self.input_socket) - - async def sleep(self, level: int = 1) -> None: - """Sleep the engine for a given level""" - return await self._send_one_way_rpc_request( - request=RPCSleepRequest(level), socket=self.input_socket) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - """Wake up the engine""" - return await self._send_one_way_rpc_request( - request=RPCWakeUpRequest(tags), socket=self.input_socket) - - async def is_sleeping(self) -> bool: - """Check whether the engine is sleeping""" - request = RPCIsSleepingRequest() - - queue: asyncio.Queue[Union[BaseException, - RPCIsSleepingResponse]] = asyncio.Queue() - self.output_queues[request.request_id] = queue - - request_bytes = pickle.dumps(request) - await self.input_socket.send_multipart((request_bytes, ), copy=False) - - request_output = await queue.get() - self.output_queues.pop(request.request_id) - - if isinstance(request_output, BaseException): - raise request_output - return request_output.is_sleeping - - async def add_lora(self, lora_request: LoRARequest) -> bool: - """Load a new LoRA adapter into the engine for future requests.""" - # Uses the same I/O as generate requests - request = RPCLoadAdapterRequest(lora_request) - - # Create output queue for this request. 
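The try/finally in _process_request above carries the cancellation contract: if the consumer of generate() disappears before finished is set (client disconnect, cancelled task), the finally clause fires and the request is aborted server-side so the engine stops decoding for it. Reduced to a skeleton (abort stands in for the client's abort coroutine):

import asyncio
from typing import AsyncGenerator


async def stream_outputs(queue: asyncio.Queue, abort,
                         request_id: str) -> AsyncGenerator:
    finished = False
    try:
        while not finished:
            output = await queue.get()
            if isinstance(output, BaseException):
                raise output
            finished = output.finished
            yield output
    finally:
        # Runs on normal completion, on errors, and on GeneratorExit alike;
        # only a request that never finished needs a server-side abort.
        if not finished:
            await abort(request_id)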
- queue: asyncio.Queue[Union[ - BaseException, RPCAdapterLoadedResponse]] = asyncio.Queue() - self.output_queues[request.request_id] = queue - - # Send the request - request_bytes = pickle.dumps(request) - await self.input_socket.send_multipart((request_bytes, ), copy=False) - - # Wait for the response - request_output = await queue.get() - self.output_queues.pop(request.request_id) - - # Raise on error, otherwise happily return None - if isinstance(request_output, BaseException): - raise request_output - return request_output.lora_loaded diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py deleted file mode 100644 index 138283d4c8a75..0000000000000 --- a/vllm/engine/multiprocessing/engine.py +++ /dev/null @@ -1,470 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pickle -import signal -from contextlib import contextmanager -from typing import Iterator, List, Optional, Union - -import cloudpickle -import zmq - -from vllm import AsyncEngineArgs, SamplingParams -from vllm.config import VllmConfig -from vllm.engine.llm_engine import LLMEngine -# yapf conflicts with isort for this block -# yapf: disable -from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, - IPC_HEALTH_EXT, IPC_INPUT_EXT, - IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T, - VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCAdapterLoadedResponse, RPCError, - RPCIsSleepingRequest, - RPCIsSleepingResponse, - RPCLoadAdapterRequest, - RPCProcessRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, - RPCSleepRequest, RPCStartupRequest, - RPCStartupResponse, - RPCUProfileRequest, RPCWakeUpRequest) -# yapf: enable -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) -from vllm.usage.usage_lib import UsageContext -from vllm.utils import deprecate_kwargs -from vllm.worker.model_runner_base import InputProcessingError - -logger = init_logger(__name__) - -POLLING_TIMEOUT_MS = 10000 -HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) - - -class MQLLMEngine: - """A multiprocessing wrapper for - [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - - This class is used to wrap the - [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use - in concurrent manner. It runs a background loop and uses zeromq to - receive new requests and stream outputs incrementally via ipc. - - The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode - process is kicked off when a new RPCProcessRequest is received by the - input_socket. - - The self.engine_loop checks the input_socket for new requests, - adds them to the LLMEngine if there are any, calls the internal - [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends - the RequestOutputs back over the output_socket. - - If use_async_sockets is set, the logic associated with reading new - requests from the socket and sending data to the socket is passed - as a callback to the llm_engine, which calls the logic asynchronously - such that the IPC can be overlapped with the GPU. - - Args: - ipc_path: Base path for zeromq interprocess messaging - use_async_sockets: Whether to make send/recv async with GPU - log_requests: Whether to log the requests. - *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. 
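In outline, the engine loop documented above alternates between draining client input and stepping the engine, degrading to a health-reporting poll when idle. A paraphrase of the control flow with the collaborators injected so the sketch stays self-contained (this shows the shape, not the exact implementation):

def engine_loop(engine, input_socket, handle_new_input, send_outputs,
                health_check, poll_ms: int = 10000) -> None:
    # Rough paraphrase of the run_engine_loop shown further below.
    while True:
        if not engine.has_unfinished_requests():
            # Idle: block on the input socket, reporting health each tick.
            while input_socket.poll(timeout=poll_ms) == 0:
                health_check()
        handle_new_input()
        send_outputs(engine.step())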
- """ - - def __init__(self, - ipc_path: str, - use_async_sockets: bool, - *args, - log_requests: bool = True, - **kwargs) -> None: - # For MQLLMEngine, we can use cached outputs, since each new request - # output is immediately pickled and send over the socket, which frees - # the python object to be reused again. - kwargs['use_cached_outputs'] = True - - self.engine = LLMEngine(*args, **kwargs) - self.log_requests = log_requests - - self.use_async_sockets = use_async_sockets - if self.use_async_sockets: - self.engine.process_request_outputs_callback = \ - self._async_socket_engine_callback - - self.ctx = zmq.Context() # type: ignore[attr-defined] - - # Receive input from the client. - self.input_socket = self.ctx.socket(zmq.constants.PULL) - self.input_socket.bind(f"{ipc_path}{IPC_INPUT_EXT}") - - # Send output stream back to client. - self.output_socket = self.ctx.socket(zmq.constants.PUSH) - self.output_socket.bind(f"{ipc_path}{IPC_OUTPUT_EXT}") - - # Send heartbeats back to client. - self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH) - self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}") - - # IPC path for the data socket. - self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" - - # Error state. - self._errored_with: Optional[BaseException] = None - - @property - def dead_error(self) -> BaseException: - if self._errored_with is not None: - return ENGINE_DEAD_ERROR(self._errored_with) - else: - return ENGINE_DEAD_ERROR() - - @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. " - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext, - enable_log_requests: bool, - disable_log_stats: bool, - ipc_path: str, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "MQLLMEngine": - # Setup plugins for each process - from vllm.plugins import load_general_plugins - load_general_plugins() - - use_async_sockets = vllm_config.model_config.use_async_output_proc - - return cls( - vllm_config=vllm_config, - executor_class=LLMEngine._get_executor_cls(vllm_config), - ipc_path=ipc_path, - usage_context=usage_context, - use_async_sockets=use_async_sockets, - log_requests=enable_log_requests, - log_stats=(not disable_log_stats), - ) - - @staticmethod - def from_engine_args(engine_args: AsyncEngineArgs, - usage_context: UsageContext, ipc_path: str): - """Creates an MQLLMEngine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - return MQLLMEngine.from_vllm_config( - ipc_path=ipc_path, - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - ) - - def start(self): - try: - try: - logger.debug("Starting Startup Loop.") - self.run_startup_loop() - logger.debug("Starting Engine Loop.") - self.run_engine_loop() - except Exception as e: - logger.exception(repr(e)) - except KeyboardInterrupt: - logger.debug("Shutting down MQLLMEngine.") - finally: - logger.debug("MQLLMEngine is shut down.") - self.cleanup() - - def cleanup(self): - """Cleanup zeromq state on shutdown.""" - # Closes all sockets and destroys context. 
- self.ctx.destroy(linger=0) - del self.engine - - @contextmanager - def make_data_socket( - self) -> Iterator[zmq.Socket]: # type: ignore[name-defined] - socket = self.ctx.socket(zmq.constants.ROUTER) - try: - socket.bind(self.data_ipc_path) - yield socket - finally: - socket.close(linger=0) - - def run_startup_loop(self) -> None: - """Startup loop for sending data from Engine -> Client.""" - - with self.make_data_socket() as socket: - response: Union[RPCStartupResponse, BaseException] - try: - identity, message = socket.recv_multipart(copy=False) - request: RPCStartupRequest = pickle.loads(message.buffer) - - # Handle the query from the Client. - if request == RPCStartupRequest.IS_SERVER_READY: - tracing_enabled = self.engine.is_tracing_enabled() - response = RPCStartupResponse( - tracing_enabled=tracing_enabled) - - except Exception as e: - response = e - - socket.send_multipart((identity, pickle.dumps(response)), - copy=False) - - def run_engine_loop(self): - """Core busy loop of the LLMEngine.""" - - while True: - if not self.engine.has_unfinished_requests(): - # Poll until there is work to do. - while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - # When there's no work, check on engine health and send - # health status back to client - self._health_check() - self.engine.do_log_stats() - logger.debug("Waiting for new requests in engine loop.") - - # Handle any input from the client. - self.handle_new_input() - - # Engine step. - request_outputs = self.engine_step() - - # Send request outputs (if async, done in engine_step callback). - if not self.use_async_sockets: - self._send_outputs(request_outputs) - - def engine_step(self) -> List[RequestOutput]: - """Engine step wrapper with error handling.""" - try: - return self.engine.step() - except SystemExit: - raise - except InputProcessingError as e: - # Special case where we handle an error preparing the inputs for - # a single request in the batch - rpc_err = RPCError(request_id=e.request_id, - is_engine_errored=False, - exception=e.__cause__) - self._send_outputs(rpc_err) - return [] - except BaseException as e: - self._set_errored(e) - rpc_err = RPCError(request_id=None, - is_engine_errored=True, - exception=e) - self._send_outputs(rpc_err) - raise e - - def handle_new_input(self): - """Handle new input from the socket""" - try: - while self.input_socket.poll(timeout=0) != 0: - frames = self.input_socket.recv_multipart(copy=False) - request = pickle.loads(frames[0].buffer) - - if isinstance(request, RPCProcessRequest): - if len(frames) > 1: - # Use cloudpickle for logits processors - assert isinstance(request.params, SamplingParams) - lprocs = cloudpickle.loads(frames[1].buffer) - request.params.logits_processors = lprocs - self._handle_process_request(request) - elif isinstance(request, RPCAbortRequest): - self._handle_abort_request(request) - elif isinstance(request, RPCUProfileRequest): - if request == RPCUProfileRequest.START_PROFILE: - self.start_profile() - else: - self.stop_profile() - elif isinstance(request, RPCLoadAdapterRequest): - self._handle_load_adapter_request(request) - elif isinstance(request, RPCResetMultiModalCacheRequest): - self.reset_mm_cache() - elif isinstance(request, RPCResetPrefixCacheRequest): - self.reset_prefix_cache() - elif isinstance(request, RPCSleepRequest): - self.sleep(request.value) - elif isinstance(request, RPCWakeUpRequest): - self.wake_up(request.tags) - elif isinstance(request, RPCIsSleepingRequest): - self._handle_is_sleeping_request(request) - else: - raise 
ValueError("Unknown RPCRequest Type: " - f"{type(request)}") - - except Exception as e: - self._set_errored(e) - self._send_unhealthy(e) - raise e from None - - def _handle_process_request(self, request: RPCProcessRequest): - """Handle RPCProcessRequest by adding it to the LLMEngine.""" - request_id = request.request_id - - if self._errored_with is not None: - rpc_err = RPCError(request_id=request_id, - is_engine_errored=True, - exception=ENGINE_DEAD_ERROR(self._errored_with)) - self._send_outputs(rpc_err) - - try: - self.engine.add_request(request_id=request_id, - prompt=request.prompt, - params=request.params, - lora_request=request.lora_request, - trace_headers=request.trace_headers, - priority=request.priority) - - if self.log_requests: - logger.info("Added request %s.", request.request_id) - - except Exception as e: - # We do not set self._errored = True here, since the error - # is due to an issue adding this request to the engine, - # rather than an issue with the engine itself. - logger.debug("Failed to add request %s to engine. %s", - request.request_id, e) - is_errored = self._errored_with is not None - rpc_err = RPCError(request_id=request_id, - is_engine_errored=is_errored, - exception=e) - self._send_outputs(rpc_err) - - # Remove request from the engine. - self.engine.abort_request(request_id) - - def _handle_abort_request(self, request: RPCAbortRequest): - self.engine.abort_request(request.request_id) - if self.log_requests: - logger.info("Aborted request %s.", request.request_id) - - def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): - try: - lora_loaded = self.engine.add_lora(request.lora_request) - except BaseException as e: - # Send back an error if the adater fails to load - rpc_err = RPCError(request_id=request.request_id, - is_engine_errored=False, - exception=e) - self._send_outputs(rpc_err) - return - # Otherwise, send back the successful load message - self._send_outputs( - RPCAdapterLoadedResponse(request_id=request.request_id, - lora_loaded=lora_loaded)) - - def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest): - is_sleeping = self.is_sleeping() - self._send_outputs( - RPCIsSleepingResponse(request_id=request.request_id, - is_sleeping=is_sleeping)) - - def _health_check(self): - # Send unhealthy if engine has already errored - if self._errored_with is not None: - self._send_unhealthy(self._errored_with) - try: - self.engine.check_health() - self._send_healthy() - except Exception as e: - self._set_errored(e) - self._send_unhealthy(e) - - def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): - """Send outputs back to the engine client. These can be: - - Exceptions - - A list of generation outputs - - A response from loading a lora adapter - """ - if outputs: - try: - from ray.exceptions import RayTaskError - - # RayTaskError might not pickelable here. We need to unpack the - # underlying exception as the real exception in the output. 
- if (isinstance(outputs, RPCError) - and isinstance(outputs.exception, RayTaskError)): - outputs.exception = outputs.exception.cause - except ImportError: - pass - - output_bytes = pickle.dumps(outputs) - self.output_socket.send_multipart((output_bytes, ), copy=False) - - def _send_healthy(self): - """Send HEALTHY message to RPCClient.""" - if not self.heartbeat_socket.closed: - self.heartbeat_socket.send_multipart(HEALTHY_RESPONSE, copy=False) - - def _send_unhealthy(self, error: BaseException): - """Send UNHEALTHY message to RPCClient.""" - if not self.heartbeat_socket.closed: - error_bytes = pickle.dumps(error) - self.heartbeat_socket.send_multipart((error_bytes, ), copy=False) - - def _async_socket_engine_callback(self, - request_outputs: REQUEST_OUTPUTS_T): - """Callback used by engine to make socket handling async with GPU.""" - self._send_outputs(request_outputs) - self.handle_new_input() - - def _set_errored(self, e: BaseException): - """Log and set errored status if this is the first issue.""" - if self._errored_with is None: - self._errored_with = e - - def start_profile(self) -> None: - self.engine.start_profile() - - def stop_profile(self) -> None: - self.engine.stop_profile() - - def reset_mm_cache(self) -> bool: - return self.engine.reset_mm_cache() - - def reset_prefix_cache(self) -> bool: - return self.engine.reset_prefix_cache() - - def sleep(self, level: int = 1) -> None: - self.engine.sleep(level) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - -def signal_handler(*_) -> None: - raise KeyboardInterrupt("MQLLMEngine terminated") - - -def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, - ipc_path: str, disable_log_stats: bool, - enable_log_requests: bool, engine_alive): - try: - # Ensure we can serialize transformer config before spawning - maybe_register_config_serialize_by_value() - - engine = MQLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - disable_log_stats=disable_log_stats, - enable_log_requests=enable_log_requests, - ipc_path=ipc_path) - - signal.signal(signal.SIGTERM, signal_handler) - - engine.start() - - except BaseException as e: - logger.exception(e) - engine_alive.value = False - raise e from None diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 887e277109240..c3195dbc4697f 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -12,7 +12,6 @@ from fastapi import FastAPI, Request, Response from vllm import envs from vllm.engine.async_llm_engine import AsyncEngineDeadError -from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) @@ -156,7 +155,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: @app.exception_handler(RuntimeError) @app.exception_handler(AsyncEngineDeadError) - @app.exception_handler(MQEngineDeadError) @app.exception_handler(EngineDeadError) @app.exception_handler(EngineGenerateError) async def runtime_exception_handler(request: Request, __): diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e4aa7f3d5a6f..527193c913394 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import asyncio -import atexit import gc import importlib import inspect @@ -17,7 +16,6 @@ import uuid from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager -from functools import partial from http import HTTPStatus from typing import Annotated, Any, Callable, Optional @@ -42,8 +40,6 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore -from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (load_chat_template, resolve_hf_chat_template, @@ -102,13 +98,10 @@ from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, - get_open_zmq_ipc_path, is_valid_ipv6_address, - set_ulimit) + is_valid_ipv6_address, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -237,8 +230,7 @@ async def build_async_engine_client_from_engine_args( async_llm.shutdown() # V0 AsyncLLM. - elif (MQLLMEngineClient.is_unsupported_config(vllm_config) - or disable_frontend_multiprocessing): + else: engine_client: Optional[EngineClient] = None try: @@ -252,96 +244,6 @@ async def build_async_engine_client_from_engine_args( if engine_client and hasattr(engine_client, "shutdown"): engine_client.shutdown() - # V0MQLLMEngine. - else: - if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - # Make TemporaryDirectory for prometheus multiprocessing - # Note: global TemporaryDirectory will be automatically - # cleaned up upon exit. - global prometheus_multiproc_dir - prometheus_multiproc_dir = tempfile.TemporaryDirectory() - os.environ[ - "PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name - else: - logger.warning( - "Found PROMETHEUS_MULTIPROC_DIR was set by user. " - "This directory must be wiped between vLLM runs or " - "you will find inaccurate metrics. Unset the variable " - "and vLLM will properly handle cleanup.") - - # Select random path for IPC. - ipc_path = get_open_zmq_ipc_path() - logger.debug("Multiprocessing frontend to use %s for IPC Path.", - ipc_path) - - # Start RPCServer in separate process (holds the LLMEngine). - # the current process might have CUDA context, - # so we need to spawn a new process - context = multiprocessing.get_context("spawn") - - # Ensure we can serialize transformer config before spawning - maybe_register_config_serialize_by_value() - - # The Process can raise an exception during startup, which may - # not actually result in an exitcode being reported. As a result - # we use a shared variable to communicate the information. 
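The comment above points at a real race: the child's exitcode can lag or be absent when startup fails inside it, so a lock-free multiprocessing.Value is set by the child as a failure beacon the parent can read immediately. A minimal demo of that shared-flag pattern (names invented for the demo):

import multiprocessing


def child(alive) -> None:
    try:
        raise RuntimeError("startup failed")
    except BaseException:
        alive.value = False  # visible to the parent even before any exitcode
        raise


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    alive = ctx.Value('b', True, lock=False)
    proc = ctx.Process(target=child, args=(alive, ))
    proc.start()
    proc.join()
    assert not alive.value  # parent learns of the failure via the flag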
- engine_alive = multiprocessing.Value('b', True, lock=False) - engine_process = context.Process( - target=run_mp_engine, - args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, - engine_args.disable_log_stats, - engine_args.enable_log_requests, engine_alive)) - engine_process.start() - engine_pid = engine_process.pid - assert engine_pid is not None, "Engine process failed to start." - logger.info("Started engine process with PID %d", engine_pid) - - def _cleanup_ipc_path(): - socket_path = ipc_path.replace("ipc://", "") - if os.path.exists(socket_path): - os.remove(socket_path) - - # Ensure we clean up the local IPC socket file on exit. - atexit.register(_cleanup_ipc_path) - - # Build RPCClient, which conforms to EngineClient Protocol. - build_client = partial(MQLLMEngineClient, ipc_path, vllm_config, - engine_pid) - mq_engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_client) - try: - while True: - try: - await mq_engine_client.setup() - break - except TimeoutError: - if (not engine_process.is_alive() - or not engine_alive.value): - raise RuntimeError( - "Engine process failed to start. See stack " - "trace for the root cause.") from None - - yield mq_engine_client # type: ignore[misc] - finally: - # Ensure rpc server process was terminated - engine_process.terminate() - - # Close all open connections to the backend - mq_engine_client.close() - - # Wait for engine process to join - engine_process.join(4) - if engine_process.exitcode is None: - # Kill if taking longer than 5 seconds to stop - engine_process.kill() - - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. - # See https://prometheus.github.io/client_python/multiprocess/ - from prometheus_client import multiprocess - multiprocess.mark_process_dead(engine_process.pid) - async def validate_json_request(raw_request: Request): content_type = raw_request.headers.get("content-type", "").lower() diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb8bff48c7b95..4f540fe965e22 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -191,7 +191,7 @@ class RocmPlatform(Platform): kv_cache_dtype, block_size, use_v1, use_mla, has_sink) -> str: if use_mla: - from vllm.attention.backends.rocm_aiter_mla import ( + from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( is_aiter_mla_enabled) if selected_backend is None: From 0f7acdd73ca6316c8ae0474c0a9c4fc264e87a7b Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Tue, 16 Sep 2025 22:01:04 -0700 Subject: [PATCH 345/584] [Model] Support Qwen3-VL Model Series (#24727) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 2 + examples/offline_inference/vision_language.py | 78 + .../multimodal/processing/test_common.py | 35 +- tests/models/registry.py | 6 + .../layers/rotary_embedding/__init__.py | 2 + .../layers/rotary_embedding/mrope.py | 144 +- vllm/model_executor/models/qwen2.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 1478 +++++++++++++++++ 
vllm/model_executor/models/qwen3_vl_moe.py | 344 ++++ vllm/model_executor/models/registry.py | 4 +- vllm/multimodal/video.py | 2 +- 13 files changed, 2084 insertions(+), 17 deletions(-) create mode 100644 vllm/model_executor/models/qwen3_vl.py create mode 100644 vllm/model_executor/models/qwen3_vl_moe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 73834ddd0c5d6..7aeaeca97699c 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -661,6 +661,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | +| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 929df8d8bebd9..de3f3afc17948 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1437,6 +1437,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# Qwen3-VL-Dense +def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "Qwen/Qwen3-VL-4B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + limit_mm_per_prompt={modality: 1}, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompts = [ + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# Qwen3-VL-MOE +def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData: + model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + limit_mm_per_prompt={modality: 1}, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompts = [ + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + 
"<|im_start|>assistant\n" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # R-4B def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1645,6 +1719,8 @@ model_example_map = { "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_omni": run_qwen2_5_omni, + "qwen3_vl": run_qwen3_vl, + "qwen3_vl_moe": run_qwen3_vl_moe, "rvl": run_r_vl, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, @@ -1658,6 +1734,8 @@ MODELS_NEED_VIDEO_METADATA = [ "glm4_1v", "glm4_5v", "glm4_5v_fp8", + "qwen3_vl", + "qwen3_vl_moe", ] diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index a272c840f8dac..0941cc3f608e7 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -31,6 +31,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: """ # Ensure video metadata is included if "video" in mm_data: + # GLM4.1V doesn't support multiple videos video = mm_data["video"] num_frames = len(video) mm_data["video"] = (video, { @@ -44,6 +45,34 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: return mm_data +def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: + """ + Patch the multimodal data for Qwen3-VL model. + """ + + def create_metadata(frames: np.ndarray): + num_frames = len(frames) + return { + "total_num_frames": num_frames, + "fps": 2.0, + "duration": num_frames / 2.0, + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + "do_sample_frames": True, + } + + # Ensure video metadata is included + if "video" in mm_data: + video = mm_data["video"] + if isinstance(video, list): + # multiple videos + mm_data["video"] = [(vid, create_metadata(vid)) for vid in video] + else: + # single video + mm_data["video"] = (video, create_metadata(video)) + return mm_data + + def _test_processing_correctness( model_id_or_arch: str, hit_rate: float, @@ -182,8 +211,10 @@ _IGNORE_MM_KEYS = { } MM_DATA_PATCHES = { - # GLM4.1V requires video metadata to be included in the input + # GLM4.1V and Qwen3-VL requires video metadata to be included in the input "glm4v": glm4_1v_patch_mm_data, + "qwen3_vl": qwen3_vl_patch_mm_data, + "qwen3_vl_moe": qwen3_vl_patch_mm_data, } @@ -326,6 +357,8 @@ def _test_processing_correctness_one( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", + "Qwen/Qwen3-VL-4B-Instruct", + "Qwen/Qwen3-VL-30B-A3B-Instruct", "YannQi/R-4B", "Skywork/Skywork-R1V-38B", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index 9aef08769fb22..93aa9d4025498 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -557,6 +557,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 + "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501 + max_model_len=4096, + min_transformers_version="4.57"), # noqa: E501 + "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501 + max_model_len=4096, + min_transformers_version="4.57"), "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", trust_remote_code=True), "SkyworkR1VChatModel": 
_HfExamplesInfo("Skywork/Skywork-R1V-38B", diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 564f9a5c00750..c9653aa9e4405 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -103,6 +103,8 @@ def get_rope( is_neox_style, dtype, mrope_section=rope_scaling["mrope_section"], + mrope_interleaved=rope_scaling.get("mrope_interleaved", + False), ) else: rotary_emb = RotaryEmbedding( diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 69849fdac0277..ef61dbc1a5ab1 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -177,6 +177,18 @@ def triton_mrope( return q, k +def apply_interleaved_rope(x: torch.Tensor, + mrope_section: list[int]) -> torch.Tensor: + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. + """ + x_t = x[0].clone() + x_t[..., 1:mrope_section[1] * 3:3] = x[1, ..., 1:mrope_section[1] * 3:3] + x_t[..., 2:mrope_section[2] * 3:3] = x[2, ..., 2:mrope_section[2] * 3:3] + return x_t + + class MRotaryEmbedding(RotaryEmbedding): """Rotary Embedding with Multimodal Sections.""" @@ -189,6 +201,7 @@ class MRotaryEmbedding(RotaryEmbedding): is_neox_style: bool, dtype: torch.dtype, mrope_section: Optional[list[int]] = None, + mrope_interleaved: Optional[bool] = False, ) -> None: # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get @@ -198,6 +211,7 @@ class MRotaryEmbedding(RotaryEmbedding): base, is_neox_style, dtype) self.mrope_section = mrope_section + self.mrope_interleaved = mrope_interleaved if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 @@ -225,17 +239,20 @@ class MRotaryEmbedding(RotaryEmbedding): cos, sin = cos_sin.chunk(2, dim=-1) if positions.ndim == 2: assert self.mrope_section - - cos = torch.cat([ - m[i] - for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) - ], - dim=-1) - sin = torch.cat([ - m[i] - for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) - ], - dim=-1) + if self.mrope_interleaved: + cos = apply_interleaved_rope(cos, self.mrope_section) + sin = apply_interleaved_rope(sin, self.mrope_section) + else: + cos = torch.cat([ + m[i] for i, m in enumerate( + cos.split(self.mrope_section, dim=-1)) + ], + dim=-1) + sin = torch.cat([ + m[i] for i, m in enumerate( + sin.split(self.mrope_section, dim=-1)) + ], + dim=-1) query_shape = query.shape query = query.view(num_tokens, -1, self.head_size) @@ -265,6 +282,10 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None + if self.mrope_interleaved: + # TODO: add triton implementation to support mrope-interleaved + return self.forward_native(positions, query, key) + num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) @@ -388,6 +409,15 @@ class MRotaryEmbedding(RotaryEmbedding): context_len=context_len, seq_len=seq_len, ) + elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]: + return cls._qwen3vl_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + 
context_len=context_len, + seq_len=seq_len, + ) elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: return cls._ernie_get_input_positions_tensor( input_tokens=input_tokens, @@ -526,6 +556,98 @@ class MRotaryEmbedding(RotaryEmbedding): len(input_tokens)).item() return llm_positions, mrope_position_delta + @classmethod + def _qwen3vl_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + + video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw + for _ in range(t)] + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + return llm_positions, mrope_position_delta + @classmethod def _ernie_get_input_positions_tensor( cls, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 
54dc0bebd9c5e..e13e87b93429d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -285,7 +285,7 @@ class Qwen2Model(nn.Module): decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer): super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config.get_text_config() cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d08181c5fd53b..b6576b783b64a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -83,7 +83,7 @@ from .vision import get_vit_attn_backend logger = init_logger(__name__) # For profile run -_MAX_FRAMES_PER_VIDEO = 16 +_MAX_FRAMES_PER_VIDEO = 600 # === Vision Inputs === # diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 85429b3a01f92..0a504d90cde1f 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -378,7 +378,7 @@ class Qwen3MoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config.get_text_config() cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py new file mode 100644 index 0000000000000..22948aee4936c --- /dev/null +++ b/vllm/model_executor/models/qwen3_vl.py @@ -0,0 +1,1478 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
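For concreteness, the t/h/w index construction added to mrope.py above produces one row per axis over the flattened patch grid. A tiny standalone check, with grid values chosen just for the example (one frame split into a 2x2 grid of merged patches):

import torch

t, h, w = 1, 2, 2
t_index = torch.arange(t).view(-1, 1).expand(-1, h * w).flatten()
h_index = torch.arange(h).view(1, -1, 1).expand(t, -1, w).flatten()
w_index = torch.arange(w).view(1, 1, -1).expand(t, h, -1).flatten()
positions = torch.stack([t_index, h_index, w_index])
# tensor([[0, 0, 0, 0],    <- temporal coordinate of each patch
#         [0, 0, 1, 1],    <- row (height) coordinate
#         [0, 1, 0, 1]])   <- column (width) coordinate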
+"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BatchFeature +from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast +from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize +from transformers.models.qwen3_vl import (Qwen3VLProcessor, + Qwen3VLVideoProcessor) +from transformers.models.qwen3_vl.configuration_qwen3_vl import ( + Qwen3VLConfig, Qwen3VLVisionConfig) +from transformers.video_utils import VideoMetadata + +from vllm.attention.layer import check_upstream_fa_availability +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargsItem, + MultiModalKwargsItems, VideoItem) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + MultiModalDataParser) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + PromptReplacement, PromptUpdate, + PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import uses_mrope +from vllm.utils import is_list_of + +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .qwen2_5_vl import (Qwen2_5_VisionAttention, + Qwen2_5_VisionRotaryEmbedding, + Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, + Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLVideoEmbeddingInputs, Qwen2_5_VLVideoInputs, + Qwen2_5_VLVideoPixelInputs) +from .qwen2_vl import Qwen2VLProcessingInfo +from .qwen3 import Qwen3ForCausalLM, Qwen3Model +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + maybe_prefix, merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + + +class Qwen3_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + hidden_size: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.hidden_size = hidden_size + + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=True) + + def forward(self, x: torch.Tensor) -> 
torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) + x = self.proj(x).view(L, self.hidden_size) + return x + + +class Qwen3_VisionMLP(nn.Module): + + def __init__(self, + in_features: int, + hidden_features: int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.linear_fc1 = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + return_bias=False, + prefix=f"{prefix}.linear_fc1") + self.linear_fc2 = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + return_bias=False, + prefix=f"{prefix}.linear_fc2") + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + mlp_output = self.linear_fc2(self.act_fn(self.linear_fc1(x))) + return mlp_output + + +class Qwen3_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.mlp = Qwen3_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen3_VisionPatchMerger(nn.Module): + + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + spatial_merge_size: int = 2, + use_postshuffle_norm: bool = False, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + + self.use_postshuffle_norm = use_postshuffle_norm + if self.use_postshuffle_norm: + context_dim = self.hidden_size + + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.use_postshuffle_norm = use_postshuffle_norm + self.norm = norm_layer( + self.hidden_size if use_postshuffle_norm else context_dim) + self.linear_fc1 = ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_fc1") + self.act_fn = nn.GELU() + self.linear_fc2 = RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_fc2") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.use_postshuffle_norm: + x = self.norm(x.view(-1, self.hidden_size)) + else: + x = self.norm(x).view(-1, self.hidden_size) + + x_parallel, _ = self.linear_fc1(x) + x_parallel = self.act_fn(x_parallel) + out, _ = self.linear_fc2(x_parallel) + return out + + +class 
Qwen3_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen3VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + self.num_position_embeddings = vision_config.num_position_embeddings + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.spatial_merge_unit = self.spatial_merge_size**2 + self.temporal_patch_size = vision_config.temporal_patch_size + self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes + + self.patch_embed = Qwen3_VisionPatchEmbed( + patch_size=self.patch_size, + temporal_patch_size=self.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) + + self.pos_embed = nn.Embedding(self.num_position_embeddings, + self.hidden_size) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Qwen3_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(vision_config.depth) + ]) + + self.merger = Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) + + self.deepstack_merger_list = nn.ModuleList([ + Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + spatial_merge_size=self.spatial_merge_size, + use_postshuffle_norm=True, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}") + for layer_idx in range(len(self.deepstack_visual_indexes)) + ]) + + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability( + torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def 
fast_pos_embed_interpolate(self, grid_thw): + num_grid_per_side = int(self.num_position_embeddings**0.5) + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in grid_thw: + h_idxs = torch.linspace(0, + num_grid_per_side - 1, + h, + dtype=torch.float32) + w_idxs = torch.linspace(0, + num_grid_per_side - 1, + w, + dtype=torch.float32) + + h_idxs_floor = h_idxs.to(torch.long) + w_idxs_floor = w_idxs.to(torch.long) + h_idxs_ceil = torch.clamp(h_idxs.to(torch.long) + 1, + max=num_grid_per_side - 1) + w_idxs_ceil = torch.clamp(w_idxs.to(torch.long) + 1, + max=num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + idx_list[0].extend(((h_idxs_floor * num_grid_per_side)[None].T + + w_idxs_floor[None]).flatten().tolist() * t) + idx_list[1].extend(((h_idxs_floor * num_grid_per_side)[None].T + + w_idxs_ceil[None]).flatten().tolist() * t) + idx_list[2].extend(((h_idxs_ceil * num_grid_per_side)[None].T + + w_idxs_floor[None]).flatten().tolist() * t) + idx_list[3].extend(((h_idxs_ceil * num_grid_per_side)[None].T + + w_idxs_ceil[None]).flatten().tolist() * t) + + weight_list[0].extend( + ((1 - dh)[None].T * (1 - dw)[None]).flatten().tolist() * t) + weight_list[1].extend( + ((1 - dh)[None].T * dw[None]).flatten().tolist() * t) + weight_list[2].extend( + (dh[None].T * (1 - dw)[None]).flatten().tolist() * t) + weight_list[3].extend( + (dh[None].T * dw[None]).flatten().tolist() * t) + + device = self.pos_embed.weight.device + dtype = self.pos_embed.weight.dtype + + p0 = self.pos_embed( + torch.tensor( + idx_list[0], dtype=torch.long, device=device)) * torch.tensor( + weight_list[0], dtype=dtype, device=device)[:, None] + p1 = self.pos_embed( + torch.tensor( + idx_list[1], dtype=torch.long, device=device)) * torch.tensor( + weight_list[1], dtype=dtype, device=device)[:, None] + p2 = self.pos_embed( + torch.tensor( + idx_list[2], dtype=torch.long, device=device)) * torch.tensor( + weight_list[2], dtype=dtype, device=device)[:, None] + p3 = self.pos_embed( + torch.tensor( + idx_list[3], dtype=torch.long, device=device)) * torch.tensor( + weight_list[3], dtype=dtype, device=device)[:, None] + + patch_pos_embeds = p0 + p1 + p2 + p3 + patch_pos_embeds = patch_pos_embeds.split( + [t * h * w for t, h, w in grid_thw]) + patch_pos_embeds_permute = [] + m_size = self.spatial_merge_size + for pos_embed, (t, h, w) in zip(patch_pos_embeds, grid_thw): + pos_embed = pos_embed.view(t, h // m_size, m_size, w // m_size, + m_size, -1).permute(0, 1, 3, 2, 4, + 5).flatten(0, 4) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def compute_attn_mask_seqlen( + self, + cu_seqlens: torch.Tensor, + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + 
dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + hidden_states = hidden_states.unsqueeze(1) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index( + layer_num) + deepstack_feature = self.deepstack_merger_list[ + deepstack_merger_idx](hidden_states) + deepstack_feature_lists.append(deepstack_feature) + hidden_states = self.merger(hidden_states) + hidden_states = torch.cat( + [hidden_states] + deepstack_feature_lists, + dim=1) # [seq_len, hidden_size * (1 + depth_of_deepstack)] + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("attn.qkv.", "attn.q.", "q"), + ("attn.qkv.", "attn.k.", "k"), + ("attn.qkv.", "attn.v.", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3VLConfig) + + def get_hf_processor(self, **kwargs: object) -> Qwen3VLProcessor: + return self.ctx.get_hf_processor( + Qwen3VLProcessor, + use_fast=kwargs.pop("use_fast", True), + **kwargs, + ) + + def get_tokenizer(self): + return self.ctx.tokenizer + + def get_image_processor(self, + **kwargs: object) -> Qwen2VLImageProcessorFast: + return self.get_hf_processor(**kwargs).image_processor + + def get_video_processor(self, **kwargs: object) -> Qwen3VLVideoProcessor: + return self.get_hf_processor(**kwargs).video_processor + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 2, + do_resize: bool = True, + image_processor: Optional[Qwen2VLImageProcessorFast], + ) -> tuple[ImageSize, int]: + if image_processor is None: + image_processor = self.get_image_processor() + + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + temporal_patch_size = vision_config.temporal_patch_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.size["shortest_edge"], + max_pixels=image_processor.size["longest_edge"], + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + padded_num_frames = num_frames + num_frames % temporal_patch_size + + grid_t = max(padded_num_frames // 
temporal_patch_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_vision_tokens = num_patches // (merge_size**2) + + return preprocessed_size, num_vision_tokens + + def _calculate_timestamps(self, indices: list[int] | torch.Tensor, + video_fps: float, merge_size: int): + if not isinstance(indices, list): + indices = indices.tolist() + if len(indices) % merge_size != 0: + # don't update metadata's frames_indices directly + indices = indices + [indices[-1] + ] * (merge_size - len(indices) % merge_size) + timestamps = [idx / video_fps for idx in indices] + timestamps = [(timestamps[i] + timestamps[i + merge_size - 1]) / 2 + for i in range(0, len(timestamps), merge_size)] + return timestamps + + def _get_video_second_idx( + self, + metadata: dict[str, Any], + out_item: MultiModalKwargsItem, + do_sample_frames: Optional[bool] = None, + sampled_fps: Optional[float] = None) -> list[int]: + video_processor = self.get_video_processor() + merge_size = video_processor.merge_size + indices = metadata["frames_indices"] + + # metadata["fps"] refers to the true fps of the input video. + video_fps = metadata["fps"] + if do_sample_frames is None: + do_sample_frames = metadata.get("do_sample_frames", False) + + # If video frames are sampled in HF processor (instead of vLLM + # video loader), we need to re-calculate the indices from original + # metadata. + if do_sample_frames: + # here video_fps is the fps of the sampled video, and + # metadata["fps"] refers to the fps of the original video. + video_fps = sampled_fps if sampled_fps else video_processor.fps + total_num_frames = metadata["total_num_frames"] + num_frames = int(total_num_frames / metadata["fps"] * video_fps) + num_frames = min( + min(max(num_frames, video_processor.min_frames), + video_processor.max_frames), total_num_frames) + indices = np.linspace(0, total_num_frames - 1, + num_frames).round().astype(int).tolist() + timestamps = self._calculate_timestamps(indices, video_fps, merge_size) + return timestamps + + +class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_token = "<|vision_start|><|image_pad|><|vision_end|>" + video_token = "<|vision_start|><|video_pad|><|vision_end|>" + + return image_token * num_images + video_token * num_videos + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + target_width, target_height = ( + self.info.get_image_size_with_most_features()) + target_num_frames = self.info.get_num_frames_with_most_features( + seq_len, mm_counts) + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos, + ), + } + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[VideoItem]: + num_frames = max(num_frames, 2) + video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) + video_items = [] + for i in range(num_videos): + video_metadata = { + "fps": 2.0, + "duration": num_frames / 2.0, + "total_num_frames": num_frames, + "frames_indices": 
list(range(num_frames)),
+                "video_backend": "opencv",
+                "do_sample_frames": False,
+            }
+            video_item = (video.copy(), video_metadata)
+            video_items.append(video_item)
+        return video_items
+
+
+class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]
+                                 ):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(video_needs_metadata=True)
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        mm_data = dict(mm_data)
+        processor = self.info.get_hf_processor(**mm_kwargs)
+
+        # Separate video processing from image processing, because the
+        # videos are processed into several image patches.
+        if ("videos" in mm_data and isinstance(mm_data["videos"], list)
+                and len(mm_data["videos"]) > 0):
+            video_grid_thw_lst = []
+            pixel_values_videos_lst = []
+
+            for item_idx, item in enumerate(mm_data.pop("videos", [])):
+                video_array, metadata = item
+
+                # NOTE: @JJJYmmm new attr metadata.frames_indices indicates
+                # the sampled frames indices of pre-sampled videos, which is
+                # used to calculate the timestamps. Make sure that
+                # do_sample_frames in mm_kwargs is false for pre-sampled
+                # videos.
+
+                # NOTE: a copy of mm_kwargs is created to update
+                # do_sample_frames, otherwise mm_hash for the object will
+                # be incorrect.
+                video_mm_kwargs = dict(**mm_kwargs)
+                if "do_sample_frames" not in video_mm_kwargs:
+                    # qwen_vl_utils already has "do_sample_frames" in
+                    # mm_kwargs, don't overwrite it.
+                    video_mm_kwargs["do_sample_frames"] = metadata.get(
+                        "do_sample_frames", False)
+
+                metadata = VideoMetadata(**{
+                    k: metadata[k]
+                    for k in metadata if k != "do_sample_frames"
+                })
+
+                video_mm_data = dict()
+                video_mm_data["videos"] = [[video_array]]
+                video_mm_data["video_metadata"] = [[metadata]]
+
+                video_outputs = super()._call_hf_processor(
+                    prompt="<|vision_start|><|video_pad|><|vision_end|>",
+                    mm_data=video_mm_data,
+                    mm_kwargs=video_mm_kwargs,
+                    tok_kwargs=tok_kwargs,
+                )
+                input_ids = video_outputs.pop("input_ids")
+                video_placeholder = processor.tokenizer.batch_decode(
+                    input_ids)[0]
+                prompt = prompt.replace(
+                    "<|vision_start|><|video_pad|><|vision_end|>",
+                    video_placeholder,
+                    1,
+                )
+
+                video_grid_thw_lst.append(video_outputs["video_grid_thw"])
+                pixel_values_videos_lst.append(
+                    video_outputs["pixel_values_videos"])
+            video_outputs = dict(
+                pixel_values_videos=torch.cat(pixel_values_videos_lst),
+                video_grid_thw=torch.cat(video_grid_thw_lst),
+            )
+        else:
+            video_outputs = dict()
+
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+        combined_outputs = dict(
+            processed_outputs,
+            **video_outputs,
+        )
+        return BatchFeature(combined_outputs)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
+        image_grid_sizes = image_grid_thw.prod(-1)
+
+        video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
+        video_grid_sizes = video_grid_thw.prod(-1)
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_grid_sizes),
+            image_embeds=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_grid_sizes),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+            pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
+                "video",
video_grid_sizes), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + hf_config = self.info.get_hf_config() + + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + vision_end_token_id = hf_config.vision_end_token_id + + merge_length = image_processor.merge_size**2 + + def get_image_replacement_qwen3vl(item_idx: int): + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + num_tokens = int(grid_thw.prod()) // merge_length + return [hf_processor.image_token_id] * num_tokens + + def get_video_replacement_qwen3vl(item_idx: int): + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + video, metadata = mm_items["video"][item_idx] + do_sample_frames = hf_processor_mm_kwargs.get("do_sample_frames") + sampled_fps = hf_processor_mm_kwargs.get("fps") + if is_list_of(sampled_fps, float): + sampled_fps = sampled_fps[item_idx] + timestamps = self.info._get_video_second_idx( + metadata, out_item, do_sample_frames, sampled_fps) + + assert len(timestamps) == grid_thw[0], ( + f"The timestamps length({len(timestamps)}) should be equal " + f"video length ({grid_thw[0]}).") + + frames_idx_token = [ + tokenizer.encode(f"<{curr_time:.1f} seconds>", + add_special_tokens=False) + for curr_time in timestamps + ] + num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length + placeholder = [] + for frame_idx in frames_idx_token: + placeholder.extend(frame_idx) + placeholder.extend([vision_start_token_id] + + [video_token_id] * num_tokens_per_frame + + [vision_end_token_id]) + return PromptUpdateDetails.select_token_id(placeholder, + video_token_id) + + return [ + PromptReplacement( + modality="image", + target=hf_processor.image_token, + replacement=get_image_replacement_qwen3vl, + ), + + # NOTE: We match string on purpose since searching sequence of + # token ids takes more time. + PromptReplacement( + modality="video", + target="<|vision_start|><|video_pad|><|vision_end|>", + replacement=get_video_replacement_qwen3vl, + ), + ] + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + # the same shape as input_embeds + "deepstack_input_embeds": 0 + }) +class Qwen3LLMModel(Qwen3Model): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + if not get_pp_group().is_first_rank: + assert self.start_layer >= len( + vllm_config.model_config.hf_config.vision_config. 
+ deepstack_visual_indexes), ( + "start_layer should be greater than or equal to " + "len(deepstack_visual_indexes)") + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + # args for deepstack + deepstack_input_embeds: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for layer_idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + layer_idx = layer_idx + self.start_layer + + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + if deepstack_input_embeds is not None and \ + layer_idx in range(0, len(deepstack_input_embeds)): + hidden_states = hidden_states + deepstack_input_embeds[ + f"deepstack_input_embeds_{layer_idx}"] + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Qwen3LLMForCausalLM(Qwen3ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3ForCausalLM, self).__init__() + config = vllm_config.model_config.hf_config.text_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = Qwen3LLMModel(vllm_config=vllm_config, prefix=prefix) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix="lm_head") + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # To ensure correct weight loading and mapping. 
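+    # Illustration (hypothetical checkpoint names) of what this prefix
+    # remapping does during weight loading:
+    #   "model.visual.blocks.0.attn.qkv.weight"
+    #       -> "visual.blocks.0.attn.qkv.weight"
+    #   "model.language_model.layers.0.self_attn.q_proj.weight"
+    #       -> "language_model.model.layers.0.self_attn.q_proj.weight"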
+ hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.visual.": "visual.", + "lm_head.": "language_model.lm_head.", + "model.language_model.": "language_model.model.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|vision_start|><|image_pad|><|vision_end|>" + if modality.startswith("video"): + return "<|vision_start|><|video_pad|><|vision_end|>" + + raise ValueError("Only image or video modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + super().__init__() + config: Qwen3VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, + "language_model")) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + self.use_deepstack = hasattr(config.vision_config, + 'deepstack_visual_indexes') + self.deepstack_num_level = len( + config.vision_config.deepstack_visual_indexes + ) if self.use_deepstack else 0 + # register buffer for deepstack + self.deepstack_input_embeds = [ + torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens, + config.text_config.hidden_size) + for _ in range(self.deepstack_num_level) + ] if self.use_deepstack else None + + def _get_deepstack_input_embeds(self, + num_tokens: int) -> IntermediateTensors: + # get deepstack_input_embeds from buffer, and clear the buffer + return IntermediateTensors({ + f"deepstack_input_embeds_{idx}": + self.deepstack_input_embeds[idx][:num_tokens] + for idx in range(self.deepstack_num_level) + }) + + def _set_deepstack_input_embeds( + self, deepstack_input_embeds: torch.Tensor) -> None: + # set deepstack_input_embeds to buffer + num_tokens = deepstack_input_embeds.size(1) + if num_tokens > self.deepstack_input_embeds[0].size(0): + self.deepstack_input_embeds = [ + torch.zeros(num_tokens, + self.config.text_config.hidden_size, + device=self.deepstack_input_embeds[0].device, + dtype=self.deepstack_input_embeds[0].dtype) + for _ in range(self.deepstack_num_level) + ] + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].copy_( + deepstack_input_embeds[idx]) + + def _clear_deepstack_input_embeds(self, num_tokens: int) -> None: + # clear deepstack_input_embeds in buffer + if num_tokens > 0: + for idx in range(self.deepstack_num_level): + self.deepstack_input_embeds[idx][:num_tokens].zero_() + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. 
" + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return Qwen2_5_VLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + second_per_grid_ts = kwargs.pop("second_per_grid_ts", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + return Qwen2_5_VLVideoEmbeddingInputs( + type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. 
+        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
+        merge_size = self.visual.spatial_merge_size
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
+        return image_embeds.split(sizes)
+
+    def _process_video_input(
+            self,
+            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
+
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each video item.
+        # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
+        merge_size = self.visual.spatial_merge_size
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
+        return video_embeds.split(sizes)
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        mm_input_by_modality = {}
+        for input_key in kwargs:
+            if input_key in ("pixel_values", "image_embeds"
+                             ) and "image" not in mm_input_by_modality:
+                mm_input_by_modality[
+                    "image"] = self._parse_and_validate_image_input(**kwargs)
+            if input_key in ("pixel_values_videos", "video_embeds"
+                             ) and "video" not in mm_input_by_modality:
+                mm_input_by_modality[
+                    "video"] = self._parse_and_validate_video_input(**kwargs)
+        return mm_input_by_modality
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+
+        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
+            **kwargs)
+        if not mm_input_by_modality:
+            return None
+
+        # The result multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
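+        # Python dicts preserve insertion order, so embeddings are emitted
+        # in the order the modality inputs first appeared in kwargs, which
+        # is assumed to match the order of their placeholders in the
+        # prompt.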
+ for modality in mm_input_by_modality: + multimodal_input = mm_input_by_modality[modality] + if modality == "image": + vision_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += vision_embeddings + if modality == "video": + video_embeddings = self._process_video_input(multimodal_input) + multimodal_embeddings += video_embeddings + return multimodal_embeddings + + def _compute_deepstack_embeds( + self, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings) -> torch.Tensor: + visual_lens = [ + x.shape[0] if isinstance(x, torch.Tensor) else len(x) + for x in multimodal_embeddings + ] + multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) + + visual_dim = multimodal_embeddings_cat.shape[-1] // ( + self.deepstack_num_level + 1) + + main_dim, multi_dim = visual_dim, visual_dim * self.deepstack_num_level + multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 + multimodal_embeddings_cat, [main_dim, multi_dim], + dim=-1) + + multimodal_embeddings = torch.split(multimodal_embeddings_main, + visual_lens, + dim=0) + multimodal_embeddings_multiscale = torch.split( + multimodal_embeddings_multiscale, visual_lens, dim=0) + + deepstack_input_embeds = inputs_embeds.new_zeros( + inputs_embeds.size(0), + self.deepstack_num_level * inputs_embeds.size(1)) + + deepstack_input_embeds = merge_multimodal_embeddings( + input_ids, + deepstack_input_embeds, + multimodal_embeddings_multiscale, + placeholder_token_id=[ + self.config.image_token_id, self.config.video_token_id + ], + ) + deepstack_input_embeds = deepstack_input_embeds.view( + inputs_embeds.shape[0], self.deepstack_num_level, + visual_dim).contiguous() + deepstack_input_embeds = deepstack_input_embeds.permute( + 1, 0, 2).contiguous() + return deepstack_input_embeds, multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + deepstack_input_embeds = None + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None and self.use_deepstack: + deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 + input_ids, inputs_embeds, multimodal_embeddings) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + + if self.use_deepstack: + if deepstack_input_embeds is None: + deepstack_input_embeds = torch.zeros_like( + inputs_embeds).unsqueeze(0).repeat( + self.deepstack_num_level, 1, 1).contiguous() + self._set_deepstack_input_embeds(deepstack_input_embeds) + + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[Qwen2_5_VLImageInputs] = None, + video_input: Optional[Qwen2_5_VLVideoInputs] = None, + ) -> torch.Tensor: + inputs_embeds = self.get_input_embeddings(input_ids) + + if self.use_deepstack: + visual_dim = inputs_embeds.shape[-1] + deepstack_input_embeds = None + if image_input is not None or video_input is not None: + deepstack_input_embeds = torch.zeros_like( + inputs_embeds).unsqueeze(1).repeat( + 1, self.deepstack_num_level, 1).flatten(1) + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + if self.use_deepstack: + image_embeds = torch.cat(image_embeds) + + image_embeds, image_embeds_multiscale = image_embeds.split( + [visual_dim, 
visual_dim * self.deepstack_num_level],
+                    dim=-1)
+
+                deepstack_input_embeds = merge_multimodal_embeddings(
+                    input_ids,
+                    deepstack_input_embeds,
+                    image_embeds_multiscale,
+                    placeholder_token_id=self.config.image_token_id,
+                )
+
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+                placeholder_token_id=self.config.image_token_id,
+            )
+
+        if video_input is not None:
+            video_embeds = self._process_video_input(video_input)
+            if self.use_deepstack:
+                video_embeds = torch.cat(video_embeds)
+
+                video_embeds, video_embeds_multiscale = video_embeds.split(
+                    [visual_dim, visual_dim * self.deepstack_num_level],
+                    dim=-1)
+
+                deepstack_input_embeds = merge_multimodal_embeddings(
+                    input_ids,
+                    deepstack_input_embeds,
+                    video_embeds_multiscale,
+                    placeholder_token_id=self.config.video_token_id,
+                )
+
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                video_embeds,
+                placeholder_token_id=self.config.video_token_id,
+            )
+
+        if self.use_deepstack and deepstack_input_embeds is not None:
+            deepstack_input_embeds = deepstack_input_embeds.view(
+                inputs_embeds.shape[0], self.deepstack_num_level,
+                visual_dim).permute(1, 0, 2).contiguous()
+            self._set_deepstack_input_embeds(deepstack_input_embeds)
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Run forward pass for Qwen3VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen3VL
+                open-source models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,)`.
+            pixel_values: Pixel values to be fed to a model.
+                `None` if no images are passed.
+            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
+                `None` if no images are passed.
+            pixel_values_videos: Pixel values of videos to be fed to a model.
+                `None` if no videos are passed.
+            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
+                `None` if no videos are passed.
+        """
+
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner from
+        # `get_multimodal_embeddings` and `get_input_embeddings`, this
+        # condition is only for v0 compatibility.
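+        # A rough sketch of the v1 flow referred to above (model-runner
+        # side; names approximate, not the exact runner code):
+        #   mm_embeds = model.get_multimodal_embeddings(**mm_kwargs)
+        #   inputs_embeds = model.get_input_embeddings(input_ids, mm_embeds)
+        #   hidden_states = model(input_ids=None, positions=positions,
+        #                         inputs_embeds=inputs_embeds)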
+ elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None + + if self.use_deepstack and inputs_embeds is not None and get_pp_group( + ).is_first_rank: + deepstack_input_embeds = self._get_deepstack_input_embeds( + inputs_embeds.size(0)) + else: + deepstack_input_embeds = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + # args for deepstack + deepstack_input_embeds=deepstack_input_embeds, + ) + + if inputs_embeds is not None and get_pp_group().is_first_rank: + self._clear_deepstack_input_embeds(inputs_embeds.size(0)) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="model.visual.merger", + tower_model="model.visual.", + ) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py new file mode 100644 index 0000000000000..a800e94ab1e50 --- /dev/null +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -0,0 +1,344 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
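+#
+# A rough summary of the deepstack plumbing this file reuses from
+# qwen3_vl.py (see Qwen3VLForConditionalGeneration):
+#   1. get_input_embeddings() splits the visual features into a main part
+#      and per-level multiscale parts, stashing the latter via
+#      _set_deepstack_input_embeds().
+#   2. forward() reads them back with _get_deepstack_input_embeds(), and
+#      the language model adds level i to the hidden states produced by
+#      layer i.
+#   3. forward() clears the buffer with _clear_deepstack_input_embeds().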
+"""Inference-only Qwen3-VL-MoE model compatible with HuggingFace weights.""" +import typing +from collections.abc import Iterable +from typing import Callable, Optional, Union + +import torch +from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import ( + Qwen3VLMoeConfig) + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors + +from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel +from .qwen3_vl import (Qwen3_VisionTransformer, Qwen3VLDummyInputsBuilder, + Qwen3VLForConditionalGeneration, + Qwen3VLMultiModalProcessor, Qwen3VLProcessingInfo) +from .utils import is_pp_missing_parameter, maybe_prefix + +logger = init_logger(__name__) + + +class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3VLMoeConfig) + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + # the same shape as input_embeds + "deepstack_input_embeds": 0 + }) +class Qwen3MoeLLMModel(Qwen3MoeModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + if not get_pp_group().is_first_rank: + assert self.start_layer >= len( + vllm_config.model_config.hf_config.vision_config. 
+ deepstack_visual_indexes), ( + "start_layer should be greater than or equal to " + "len(deepstack_visual_indexes)") + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + deepstack_input_embeds: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for layer_idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + layer_idx = layer_idx + self.start_layer + + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + if deepstack_input_embeds is not None and \ + layer_idx in range(0, len(deepstack_input_embeds)): + hidden_states = hidden_states + deepstack_input_embeds[ + f"deepstack_input_embeds_{layer_idx}"] + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_fused_expert_weights(self, name: str, params_dict: dict, + loaded_weight: torch.Tensor, shard_id: str, + num_experts: int): + param = params_dict[name] + weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + for expert_id in range(num_experts): + curr_expert_weight = loaded_weight[expert_id] + success = weight_loader(param, + curr_expert_weight, + name, + shard_id, + expert_id, + return_success=True) + if not success: + return False + return True + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + # Skip loading extra parameters for GPTQ/modelopt models. + ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale", + ".v_scale", "_v_scale", ".weight_scale", + "_weight_scale", ".input_scale", "_input_scale") + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + is_fused_expert = False + fused_expert_params_mapping = [ + ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), + ("experts.w2_weight", "experts.down_proj", 0, "w2"), + ] + num_experts = self.config.num_experts + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if ("experts.gate_up_proj" in name + or "experts.down_proj" in name): + is_fused_expert = True + expert_params_mapping = fused_expert_params_mapping + + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. 
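+                    # Example of the breakage this avoids (hypothetical
+                    # name): "mlp.experts.0.gate_proj.weight" would first
+                    # be renamed to "mlp.experts.0.gate_up_proj.weight"
+                    # here, and the expert mapping below would then no
+                    # longer match the original checkpoint name.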
+ if "mlp.experts" in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith(ignore_suffixes) and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + if is_fused_expert: + loaded_weight = loaded_weight.transpose(-1, + -2) # no bias + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + success_w1 = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight[0], + "w1", num_experts) + success_w3 = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight[1], + "w3", num_experts) + success = success_w1 and success_w3 + else: + # down_proj + success = self.load_fused_expert_weights( + name_mapped, params_dict, loaded_weight, + shard_id, num_experts) + else: + if is_pp_missing_parameter(name_mapped, self): + continue + # Skip loading extra parameters for GPTQ/modelopt models + if name_mapped.endswith( + ignore_suffixes + ) and name_mapped not in params_dict: + continue + param = params_dict[name_mapped] + # We should ask the weight loader to return success or + # not here since otherwise we may skip experts with + # other available replicas. + weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith( + ignore_suffixes) and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). 
kv-scale is not loaded.", # noqa: E501 + name, + remapped_kv_scale_name, + ) + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3MoeForCausalLM, self).__init__() + self.config = vllm_config.model_config.hf_config.text_config + self.quant_config = vllm_config.quant_config + self.model = Qwen3MoeLLMModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=self.quant_config) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(self.config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + +@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, + info=Qwen3VLMoeProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder) +class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3VLForConditionalGeneration, self).__init__() + config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, + "language_model")) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + self.use_deepstack = hasattr(config.vision_config, + 'deepstack_visual_indexes') + self.deepstack_num_level = len( + config.vision_config.deepstack_visual_indexes + ) if self.use_deepstack else 0 + # register buffer for deepstack + self.deepstack_input_embeds = [ + torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens, + config.text_config.hidden_size) + for _ in range(self.deepstack_num_level) + ] if self.use_deepstack else None diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 38f3d5c69b9ef..707b57106e6d9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -259,11 +259,13 @@ _MULTIMODAL_MODELS = { "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 - "UltravoxModel": ("ultravox", "UltravoxModel"), + "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501 + "Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501 "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "Step3VLForConditionalGeneration": ("step3_vl", 
"Step3VLForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 + "UltravoxModel": ("ultravox", "UltravoxModel"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 # [Encoder-decoder] "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index fb2dcac49ee93..6981f2ce56239 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -156,7 +156,7 @@ class OpenCVVideoBackend(VideoLoader): # can cause incorrect timestamp calculation without num_frames=-1. metadata = { "total_num_frames": num_frames, - "fps": original_fps, + "fps": num_frames / duration, "duration": duration, "video_backend": "opencv", "frames_indices": list(range(num_frames)), From ca2d1925ef5ad309061c2d5dd9a1e409c5ca28ee Mon Sep 17 00:00:00 2001 From: haoyangli-amd <lihaoyang0109@gmail.com> Date: Wed, 17 Sep 2025 13:15:13 +0800 Subject: [PATCH 346/584] [Rocm] [quantization] Fix quark ptpc moe and add test case (#24649) Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com> Co-authored-by: Haoyang Li <haoyang.li@amd.com> --- tests/quantization/test_quark.py | 25 ++ .../layers/quantization/quark/quark_moe.py | 223 ++++++++++++++---- 2 files changed, 196 insertions(+), 52 deletions(-) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 4a0c8ba4d8a95..c09931971e6fb 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -77,6 +77,31 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp): assert output +@pytest.mark.parametrize('tp', [1]) +def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp): + model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts" + with vllm_runner(model_path, tensor_parallel_size=tp) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert qkv_proj.weight.dtype is current_platform.fp8_dtype() + assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[ + 1] + assert qkv_proj.weight_scale.shape[1] == 1 + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output + + @pytest.mark.parametrize('tp', [1]) def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp): model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test" diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 6cff9f3019d34..bc8ae980429a3 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -5,17 +5,25 @@ from typing import Any, Callable, Optional, Union import torch +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + is_rocm_aiter_moe_enabled) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 
import ( + prepare_moe_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( OCP_MX_BLOCK_SIZE) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -67,21 +75,45 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): self.weight_quant = weight_config self.input_quant = input_config - weight_qscheme = self.weight_quant.get("qscheme") - input_qscheme = self.input_quant.get("qscheme") - if not (weight_qscheme == "per_tensor" - and input_qscheme == "per_tensor"): + self.weight_qscheme = self.weight_quant.get("qscheme") + self.input_qscheme = self.input_quant.get("qscheme") + per_tensor = (self.weight_qscheme == "per_tensor" + and self.input_qscheme == "per_tensor") + per_channel = (self.weight_qscheme == "per_channel" + and self.input_qscheme == "per_channel") + self.act_quant_group_shape = GroupShape.PER_TOKEN \ + if per_channel else GroupShape.PER_TENSOR + if not (per_tensor or per_channel): raise ValueError( - "For FP8 Fused MoE layers, only per-tensor scales " - "for weights and activations are supported. Found " - f"{weight_qscheme}, {input_qscheme}") # noqa E501 + "For FP8 Fused MoE layers, only per-tensor and per-channel " + "scales for weights and activations are supported. Found " + f"{self.weight_qscheme}, {self.input_qscheme}") # noqa E501 self.static_input_scales = not self.input_quant.get("is_dynamic") + if self.static_input_scales and per_channel: + raise ValueError( + "For FP8 Fused MoE layer, we require either per tensor or " + "channelwise, dynamic per token quantization.") + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) + # Disable marlin for rocm + if current_platform.is_rocm(): + self.use_marlin = False + + self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None params_dtype = torch.float8_e4m3fn # WEIGHTS @@ -104,24 +136,39 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): set_weight_attrs(w2_weight, extra_weight_attrs) # WEIGHT_SCALES - # Allocate 2 scales for w1 and w3 respectively. - # They will be combined to a single scale after weight loading. 
- w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, - 2, - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - - w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add the quantization method used (per tensor/grouped/channel) - # to ensure the weight scales are loaded in properly - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) + if self.weight_qscheme == "per_tensor": + # Allocate 2 scales for w1 and w3 respectively. + # They are combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, 2, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-TENSOR quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + elif self.weight_qscheme == "per_channel": + # quark's scale is 1 dim. + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, hidden_size, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) # INPUT_SCALES if self.static_input_scales: @@ -185,24 +232,60 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): layer.w2_input_scale = torch.nn.Parameter(w2_input_scale, requires_grad=False) - # Fp8 moe kernel needs single weight scale for w13 per expert. - # We take the max then dequant and requant each expert. - assert layer.w13_weight_scale is not None - shard_size = layer.intermediate_size_per_partition - max_w13_scales = layer.w13_weight_scale.max(dim=1).values - for expert_id in range(layer.local_num_experts): - start = 0 - for shard_id in range(2): - dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][start:start + shard_size, :], - layer.w13_weight_scale[expert_id][shard_id]) - layer.w13_weight[expert_id][ - start:start + shard_size, :], _ = ops.scaled_fp8_quant( - dq_weight, max_w13_scales[expert_id]) - start += shard_size + # For per-tensor case, Fp8 moe kernel needs single weight scale + # for w13 per expert. Use max then dequant and requant each expert. 
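+        # (The two scales per expert come from quantizing w1 and w3
+        # independently; dequantizing each shard and requantizing against
+        # the shared max lets the fused w13 tensor carry one scale per
+        # expert.)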
+ if self.weight_qscheme == "per_tensor": + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start:start + + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id]) + layer.w13_weight[expert_id][ + start:start + shard_size, :], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id]) + start += shard_size - layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, - requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, + requires_grad=False) + # quark's scale is 1 dim. + elif self.weight_qscheme == "per_channel": + if self.act_quant_group_shape == GroupShape.PER_TOKEN: + w13_weight_scale = layer.w13_weight_scale.unsqueeze(-1) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False) + w2_weight_scale = layer.w2_weight_scale.unsqueeze(-1) + layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, + requires_grad=False) + # Property to determine if AITER is used + if self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa E501 + rocm_aiter_fused_experts, shuffle_weights) + + # reshaping weights is required for aiter moe kernel. + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) + + self.rocm_aiter_fused_experts_func = rocm_aiter_fused_experts + elif self.use_marlin: + + prepare_moe_fp8_layer_for_marlin(layer, False) + # Activations not quantized for marlin. 
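+            # (fused_marlin_moe runs weight-only FP8, so any static input
+            # scales created earlier would never be read; delete them to
+            # avoid carrying stale activation scales on the layer.)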
+ del layer.w13_input_scale + del layer.w2_input_scale + self.fused_experts_func = None + else: + from vllm.model_executor.layers.fused_moe import fused_experts + self.fused_experts_func = fused_experts def apply( self, @@ -233,8 +316,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): raise NotImplementedError( "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet.") - from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -249,22 +330,60 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, + if self.rocm_aiter_moe_enabled: + return self.rocm_aiter_fused_experts_func( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=True, + per_channel_quant=self.weight_qscheme == "per_channel", + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + expert_map=expert_map) + if self.use_marlin: + assert activation == "silu", ( + f"{activation} not supported for Marlin MoE.") + return torch.ops.vllm.fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map) + + assert self.fused_experts_func is not None + + return self.fused_experts_func( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - use_fp8_w8a8=True, - global_num_experts=global_num_experts, + activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=True, + per_channel_quant=self.weight_qscheme == "per_channel", + global_num_experts=global_num_experts, expert_map=expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - activation=activation) + a2_scale=layer.w2_input_scale) class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): From 43a62c51be1254a9b2923c7f477af0cfc52d1937 Mon Sep 17 00:00:00 2001 From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Date: Wed, 17 Sep 2025 08:53:17 +0300 Subject: [PATCH 347/584] Add more documentation and improve usability of lognormal dist (benchmark_serving_multi_turn) (#23255) Signed-off-by: daniels <daniels@pliops.com> --- benchmarks/multi_turn/README.md | 101 +++++++++++++++++ benchmarks/multi_turn/bench_dataset.py | 105 +++++++++++++++++- .../multi_turn/generate_multi_turn.json | 5 +- 3 files changed, 203 insertions(+), 8 deletions(-) diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index 7adf97bcf5622..f5b5c6c97d484 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 ---------------------------------------------------------------------------------------------------- ``` +### JSON configuration file for synthetic conversations generation + +The input flag `--input-file` is used to 
determine the input conversations for the benchmark.<br/>
+When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
+
+The file `generate_multi_turn.json` is an example file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
+The final value will always be rounded to an even number so each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
+This can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+    "distribution": "constant",
+    "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+    "distribution": "uniform",
+    "min": 12,
+    "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive), must be equal to or larger than `min`.
+
+#### lognormal
+
+```json
+{
+    "distribution": "lognormal",
+    "average": 1000,
+    "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
+#### zipf
+
+```json
+{
+    "distribution": "zipf",
+    "alpha": 1.2,
+    "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+    "distribution": "poisson",
+    "alpha": 10,
+    "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
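+
+As a quick sanity check, the sketch below (assuming only `numpy` is
+installed) mirrors the `average` / `median_ratio` derivation used by the
+benchmark's lognormal distribution:
+
+```python
+import numpy as np
+
+average, median_ratio = 1000, 0.85
+median = average * median_ratio                # 850
+sigma = np.sqrt(2 * np.log(average / median))  # ~0.570
+mu = np.log(median)                            # ~6.745
+
+samples = np.random.lognormal(mean=mu, sigma=sigma, size=100_000)
+samples *= average / samples.mean()  # rescale the empirical mean to the target
+print(round(samples.mean()))         # ~1000
+```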
+ ## ShareGPT Conversations To run with the ShareGPT data, download the following ShareGPT dataset: diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py index 411b89dd23dc6..67b937930d58c 100644 --- a/benchmarks/multi_turn/bench_dataset.py +++ b/benchmarks/multi_turn/bench_dataset.py @@ -99,21 +99,105 @@ class PoissonDistribution(Distribution): class LognormalDistribution(Distribution): def __init__( - self, mean: float, sigma: float, max_val: Optional[int] = None + self, + mean: Optional[float] = None, + sigma: Optional[float] = None, + average: Optional[int] = None, + median_ratio: Optional[float] = None, + max_val: Optional[int] = None, ) -> None: + self.average = average + self.median_ratio = median_ratio + self.max_val = max_val + + if average is not None: + if average < 1: + raise ValueError("Lognormal average must be positive") + + if mean or sigma: + raise ValueError( + "When using lognormal average, you can't provide mean/sigma" + ) + + if self.median_ratio is None: + # Default value that provides relatively wide range of values + self.median_ratio = 0.85 + + # Calculate mean/sigma of np.random.lognormal based on the average + mean, sigma = self._generate_lognormal_by_median( + target_average=self.average, median_ratio=self.median_ratio + ) + else: + if mean is None or sigma is None: + raise ValueError( + "Must provide both mean and sigma if average is not used" + ) + + if mean <= 0 or sigma < 0: + raise ValueError( + "Lognormal mean must be positive and sigma must be non-negative" + ) + + # Mean and standard deviation of the underlying normal distribution + # Based on numpy.random.lognormal self.mean = mean self.sigma = sigma - self.max_val = max_val + + @staticmethod + def _generate_lognormal_by_median( + target_average: int, median_ratio: float + ) -> tuple[float, float]: + """ + Compute (mu, sigma) for a lognormal distribution given: + - a target average (mean of the distribution) + - a ratio of median / mean (controls skewness), assume mean > median + + Background: + If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma). + * mean(X) = exp(mu + sigma^2 / 2) + * median(X) = exp(mu) + + So: + median / mean = exp(mu) / exp(mu + sigma^2 / 2) + = exp(-sigma^2 / 2) + + Rearranging: + sigma^2 = 2 * ln(mean / median) + mu = ln(median) + + This gives a unique (mu, sigma) for any valid mean and median. 
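+
+        Worked example (illustrative numbers, not from the benchmark):
+        target_average = 1000 with median_ratio = 0.5 gives median = 500,
+        so sigma = sqrt(2 * ln(1000 / 500)) ~= 1.177 and
+        mu = ln(500) ~= 6.215.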
+ """ + # Check input validity: median must be smaller than mean + if median_ratio <= 0 or median_ratio >= 1: + raise ValueError("median_ratio must be in range (0, 1)") + + target_median = target_average * median_ratio + + # Solve sigma^2 = 2 * ln(mean / median) + sigma = np.sqrt(2 * np.log(target_average / target_median)) + mu = np.log(target_median) + + return mu, sigma def sample(self, size: int = 1) -> np.ndarray: samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) + + if self.average is not None: + # Scale to average + samples *= self.average / samples.mean() + if self.max_val: samples = np.minimum(samples, self.max_val) return np.round(samples).astype(int) def __repr__(self) -> str: - return f"LognormalDistribution[{self.mean}, {self.sigma}]" + if self.average: + return ( + f"LognormalDistribution[{self.average}, " + f"{self.median_ratio}, {self.max_val}]" + ) + return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]" class GenConvArgs(NamedTuple): @@ -173,10 +257,21 @@ def get_random_distribution( return PoissonDistribution(conf["alpha"], max_val=max_val) elif distribution == "lognormal": + max_val = conf.get("max", None) + + if "average" in conf: + # Infer lognormal mean/sigma (numpy) from input average + median_ratio = conf.get("median_ratio", None) + return LognormalDistribution( + average=conf["average"], median_ratio=median_ratio, max_val=max_val + ) + + # Use mean/sigma directly (for full control over the distribution) verify_field_exists(conf, "mean", section, subsection) verify_field_exists(conf, "sigma", section, subsection) - max_val = conf.get("max", None) - return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val) + return LognormalDistribution( + mean=conf["mean"], sigma=conf["sigma"], max_val=max_val + ) elif distribution == "uniform": verify_field_exists(conf, "min", section, subsection) diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json index 274d03c2bdb2b..03cfc7d63e8aa 100644 --- a/benchmarks/multi_turn/generate_multi_turn.json +++ b/benchmarks/multi_turn/generate_multi_turn.json @@ -15,9 +15,8 @@ }, "prefix_num_tokens": { "distribution": "lognormal", - "mean": 6, - "sigma": 4, - "max": 1500 + "average": 1000, + "max": 5000 }, "num_tokens": { "distribution": "uniform", From dd39baf7175c5e79faef071c67bb318eadb7752f Mon Sep 17 00:00:00 2001 From: Kunshang Ji <kunshang.ji@intel.com> Date: Wed, 17 Sep 2025 14:45:25 +0800 Subject: [PATCH 348/584] [XPU] Fix xpu model runner call torch.cuda APIs (#25011) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/v1/worker/xpu_model_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index fb892211f19db..7becdd392498f 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -45,8 +45,12 @@ def _torch_cuda_wrapper(): self.synchronize = lambda: None try: - # replace cuda Event with xpu Event, this should work by default + # replace cuda APIs with xpu APIs, this should work by default torch.cuda.Event = torch.xpu.Event + torch.cuda.Stream = torch.xpu.Stream + torch.cuda.default_stream = torch.xpu.current_stream + torch.cuda.current_stream = torch.xpu.current_stream + torch.cuda.stream = torch.xpu.stream yield finally: # if anything goes wrong, just patch it with a placeholder From b77bf34e531abb32c054a38747fa817d08395ae7 Mon Sep 17 00:00:00 2001 From: rouchenzi 
<40842833+rouchenzi@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:27:34 -0700 Subject: [PATCH 349/584] [EPLB] Support EPLB for Mixtral Model (#22842) Signed-off-by: rouchenzi <ruochenwen@gmail.com> Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com> Co-authored-by: Bowen Wang <abmfy@icloud.com> --- vllm/model_executor/models/mixtral.py | 160 ++++++++++++++++++++++---- 1 file changed, 137 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 52fcbbfc58be6..b02030b6d6272 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from itertools import islice from typing import Optional, Union @@ -33,8 +34,9 @@ from transformers import MixtralConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -50,8 +52,8 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, is_pp_missing_parameter, +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -74,10 +76,32 @@ class MixtralMoE(nn.Module): quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, dp_size: Optional[int] = None, - prefix: str = ""): + prefix: str = "", + enable_eplb: bool = False): super().__init__() self.hidden_size = hidden_size + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + + # Expert Parallelism Load balancing settings. + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + self.enable_eplb = enable_eplb + + self.n_routed_experts = num_experts + self.n_logical_experts = num_experts + self.n_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts) + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + # Gate always runs at half / full precision for now. 
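+        # (ReplicatedLinear keeps the full routing weight on every rank and
+        # is not quantized, so routing logits stay in model precision.)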
self.gate = ReplicatedLinear(hidden_size, @@ -97,7 +121,9 @@ class MixtralMoE(nn.Module): quant_config=quant_config, tp_size=tp_size, dp_size=dp_size, - prefix=f"{prefix}.experts") + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. @@ -200,6 +226,7 @@ class MixtralDecoderLayer(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -221,7 +248,8 @@ class MixtralDecoderLayer(nn.Module): hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, quant_config=quant_config, - prefix=f"{prefix}.block_sparse_moe") + prefix=f"{prefix}.block_sparse_moe", + enable_eplb=enable_eplb) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -262,6 +290,7 @@ class MixtralModel(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.config = config self.quant_config = quant_config @@ -276,10 +305,18 @@ class MixtralModel(nn.Module): org_num_embeddings=config.vocab_size, ) + self.enable_eplb = parallel_config.enable_eplb + self.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts) + self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + cache_config, + quant_config=quant_config, + prefix=prefix, + enable_eplb=self.enable_eplb, ), prefix=f"{prefix}.layers") @@ -325,7 +362,8 @@ class MixtralModel(nn.Module): ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts) + num_experts=self.config.num_local_experts, + num_redundant_experts=self.num_redundant_experts) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -373,26 +411,40 @@ class MixtralModel(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: continue - name = name.replace(weight_name, param_name) + + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + if is_pp_missing_parameter(name_mapped, self): continue - if ((name.endswith(".bias") or name.endswith("_bias")) - and name not in params_dict): + + if ((name_mapped.endswith(".bias") + or name_mapped.endswith("_bias")) + and name_mapped not in params_dict): continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - break + + param = params_dict[name_mapped] + weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break else: + if is_expert_weight: + continue # Skip loading extra bias for GPTQ models. 
if ((name.endswith(".bias") or name.endswith("_bias")) and name not in params_dict): @@ -413,7 +465,8 @@ class MixtralModel(nn.Module): return loaded_params -class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, + MixtureOfExperts): fall_back_to_pt_during_load = False packed_modules_mapping = { @@ -462,6 +515,67 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + self.expert_weights = [] + self.moe_layers: list[FusedMoE] = [] + example_moe = None + + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MixtralDecoderLayer) + if hasattr(layer, "block_sparse_moe") and isinstance( + layer.block_sparse_moe, MixtralMoE): + example_moe = layer.block_sparse_moe + self.moe_layers.append(layer.block_sparse_moe.experts) + + self.num_moe_layers = len(self.moe_layers) + + if example_moe is None: + raise RuntimeError("No MixtralMoE layer found in model.layers.") + + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + self.num_expert_groups = 1 + self.num_shared_experts = 0 + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if hasattr(layer, "block_sparse_moe") and isinstance( + layer.block_sparse_moe, MixtralMoE): + moe = layer.block_sparse_moe + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 03191cd8f0ffa0f37629518b19d9155260fd2483 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Wed, 17 Sep 2025 08:57:34 +0100 Subject: [PATCH 350/584] [Core][MultiModalHasher] Hash images without converting image mode (#24969) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/multimodal/hasher.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 0fb1363ce471a..df6c531d876ad 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -12,7 +12,6 @@ from blake3 import blake3 from PIL import Image from vllm.logger import init_logger -from vllm.multimodal.image import 
convert_image_mode logger = init_logger(__name__) @@ -35,8 +34,12 @@ class MultiModalHasher: exif[Image.ExifTags.Base.ImageID], uuid.UUID): # If the image has exif ImageID tag, use that return (exif[Image.ExifTags.Base.ImageID].bytes, ) - return cls.iter_item_to_bytes( - "image", np.asarray(convert_image_mode(obj, "RGBA"))) + data = {"mode": obj.mode, "data": np.asarray(obj)} + if obj.palette is not None: + data["palette"] = obj.palette.palette + if obj.palette.rawmode is not None: + data["palette_rawmode"] = obj.palette.rawmode + return cls.iter_item_to_bytes("image", data) if isinstance(obj, torch.Tensor): tensor_obj: torch.Tensor = obj.cpu() tensor_dtype = tensor_obj.dtype From 4a9375fe9dbde2e88fde268fab40f5d3a2b6a8ff Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Wed, 17 Sep 2025 16:01:27 +0800 Subject: [PATCH 351/584] [Model] Pass param prefix to LLMHead (#24862) Signed-off-by: whx-sjtu <2952154980@qq.com> --- vllm/model_executor/models/arctic.py | 1 + vllm/model_executor/models/aria.py | 1 + vllm/model_executor/models/baichuan.py | 6 ++++-- vllm/model_executor/models/bamba.py | 1 + vllm/model_executor/models/bloom.py | 4 +++- vllm/model_executor/models/chameleon.py | 1 + vllm/model_executor/models/dbrx.py | 1 + vllm/model_executor/models/deepseek.py | 9 ++++++--- vllm/model_executor/models/deepseek_eagle.py | 3 ++- vllm/model_executor/models/deepseek_v2.py | 9 ++++++--- vllm/model_executor/models/dots1.py | 4 +++- vllm/model_executor/models/ernie45_moe.py | 4 +++- vllm/model_executor/models/ernie45_vl_moe.py | 4 +++- vllm/model_executor/models/ernie_mtp.py | 3 ++- vllm/model_executor/models/exaone.py | 1 + vllm/model_executor/models/exaone4.py | 1 + vllm/model_executor/models/falcon.py | 1 + vllm/model_executor/models/falcon_h1.py | 1 + vllm/model_executor/models/glm4_moe.py | 4 +++- vllm/model_executor/models/gpt_bigcode.py | 3 ++- vllm/model_executor/models/gpt_j.py | 1 + vllm/model_executor/models/gpt_oss.py | 1 + vllm/model_executor/models/granite.py | 1 + vllm/model_executor/models/granitemoe.py | 1 + vllm/model_executor/models/hunyuan_v1.py | 3 ++- vllm/model_executor/models/idefics3.py | 1 + vllm/model_executor/models/jais.py | 4 +++- vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/kimi_vl.py | 1 + vllm/model_executor/models/llama_eagle3.py | 2 +- vllm/model_executor/models/mamba.py | 1 + vllm/model_executor/models/mamba2.py | 1 + vllm/model_executor/models/medusa.py | 3 +++ vllm/model_executor/models/mimo_mtp.py | 3 ++- vllm/model_executor/models/minicpm.py | 1 + vllm/model_executor/models/minicpm_eagle.py | 1 + vllm/model_executor/models/minimax_text_01.py | 1 + vllm/model_executor/models/mixtral.py | 1 + vllm/model_executor/models/molmo.py | 1 + vllm/model_executor/models/nemotron.py | 1 + vllm/model_executor/models/nemotron_h.py | 1 + vllm/model_executor/models/olmo.py | 1 + vllm/model_executor/models/olmoe.py | 3 ++- vllm/model_executor/models/opt.py | 4 +++- vllm/model_executor/models/orion.py | 3 ++- vllm/model_executor/models/persimmon.py | 3 ++- vllm/model_executor/models/phi.py | 3 ++- vllm/model_executor/models/phi4flash.py | 1 + vllm/model_executor/models/phi4mm.py | 1 + vllm/model_executor/models/phimoe.py | 1 + vllm/model_executor/models/qwen.py | 3 ++- vllm/model_executor/models/qwen2_moe.py | 3 ++- vllm/model_executor/models/qwen3_moe.py | 3 ++- vllm/model_executor/models/qwen3_next.py | 2 +- vllm/model_executor/models/qwen3_next_mtp.py | 3 ++- vllm/model_executor/models/solar.py | 1 + 
vllm/model_executor/models/step3_text.py | 4 +++- vllm/model_executor/models/zamba2.py | 1 + 58 files changed, 102 insertions(+), 31 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index c566611266af7..b6dd559968415 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -427,6 +427,7 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): self.vocab_size, config.hidden_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index db262447d7fa8..a7cb6b35a4ab4 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -539,6 +539,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): config.text_config.hidden_size, org_num_embeddings=self.language_model.org_vocab_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 4563c356666ac..ae25033410407 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -51,7 +51,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -394,7 +395,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, position_embedding=position_embedding) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) self.lm_head.weight.weight_loader = self.lm_head_weight_loader if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index a72bbdebe5317..397089f31cdf6 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -514,6 +514,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) # Used to track and store by the Mamba cache between steps. 
self.mamba_cache: Optional[MambaCacheManager] = None diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index f8ed92314c3d2..4c37622b049c8 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -330,7 +330,9 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant): self.lm_head = self.transformer.word_embeddings else: self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + prefix=maybe_prefix( + prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 28a1a66c23291..7a56236483749 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -960,6 +960,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, self.lm_head = ParallelLMHead( self.unpadded_vocab_size, config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 519cd522213b2..003cf4563a22f 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -438,6 +438,7 @@ class DbrxForCausalLM(nn.Module, SupportsPP): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 3f9349d766df6..4395b11b7d0f0 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -453,9 +453,12 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.quant_config = quant_config self.model = DeepseekModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 5e8447a7f48f9..b1d7f24c2f18b 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -199,7 +199,8 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) logit_scale = getattr(self.config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.config.vocab_size, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e4a21febc5bde..636554bd648f2 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -823,9 +823,12 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, self.model = DeepseekV2Model(vllm_config=vllm_config, 
prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 4ddf906dddefe..20555e48b73d4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -504,7 +504,9 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 33ec27fc630e0..ebab018ed67e7 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -562,7 +562,9 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 6034505fa7d68..7f791852ceb91 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -557,7 +557,9 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index 90a1267b28f0a..57c5348874378 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -158,7 +158,8 @@ class ErnieMTP(nn.Module, SupportsPP): prefix=maybe_prefix( prefix, "model")) self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head")) self.sampler = get_sampler() if self.config.tie_word_embeddings: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 942db0143a457..f503fb0f9364a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -502,6 +502,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index e94c43a47f76a..9f7d57d938140 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -485,6 +485,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not 
lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index a9fe0924babd8..42c378e5c389a 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -473,6 +473,7 @@ class FalconForCausalLM(nn.Module, SupportsPP): config.vocab_size, config.hidden_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 5e2b6d69124c8..757051b3b1447 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -607,6 +607,7 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, # compatibility if not lora_config else lora_config.lora_vocab_padding_size), + prefix=maybe_prefix(prefix, "lm_head"), ) self.lm_head_multiplier = config.lm_head_multiplier if self.tie_word_embeddings: diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1fb4576092892..e7d967edaf246 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -608,7 +608,9 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index d5c2604145eed..745d0b7759991 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -302,7 +302,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.lm_head = ParallelLMHead( self.transformer.vocab_size, self.transformer.embed_dim, - org_num_embeddings=self.config.vocab_size) + org_num_embeddings=self.config.vocab_size, + prefix=maybe_prefix(prefix, "lm_head")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 584c7f5d8a2d1..77df6ae6f30c8 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -306,6 +306,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP): config.n_embd, bias=True, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index e0b4df7728757..990a1d6d883a1 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -655,6 +655,7 @@ class GptOssForCausalLM(nn.Module, SupportsPP): self.lm_head = ParallelLMHead( self.config.vocab_size, self.config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(self.config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/granite.py 
b/vllm/model_executor/models/granite.py index f8ba0229210a9..4f9cc2532bd8c 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -434,6 +434,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 07ad75bcf1665..da16c72000c0e 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -487,6 +487,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index a74a44bc2b511..db054b5c537e8 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -58,7 +58,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, - make_layers) + make_layers, maybe_prefix) def _is_moe(config: PretrainedConfig) -> bool: @@ -871,6 +871,7 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 63307470d959b..9153a0e2c1e5a 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -606,6 +606,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, config.text_config.vocab_size, config.text_config.hidden_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.text_config.tie_word_embeddings: self.lm_head.weight = self.model.text_model.wte.weight diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 91a06dd502474..4fee8c32fd581 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -302,7 +302,9 @@ class JAISLMHeadModel(nn.Module, SupportsPP): self.lm_head = self.transformer.wte else: self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + prefix=maybe_prefix( + prefix, "lm_head")) if hasattr(config, "width_scale"): self.output_logits_scale = config.width_scale else: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 550fde17b6c53..5b8fbc7226866 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -502,6 +502,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) # Used to track and store by the Mamba cache between steps. 
self.mamba_cache: Optional[MambaCacheManager] = None diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 4f76d4afdb20e..94a5933a61416 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -328,6 +328,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, config.text_config.hidden_size, org_num_embeddings=self.config.text_config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=maybe_prefix(prefix, "lm_head"), ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 99b77729b5018..7027138dfcb17 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -220,7 +220,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): self.config.hidden_size, org_num_embeddings=self.config.draft_vocab_size, padding_size=(DEFAULT_VOCAB_PADDING_SIZE), - prefix="") + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(self.config.draft_vocab_size, scale=logit_scale) self.draft_id_to_target_id = nn.Parameter( diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index f02499a4f96b5..9d1017dac8aa1 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -223,6 +223,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) # Used to track and store by the Mamba cache between steps. diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 81b9a125380aa..b1a4138cb8f6c 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -278,6 +278,7 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.backbone.embeddings) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 709a5a993c6f7..6ba8ad372c95a 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -15,6 +15,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from .utils import maybe_prefix + class ResidualBlock(nn.Module): @@ -71,6 +73,7 @@ class Medusa(nn.Module): config.hidden_size, org_num_embeddings=self.truncated_vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=maybe_prefix(prefix, "lm_head"), ) self.lm_heads = [ self.lm_head for _ in range(self.config.num_heads) diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 5a2079bf5121a..ac835edc001ea 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -158,7 +158,8 @@ class MiMoMTP(nn.Module): prefix=maybe_prefix( prefix, "model")) self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head")) def forward( 
self, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 5632f8c8cc4fb..c7be7f76dba15 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -547,6 +547,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 06c2eb4e80afb..848a97b8bb2a0 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -338,6 +338,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index ef1fe86c5b5c0..6ce883be0a83c 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -702,6 +702,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): self.config.hidden_size, org_num_embeddings=self.config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index b02030b6d6272..8b3474d809532 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -507,6 +507,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5d999a02b4e65..2475fe1316097 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1403,6 +1403,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, config.embedding_size or config.vocab_size, config.hidden_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.embedding_size diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 10adc62d3de38..21f785e4b91af 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -466,6 +466,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index da8628df1fe57..1e1f0524bd063 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -565,6 +565,7 @@ class 
NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) # Used to track and store by the Mamba cache between steps. self.mamba_cache: Optional[MambaCacheManager] = None diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 71575989565a8..7be3c16528b52 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -364,6 +364,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): config.hidden_size, org_num_embeddings=config.vocab_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 9b8525bfadece..892e967e4a21f 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -450,7 +450,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index b92e586f0bf21..365aab205b211 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -375,7 +375,9 @@ class OPTForCausalLM(nn.Module, SupportsPP): self.lm_head = self.model.decoder.embed_tokens else: self.lm_head = ParallelLMHead(config.vocab_size, - config.word_embed_proj_dim) + config.word_embed_proj_dim, + prefix=maybe_prefix( + prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index add751ebf09cc..944a9151d75d3 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -314,7 +314,8 @@ class OrionForCausalLM(nn.Module, SupportsPP): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 6bdd38d068800..3e854e4d561ff 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -307,7 +307,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - bias=False) + bias=False, + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 789b24eb0f6be..6f39afbecf35b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -322,7 
+322,8 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index fcdfcb7bc1603..c4548ee168bd7 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -630,6 +630,7 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): # compatibility if not lora_config else lora_config.lora_vocab_padding_size), quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) self.embedding_bias = None # Used to track and store by the Mamba cache between steps. diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 46963828186cc..b3fc55dab6eca 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -989,6 +989,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 15ae081a9f5fc..01d16f1f2c387 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -645,6 +645,7 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if not lora_config else lora_config.lora_vocab_padding_size), quant_config=None, bias=True, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index e32dc51f00c09..7470948499005 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -271,7 +271,8 @@ class QWenBaseModel(nn.Module): prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5551ad8c32329..5e6dea67c9404 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -519,7 +519,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 0a504d90cde1f..f66e8b0b454bf 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -605,7 +605,8 @@ class 
Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86e26da5b9b86..3c5407916c0bd 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1089,7 +1089,7 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, # We need bigger padding if using lora for kernel # compatibility if not lora_config else lora_config.lora_vocab_padding_size, - ) + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index e7aff377e9aec..190a1750e673a 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -238,7 +238,8 @@ class Qwen3NextMTP(nn.Module, SupportsPP): self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE) + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=maybe_prefix(prefix, "lm_head")) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 8dd52f1d204a5..94c862258b7ad 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -469,6 +469,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # compatibility if not lora_config else lora_config.lora_vocab_padding_size, quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) if config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 97611d3e140ec..b8733fa5e6129 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -35,7 +35,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -386,6 +387,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE if not lora_config else lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 86335d48c1454..e601bc3adb6e9 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -941,6 +941,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): # We need bigger padding if using lora for kernel # compatibility if not lora_config else 
lora_config.lora_vocab_padding_size, + prefix=maybe_prefix(prefix, "lm_head"), ) # Tie weights with input embeddings if using same dimensions self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) From c15309a730fae1c2ad8670f14e8971a9b8accdcf Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Wed, 17 Sep 2025 16:02:31 +0800 Subject: [PATCH 352/584] [Model] Apply SharedFusedMoE to glm4_moe. (#24849) Signed-off-by: whx-sjtu <2952154980@qq.com> --- vllm/model_executor/models/glm4_moe.py | 85 +++++++++++++++++--------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index e7d967edaf246..1acbd18091fb3 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -46,6 +46,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -146,25 +147,6 @@ class Glm4MoE(nn.Module): self.physical_expert_end = (self.physical_expert_start + self.n_local_physical_experts) - self.experts = FusedMoE( - num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts", - scoring_func="sigmoid", - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, - e_score_correction_bias=self.gate.e_score_correction_bias, - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts) - if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -173,25 +155,68 @@ class Glm4MoE(nn.Module): intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=self.experts.must_reduce_shared_expert_outputs( - ), + reduce_results=False, prefix=f"{prefix}.shared_experts", ) + self.experts = SharedFusedMoE( + shared_experts=self.shared_experts, + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func="sigmoid", + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + ) + else: + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + 
quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func="sigmoid", + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - if self.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) - else: - shared_output = None + # router_logits: (num_tokens, n_experts) router_logits = self.gate(hidden_states.to(dtype=torch.float32)) - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits) * self.routed_scaling_factor - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output + + fused_moe_out = self.experts(hidden_states=hidden_states, + router_logits=router_logits) + + if self.shared_experts is not None: + shared_output, final_hidden_states = fused_moe_out + assert shared_output is not None + final_hidden_states = \ + final_hidden_states * self.routed_scaling_factor\ + + shared_output + else: + final_hidden_states = fused_moe_out * self.routed_scaling_factor + if self.tp_size > 1: final_hidden_states = ( self.experts.maybe_all_reduce_tensor_model_parallel( From 6c47f6bfa4794178035d9d941d7d40c1d71473b7 Mon Sep 17 00:00:00 2001 From: Zhuohan Li <zhuohan123@gmail.com> Date: Wed, 17 Sep 2025 01:42:59 -0700 Subject: [PATCH 353/584] [Core] Remove tokenizer group in vLLM (#24078) Signed-off-by: Zhuohan Li <zhuohan123@gmail.com> --- tests/detokenizer/test_stop_checker.py | 8 +- tests/engine/test_stop_checker.py | 7 +- tests/entrypoints/conftest.py | 22 --- tests/entrypoints/openai/test_chat.py | 9 +- tests/entrypoints/openai/test_completion.py | 23 +-- .../test_completion_with_prompt_embeds.py | 5 +- .../entrypoints/openai/test_lora_adapters.py | 10 +- tests/entrypoints/openai/test_models.py | 2 - tests/entrypoints/openai/test_tokenization.py | 28 +-- .../tool_parsers/test_hermes_tool_parser.py | 2 + tests/entrypoints/test_chat_utils.py | 56 +----- tests/lora/test_llama_tp.py | 28 +-- tests/lora/test_lora_allowed_token_ids.py | 135 -------------- tests/lora/test_quant_model.py | 29 +-- tests/lora/test_tokenizer_group.py | 72 -------- tests/test_cache_block_hashing.py | 11 +- tests/tokenization/test_detokenize.py | 16 +- tests/tokenization/test_tokenizer_group.py | 27 --- tests/tokenization/test_tokenizer_registry.py | 4 + tests/v1/engine/conftest.py | 8 +- tests/v1/engine/test_output_processor.py | 10 +- tests/v1/engine/utils.py | 6 +- .../llm/test_struct_output_generate.py | 2 +- vllm/benchmarks/datasets.py | 173 ++++++++---------- vllm/engine/async_llm_engine.py | 15 +- vllm/engine/llm_engine.py | 57 ++---- vllm/engine/output_processor/interfaces.py | 6 +- vllm/engine/output_processor/stop_checker.py | 5 +- vllm/engine/protocol.py | 10 +- vllm/entrypoints/llm.py | 20 +- vllm/entrypoints/openai/serving_chat.py | 2 +- .../openai/serving_classification.py | 5 +- vllm/entrypoints/openai/serving_completion.py | 3 +- vllm/entrypoints/openai/serving_embedding.py | 7 +- vllm/entrypoints/openai/serving_pooling.py | 3 +- vllm/entrypoints/openai/serving_responses.py | 2 +- vllm/entrypoints/openai/serving_score.py | 2 +- .../openai/serving_tokenization.py | 4 +- 
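
Stepping back to the Glm4MoE.forward rewrite in the patch above: when the layer is built with shared experts, SharedFusedMoE returns a (shared_output, routed_output) pair and the caller scales only the routed half before summing, whereas the plain FusedMoE path returns a single tensor. A minimal sketch of that combination logic, with illustrative tensor shapes:

    import torch

    def combine_moe_output(fused_moe_out, routed_scaling_factor: float,
                           has_shared_experts: bool) -> torch.Tensor:
        # Mirrors the new Glm4MoE.forward: scale only the routed output,
        # then add the unscaled shared-expert output.
        if has_shared_experts:
            shared_output, routed_output = fused_moe_out
            return routed_output * routed_scaling_factor + shared_output
        return fused_moe_out * routed_scaling_factor

    hidden = torch.randn(4, 64)
    out = combine_moe_output((hidden, hidden), 1.0, has_shared_experts=True)

Keeping reduce_results=False on the shared experts is what makes this correct under tensor parallelism: both halves stay partial sums until the single all-reduce that follows the addition.
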
vllm/inputs/preprocess.py | 84 ++------- vllm/transformers_utils/detokenizer.py | 23 +-- vllm/transformers_utils/tokenizer.py | 33 ++-- vllm/transformers_utils/tokenizer_base.py | 5 + vllm/transformers_utils/tokenizer_group.py | 132 ------------- vllm/transformers_utils/tokenizers/mistral.py | 4 + vllm/v1/engine/async_llm.py | 15 +- vllm/v1/engine/llm_engine.py | 10 +- vllm/v1/engine/output_processor.py | 8 +- vllm/v1/engine/processor.py | 55 +++--- vllm/v1/structured_output/__init__.py | 7 +- 49 files changed, 276 insertions(+), 934 deletions(-) delete mode 100644 tests/lora/test_lora_allowed_token_ids.py delete mode 100644 tests/lora/test_tokenizer_group.py delete mode 100644 tests/tokenization/test_tokenizer_group.py delete mode 100644 vllm/transformers_utils/tokenizer_group.py diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py index bd221977224f9..2ca10c072b342 100644 --- a/tests/detokenizer/test_stop_checker.py +++ b/tests/detokenizer/test_stop_checker.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import MagicMock - import pytest -from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker from vllm.inputs import token_inputs @@ -54,10 +51,7 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - When the EOS token should be ignored, and the sequence continues """ - tokenizer = MagicMock(spec=PreTrainedTokenizer) - get_tokenizer_for_seq = MagicMock(return_value=tokenizer) - stop_checker = StopChecker(max_model_len=1024, - get_tokenizer_for_seq=get_tokenizer_for_seq) + stop_checker = StopChecker(max_model_len=1024) seq = sequence_with_eos( text=text_wo_eos, diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py index 3d1e1c8032a48..34f4cb13ab0a5 100644 --- a/tests/engine/test_stop_checker.py +++ b/tests/engine/test_stop_checker.py @@ -58,16 +58,13 @@ def deepseek_r1_qwen_tokenizer(): @pytest.fixture def stop_checker(): - return StopChecker(max_model_len=10, - get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer) + return StopChecker(max_model_len=10) @pytest.fixture def stop_checker_with_reasoner(): reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) - return StopChecker(max_model_len=10, - get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer, - reasoner=reasoner) + return StopChecker(max_model_len=10, reasoner=reasoner) def test_eos_token_stopping(stop_checker): diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 48fd848e88200..c23eeee271869 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -208,25 +208,3 @@ def zephyr_lora_files(): """Download zephyr LoRA files once per test session.""" from huggingface_hub import snapshot_download return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora") - - -@pytest.fixture(scope="session") -def zephyr_lora_added_tokens_files(zephyr_lora_files): - """Create zephyr LoRA files with added tokens once per test session.""" - import shutil - from tempfile import TemporaryDirectory - - from transformers import AutoTokenizer - - tmp_dir = TemporaryDirectory() - tmp_model_dir = f"{tmp_dir.name}/zephyr" - shutil.copytree(zephyr_lora_files, tmp_model_dir) - tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") - # Copy tokenizer to adapter and add some unique tokens - # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", 
"vllm2", "vllm3"], - special_tokens=True) - assert added == 3 - tokenizer.save_pretrained(tmp_model_dir) - yield tmp_model_dir - tmp_dir.cleanup() diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 4608850c7dae2..d5924b7b3ae34 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -29,11 +29,7 @@ def monkeypatch_module(): @pytest.fixture(scope="module", params=[False, True]) -def server( - request, - monkeypatch_module, - zephyr_lora_files, #noqa: F811 - zephyr_lora_added_tokens_files): # noqa: F811 +def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 use_v1 = request.param monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') @@ -49,7 +45,6 @@ def server( "--enable-lora", "--lora-modules", f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_added_tokens_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -79,7 +74,7 @@ async def client(server): @pytest.mark.parametrize( # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], + [MODEL_NAME, "zephyr-lora"], ) async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): messages = [{ diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index d55f8d9d65d9b..3650b15792575 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -27,7 +27,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] @pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files): +def default_server_args(zephyr_lora_files): return [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -41,7 +41,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files): "--enable-lora", "--lora-modules", f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_added_tokens_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -87,7 +86,7 @@ async def client(server): @pytest.mark.parametrize( # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], + [MODEL_NAME, "zephyr-lora"], ) async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, @@ -115,20 +114,6 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): assert completion.choices[0].prompt_logprobs is None -@pytest.mark.asyncio -async def test_added_lora_tokens(client: openai.AsyncOpenAI): - # test using token IDs - completion = await client.completions.create( - model="zephyr-lora2", - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should appear in tokenized prompt - assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3") - - @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): # test using token IDs @@ -147,7 +132,7 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): @pytest.mark.parametrize( # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], + [MODEL_NAME, "zephyr-lora"], ) async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -713,7 +698,7 @@ async def test_guided_grammar(client: openai.AsyncOpenAI, @pytest.mark.parametrize( # 
first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], + [MODEL_NAME, "zephyr-lora"], ) @pytest.mark.parametrize("logprobs_arg", [1, 0]) async def test_echo_logprob_completion(client: openai.AsyncOpenAI, diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index a0ef31762ea15..dbfb1b024f7c2 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -21,10 +21,7 @@ CONFIG = AutoConfig.from_pretrained(MODEL_NAME) @pytest.fixture(scope="module") -def default_server_args( - zephyr_lora_files, - zephyr_lora_added_tokens_files, -) -> list[str]: +def default_server_args() -> list[str]: return [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index f91dcf194b839..10c0cb5f4d151 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -67,12 +67,6 @@ def server_with_lora_modules_json(request, monkeypatch_module, "base_model_name": MODEL_NAME } - lora_module_2 = { - "name": "zephyr-lora2", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -84,7 +78,6 @@ def server_with_lora_modules_json(request, monkeypatch_module, "--enable-lora", "--lora-modules", json.dumps(lora_module_1), - json.dumps(lora_module_2), "--max-lora-rank", "64", "--max-cpu-loras", @@ -121,7 +114,6 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, for lora_model in lora_models) assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" @pytest.mark.asyncio @@ -209,7 +201,7 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, @pytest.mark.asyncio async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): - """Validate that many loras can be dynamically registered and inferenced + """Validate that many loras can be dynamically registered and inferenced with concurrently""" # This test file configures the server with --max-cpu-loras=2 and this test diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 7cd3ca196a431..4ee34b19dea32 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -26,7 +26,6 @@ def server(zephyr_lora_files): "--enable-lora", "--lora-modules", f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -56,4 +55,3 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files): assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 72c8a3510c9b0..ecb7f50fa7400 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -14,7 +14,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @pytest.fixture(scope="module") -def server(zephyr_lora_added_tokens_files: str): # noqa: F811 +def 
server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -24,12 +24,6 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811 "--enforce-eager", "--max-num-seqs", "128", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora2={zephyr_lora_added_tokens_files}", - "--max-lora-rank", - "64", "--enable-tokenizer-info-endpoint", ] @@ -38,10 +32,8 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811 @pytest.fixture(scope="module") -def tokenizer_name(model_name: str, - zephyr_lora_added_tokens_files: str): # noqa: F811 - return zephyr_lora_added_tokens_files if ( - model_name == "zephyr-lora2") else model_name +def tokenizer_name(model_name: str): + return model_name @pytest_asyncio.fixture @@ -53,7 +45,7 @@ async def client(server): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_tokenize_completions( @@ -86,7 +78,7 @@ async def test_tokenize_completions( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_tokenize_chat( @@ -148,7 +140,7 @@ async def test_tokenize_chat( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_tokenize_chat_with_tools( @@ -225,7 +217,7 @@ async def test_tokenize_chat_with_tools( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name, tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_tokenize_with_return_token_strs( @@ -260,7 +252,7 @@ async def test_tokenize_with_return_token_strs( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_detokenize( @@ -287,7 +279,7 @@ async def test_detokenize( @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", - [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + [(MODEL_NAME, MODEL_NAME)], indirect=["tokenizer_name"], ) async def test_tokenizer_info_basic( @@ -384,4 +376,4 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer): if chat_template: assert isinstance(chat_template, str), ("Chat template should be a string") - assert chat_template.strip(), "Chat template should not be empty" \ No newline at end of file + assert chat_template.strip(), "Chat template should not be empty" diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index 28b1f8358d80b..4bab849f47c27 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -18,6 +18,8 @@ SERVER_ARGS = [ "--enable-lora", "--lora-modules", f"{LORA_MODEL}={LORA_MODEL}", + "--tokenizer", + f"{LORA_MODEL}", ] TOOLS = [{ diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 84dab737ece26..78370d199b566 100644 --- a/tests/entrypoints/test_chat_utils.py +++ 
b/tests/entrypoints/test_chat_utils.py @@ -23,7 +23,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) -from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from ..models.registry import HF_EXAMPLE_MODELS @@ -69,12 +69,7 @@ def phi3v_model_config_mm_interleaved(): @pytest.fixture(scope="module") def phi3v_tokenizer(): - return TokenizerGroup( - tokenizer_id=PHI3V_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - ) + return get_tokenizer(PHI3V_MODEL_ID) @pytest.fixture(scope="function") @@ -91,12 +86,7 @@ def qwen2_audio_model_config(): @pytest.fixture(scope="module") def qwen2_audio_tokenizer(): - return TokenizerGroup( - tokenizer_id=QWEN2AUDIO_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - ) + return get_tokenizer(QWEN2AUDIO_MODEL_ID) @pytest.fixture(scope="function") @@ -115,12 +105,7 @@ def qwen25omni_model_config_mm_interleaved(): @pytest.fixture(scope="module") def qwen25omni_tokenizer(): - return TokenizerGroup( - tokenizer_id=QWEN25OMNI_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - ) + return get_tokenizer(QWEN25OMNI_MODEL_ID) @pytest.fixture(scope="function") @@ -136,12 +121,7 @@ def mistral_model_config(): @pytest.fixture(scope="module") def mistral_tokenizer(): - return TokenizerGroup( - tokenizer_id=MISTRAL_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - ) + return get_tokenizer(MISTRAL_MODEL_ID) @pytest.fixture(scope="module") @@ -2250,15 +2230,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): enforce_eager=model_info.enforce_eager, dtype=model_info.dtype) - # Build the tokenizer group and grab the underlying tokenizer - tokenizer_group = TokenizerGroup( + # Build the tokenizer + tokenizer = get_tokenizer( model, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, trust_remote_code=model_config.trust_remote_code, ) - tokenizer = tokenizer_group.tokenizer tools = ([{ "type": "function", @@ -2307,14 +2283,10 @@ def test_resolve_content_format_hf_defined(model, expected_format): enforce_eager=model_info.enforce_eager, dtype=model_info.dtype) - tokenizer_group = TokenizerGroup( + tokenizer = get_tokenizer( model, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, trust_remote_code=model_config.trust_remote_code, ) - tokenizer = tokenizer_group.tokenizer # Test detecting the tokenizer's chat_template chat_template = resolve_hf_chat_template( @@ -2368,14 +2340,10 @@ def test_resolve_content_format_fallbacks(model, expected_format): enforce_eager=model_info.enforce_eager, dtype=model_info.dtype) - tokenizer_group = TokenizerGroup( + tokenizer = get_tokenizer( model_config.tokenizer, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, trust_remote_code=model_config.trust_remote_code, ) - tokenizer = tokenizer_group.tokenizer # Test detecting the tokenizer's chat_template chat_template = resolve_hf_chat_template( @@ -2432,14 +2400,10 @@ def test_resolve_content_format_examples(template_path, expected_format): trust_remote_code=True, ) - tokenizer_group = TokenizerGroup( + dummy_tokenizer = get_tokenizer( PHI3V_MODEL_ID, # Dummy - enable_lora=False, - max_num_seqs=5, - 
max_input_length=None, trust_remote_code=model_config.trust_remote_code, ) - dummy_tokenizer = tokenizer_group.tokenizer dummy_tokenizer.chat_template = None chat_template = load_chat_template(EXAMPLES_DIR / template_path) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 06196cc697cec..a6770e6d32af8 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -13,14 +13,6 @@ from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test MODEL_PATH = "meta-llama/Llama-2-7b-hf" -EXPECTED_NO_LORA_OUTPUT = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 -] EXPECTED_LORA_OUTPUT = [ " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 @@ -79,23 +71,12 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None): print("lora adapter created") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 1") assert do_sample(llm, sql_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=1) == EXPECTED_LORA_OUTPUT - print("no lora") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 2") assert do_sample(llm, sql_lora_files, @@ -110,6 +91,7 @@ def test_llama_lora(sql_lora_files): llm = vllm.LLM( MODEL_PATH, + tokenizer=sql_lora_files, enable_lora=True, # also test odd max_num_seqs max_num_seqs=13, @@ -123,6 +105,7 @@ def test_llama_lora_tp4(sql_lora_files): llm = vllm.LLM( MODEL_PATH, + tokenizer=sql_lora_files, enable_lora=True, max_num_seqs=16, max_loras=4, @@ -137,6 +120,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): llm = vllm.LLM( MODEL_PATH, + tokenizer=sql_lora_files, enable_lora=True, max_num_seqs=16, max_loras=4, @@ -184,6 +168,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) loaded_llm = LLM(model=model_ref, + tokenizer=sql_lora_files, load_format="tensorizer", enable_lora=True, enforce_eager=True, @@ -195,11 +180,6 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_llm, - sql_lora_files, - tensorizer_config_dict=tc_as_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 1") assert do_sample(loaded_llm, sql_lora_files, diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py deleted file mode 100644 index be6409000ae77..0000000000000 --- a/tests/lora/test_lora_allowed_token_ids.py +++ /dev/null @@ -1,135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig -from vllm.config.lora import LoRAConfig -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from 
vllm.v1.engine.processor import Processor - - -def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, - sql_lora_files): - """ - Test that we properly resolve the range of allowed token ids for lora - adapters that define additional tokens. - """ - - # Set up a base model compatible with the sql_lora_files adapter and - # a known number of tokens in the base model. - model_config = ModelConfig( - model=llama_2_7b_base_huggingface_id, - tokenizer=llama_2_7b_base_huggingface_id, - tokenizer_mode="auto", - ) - - vllm_config = VllmConfig( - model_config=model_config, - cache_config=CacheConfig(), - device_config=DeviceConfig(), - lora_config=LoRAConfig(), - ) - - tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) - processor = Processor(vllm_config, tokenizer) - - lora_request = LoRARequest("1", 1, str(sql_lora_files)) - request_id = "1" - prompt = "a prompt" - - # tokens added in the lora adapter should not raise an error - lora_token_ids = [32000, 32001, 32002, 32003] - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=lora_token_ids), - lora_request=lora_request) - - # tokens in the base model should not raise an error - base_token_ids = [1000, 1001, 1002, 1003] - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=base_token_ids), - lora_request=lora_request) - - # tokens not in the lora adapter should raise an error - invalid_token_ids = [35000, 35001, 35002, 35003] - with pytest.raises(ValueError): - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=invalid_token_ids), - lora_request=lora_request) - - # tokens in the lora adapter with no lora request should raise an error - with pytest.raises(ValueError): - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=lora_token_ids), - ) - - -def test_allowed_token_ids_with_lora_adapter_no_vocab( - qwen25vl_base_huggingface_id, qwen25vl_lora_files): - """ - Test that we properly resolve the range of allowed token ids for lora - adapters that do not define additional tokens. - """ - - # Set up a base model compatible with the qwen25vl_lora_files adapter and - # a known number of tokens in the base model. 
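
The deleted tests above pinned down how allowed_token_ids interacted with LoRA-extended vocabularies; with per-adapter tokenizers removed, the valid range is bounded by the single base tokenizer. A hypothetical sketch of the invariant they exercised (the function and its arguments are illustrative, not vLLM API):

    def validate_allowed_token_ids(allowed_ids: list[int],
                                   vocab_size: int) -> None:
        # Reject ids outside the tokenizer's vocabulary, the same failure
        # mode the deleted tests expected a ValueError for.
        bad = [t for t in allowed_ids if t < 0 or t >= vocab_size]
        if bad:
            raise ValueError(f"allowed_token_ids out of vocabulary: {bad}")

    validate_allowed_token_ids([1000, 1001, 1002, 1003], vocab_size=32000)
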
- model_config = ModelConfig( - model=qwen25vl_base_huggingface_id, - tokenizer=qwen25vl_base_huggingface_id, - tokenizer_mode="auto", - ) - - vllm_config = VllmConfig( - model_config=model_config, - cache_config=CacheConfig(), - device_config=DeviceConfig(), - lora_config=LoRAConfig(), - ) - - tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) - processor = Processor(vllm_config, tokenizer) - - lora_request = LoRARequest("1", 1, str(qwen25vl_lora_files)) - request_id = "1" - prompt = "a prompt" - - # tokens in the base model should not raise an error - base_token_ids = [1000, 1001, 1002, 1003] - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=base_token_ids), - lora_request=lora_request) - - # tokens in the base model with no lora request should not raise an error - base_token_ids = [1000, 1001, 1002, 1003] - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=base_token_ids), - ) - - # tokens not in the base model should raise an error - invalid_token_ids = [200000, 200001, 200002, 200003] - with pytest.raises(ValueError): - processor.process_inputs( - request_id, - prompt, - params=SamplingParams(allowed_token_ids=invalid_token_ids), - lora_request=lora_request) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index caa31fdb0e73e..2b54b2edd6a9c 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -82,31 +82,20 @@ def test_quant_model_lora(tinyllama_lora_files, model): gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + tokenizer=tinyllama_lora_files) if model.quantization is None: - expected_no_lora_output = [ - "Here are some examples of orange-brown colors", - "I'm sorry, I don't have" - ] expected_lora_output = [ "#ff8050", "#ff8080", ] elif model.quantization == "awq": - expected_no_lora_output = [ - "I'm sorry, I don't understand", - "I'm sorry, I don't understand", - ] expected_lora_output = [ "#f07700: A v", "#f00000: A v", ] elif model.quantization == "gptq": - expected_no_lora_output = [ - "I'm sorry, I don't have", - "I'm sorry, I don't have", - ] expected_lora_output = [ "#f08800: This is", "#f07788 \n#", @@ -117,7 +106,6 @@ def test_quant_model_lora(tinyllama_lora_files, model): # Assert that the outputs changed. 
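
The LoRA test changes above share one adjustment: since the engine no longer swaps in a per-adapter tokenizer, tests whose adapters add vocabulary now point the engine's single tokenizer at the adapter directory. A minimal sketch of the pattern, with the fixture path stubbed out:

    import vllm

    sql_lora_files = "/path/to/sql-lora-adapter"  # a pytest fixture in the real tests

    llm = vllm.LLM(
        "meta-llama/Llama-2-7b-hf",  # base model weights
        tokenizer=sql_lora_files,    # adapter directory supplies the tokenizer,
        enable_lora=True,            # including its LoRA-added tokens
        max_loras=4,
    )
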
if (model.quantization == "gptq" and expected_output is expected_lora_output): - assert output != expected_no_lora_output for i, o in enumerate(output): assert o.startswith( '#'), f"Expected example {i} to start with # but got {o}" @@ -127,12 +115,6 @@ def test_quant_model_lora(tinyllama_lora_files, model): max_tokens = 10 print("lora adapter created") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=0, - max_tokens=max_tokens) - expect_match(output, expected_no_lora_output) - print("lora 1") output = do_sample(llm, tinyllama_lora_files, @@ -140,13 +122,6 @@ def test_quant_model_lora(tinyllama_lora_files, model): max_tokens=max_tokens) expect_match(output, expected_lora_output) - print("no lora") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=0, - max_tokens=max_tokens) - expect_match(output, expected_no_lora_output) - print("lora 2") output = do_sample(llm, tinyllama_lora_files, diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py deleted file mode 100644 index 6cfdaf50d33c4..0000000000000 --- a/tests/lora/test_tokenizer_group.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import get_lora_tokenizer -from vllm.transformers_utils.tokenizer_group import TokenizerGroup - - -@pytest.mark.asyncio -@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) -async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): - reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) - tokenizer_group = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=True, - max_num_seqs=1, - max_loras=1, - max_input_length=None, - ) - lora_request = LoRARequest("1", 1, sql_lora_files) - assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( - prompt="prompt", lora_request=lora_request) - assert reference_tokenizer.encode( - "prompt") == await tokenizer_group.encode_async( - prompt="prompt", lora_request=lora_request) - assert isinstance(tokenizer_group.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer_group.get_lora_tokenizer( - None) == await tokenizer_group.get_lora_tokenizer_async(None) - - assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), - PreTrainedTokenizerBase) - assert tokenizer_group.get_lora_tokenizer( - lora_request) != tokenizer_group.get_lora_tokenizer(None) - assert tokenizer_group.get_lora_tokenizer( - lora_request) == await tokenizer_group.get_lora_tokenizer_async( - lora_request) - - -def test_get_lora_tokenizer(sql_lora_files, tmp_path): - lora_request = None - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer - - lora_request = LoRARequest("1", 1, sql_lora_files) - tokenizer = get_lora_tokenizer(lora_request) - assert tokenizer.get_added_vocab() - - lora_request = LoRARequest("1", 1, str(tmp_path)) - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer - - -@pytest.mark.parametrize("enable_lora", [True, False]) -@pytest.mark.parametrize("max_num_seqs", [1, 2]) -@pytest.mark.parametrize("max_loras", [1, 2]) -def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): - tokenizer_group = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=enable_lora, - max_num_seqs=max_num_seqs, - max_loras=max_loras, - max_input_length=None, - ) - if enable_lora: - assert 
tokenizer_group.lora_tokenizers.capacity == max( - max_num_seqs, max_loras) - else: - assert tokenizer_group.lora_tokenizers.capacity == 0 diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index edc0849dff33f..1dba0fd0fb3d3 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -11,7 +11,7 @@ import pytest from vllm.inputs import token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Sequence -from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.transformers_utils.tokenizer import get_tokenizer # Make two prefixes with different first blocks. prefix_start = [("You are an expert"), ("You are a")] @@ -47,12 +47,7 @@ def flatten_2d(li): def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, concurrent_lora_int_ids: list[Optional[int]]): - tokenizer = TokenizerGroup( - tokenizer_id="facebook/opt-125m", - enable_lora=False, - max_num_seqs=max_num_seqs, - max_input_length=None, - ) + tokenizer = get_tokenizer("facebook/opt-125m") hashes: list[list[list[int]]] = [] @@ -76,7 +71,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, inputs=token_inputs(prompt_token_ids, prompt=prompt), block_size=block_size, - eos_token_id=tokenizer.tokenizer.eos_token_id, + eos_token_id=tokenizer.eos_token_id, lora_request=lora_request) num_blocks = len(prompt_token_ids) // block_size diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 527aad97d4fae..15ea55afe963b 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -11,7 +11,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, from vllm.inputs import token_inputs from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer, @@ -221,17 +221,14 @@ def test_oov_decode(tokenizer, fast): @pytest.fixture def detokenizer(tokenizer_name: str) -> Detokenizer: - tokenizer_group = TokenizerGroup( - tokenizer_id=tokenizer_name, - enable_lora=False, - max_num_seqs=100, - max_input_length=None, + tokenizer = get_tokenizer( + tokenizer_name, tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto", trust_remote_code=False, revision=None, ) - return Detokenizer(tokenizer_group) + return Detokenizer(tokenizer) @pytest.fixture(name="complete_sequence_token_ids") @@ -312,8 +309,7 @@ def test_decode_prompt_logprobs(complete_sequence: str, # don't support that. if complete_sequence not in SPECIAL_TOKS_TRUTH: skip_special_tokens = True - elif not isinstance(detokenizer.tokenizer_group.get_lora_tokenizer(None), - MistralTokenizer): + elif not isinstance(detokenizer.tokenizer, MistralTokenizer): skip_special_tokens = False else: pytest.skip("MistralTokenizers don't support " @@ -339,7 +335,7 @@ def test_decode_prompt_logprobs(complete_sequence: str, # decoded_prompt_logprobs doesn't contain the first token. 
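
All of the tokenization test changes above follow the same mechanical migration: instead of constructing a TokenizerGroup and reaching into it, callers obtain the tokenizer directly. Condensed before/after, with the arguments as they appear in the removed tests:

    from vllm.transformers_utils.tokenizer import get_tokenizer

    # Before (removed in this patch):
    #     group = TokenizerGroup(tokenizer_id="gpt2", enable_lora=False,
    #                            max_num_seqs=1, max_input_length=None)
    #     tokenizer = group.tokenizer
    tokenizer = get_tokenizer("gpt2")
    ids = tokenizer.encode("prompt")  # same ids group.encode("prompt") returned
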
token_ids = complete_sequence_token_ids - tokenizer = detokenizer.get_tokenizer_for_seq(seq) + tokenizer = detokenizer.tokenizer text_full = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) text_first = tokenizer.decode(token_ids[0], diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py deleted file mode 100644 index 0570c1525e111..0000000000000 --- a/tests/tokenization/test_tokenizer_group.py +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.transformers_utils.tokenizer_group import TokenizerGroup - - -@pytest.mark.asyncio -async def test_tokenizer_group(): - reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer_group = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - ) - assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( - prompt="prompt", lora_request=None) - assert reference_tokenizer.encode( - "prompt") == await tokenizer_group.encode_async(prompt="prompt", - lora_request=None) - assert isinstance(tokenizer_group.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer_group.get_lora_tokenizer( - None) == await tokenizer_group.get_lora_tokenizer_async(None) diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index 5abb101644086..68d4b416b4c99 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -57,6 +57,10 @@ class TestTokenizer(TokenizerBase): def max_token_id(self) -> int: raise NotImplementedError() + @property + def truncation_side(self) -> str: + raise NotImplementedError() + def __call__( self, text: Union[str, list[str], list[int]], diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index d7722142b207f..a73a9a6999f72 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -12,7 +12,6 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, generate_dummy_prompt_logprobs_tensors, generate_dummy_sample_logprobs) from vllm.engine.arg_utils import EngineArgs -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from ...distributed.conftest import publisher_config, random_port # noqa: F401 @@ -24,7 +23,7 @@ EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor] def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: """Generate output processor dummy test vectors, without logprobs - + Returns: DummyOutputProcessorTestVectors instance with no logprobs """ @@ -48,9 +47,6 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: ] return DummyOutputProcessorTestVectors( tokenizer=tokenizer, - tokenizer_group=init_tokenizer_from_configs( - vllm_config.model_config, vllm_config.scheduler_config, - vllm_config.lora_config), vllm_config=vllm_config, full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], prompt_tokens=prompt_tokens, @@ -68,7 +64,7 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: @pytest.fixture def dummy_test_vectors() -> DummyOutputProcessorTestVectors: """Generate output processor dummy test vectors, with logprobs - + Returns: DummyOutputProcessorTestVectors instance with logprobs """ diff --git a/tests/v1/engine/test_output_processor.py 
b/tests/v1/engine/test_output_processor.py index 6544e8b017e70..a9632ce54eac8 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -43,7 +43,7 @@ def _ref_convert_id_to_token( [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) def test_incremental_detokenization(request_output_kind: RequestOutputKind, dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) engine_core = MockEngineCore( tokens_list=dummy_test_vectors.generation_tokens) @@ -382,7 +382,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, num_sample_logprobs: Optional[int], num_prompt_logprobs: Optional[int], dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) engine_core = MockEngineCore( tokens_list=dummy_test_vectors.generation_tokens, @@ -535,7 +535,7 @@ def test_stop_token(include_stop_str_in_output: bool, ) # '<|end_of_text|>' stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>' - output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) # Dummy engine core outputs, with control tokens suffixed to test stops suffix_token = ([eos_token_id] if is_eos_test else stop_token_ids) @@ -642,7 +642,7 @@ def test_stop_token(include_stop_str_in_output: bool, [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) def test_stop_string(include_stop_str_in_output: bool, num_sample_logprobs: Optional[int], dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) engine_core = MockEngineCore( tokens_list=dummy_test_vectors.generation_tokens, @@ -763,7 +763,7 @@ def test_stop_string(include_stop_str_in_output: bool, def test_iteration_stats(dummy_test_vectors): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) engine_core_timestamp = time.monotonic() diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index b58bc75fc9565..689b2c95f927e 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -9,7 +9,6 @@ import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.engine.arg_utils import EngineArgs -from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreOutput, FinishReason from vllm.v1.outputs import LogprobsLists, LogprobsTensors @@ -39,7 +38,7 @@ def _create_random_top_logprob_test_vector( upper: float, ) -> torch.Tensor: """Create a random vector of top logprob float values. - + Use to create fake sample logprobs for testing. Note that a real production scenario would require @@ -63,7 +62,7 @@ def _create_random_top_logprob_test_matrix( upper: float, ) -> torch.Tensor: """Create a random matrix of top logprob float values. - + Use to create fake prompt logprobs for testing. 
Note that a real production scenario would require @@ -296,7 +295,6 @@ def generate_dummy_prompt_logprobs_tensors( class DummyOutputProcessorTestVectors: """Dummy test vectors for output processor tests""" tokenizer: GeneralTokenizerType - tokenizer_group: TokenizerGroup vllm_config: EngineArgs full_tokens: list[list[int]] # Prompt + generated tokens prompt_tokens: list[list[int]] diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 126d8ce8c8e00..ad62914195b44 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -582,7 +582,7 @@ def test_structured_output_with_reasoning_matrices( reasoning_parser=reasoning_parser, speculative_config=speculative_config, ) - tokenizer = llm.get_tokenizer(None) + tokenizer = llm.get_tokenizer() reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)( tokenizer=tokenizer) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 8d11b19066bba..a38090edb0b42 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -37,7 +37,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import PlaceholderModule try: @@ -100,8 +100,8 @@ class BenchmarkDataset(ABC): ) -> None: """ Initialize the BenchmarkDataset with an optional dataset path and random - seed. - + seed. + Args: dataset_path (Optional[str]): Path to the dataset. If None, it indicates that a default or random dataset might be used. @@ -133,10 +133,10 @@ class BenchmarkDataset(ABC): elif isinstance(mm_content, dict): content.append(mm_content) else: - raise TypeError( + raise TypeError( "Could not process multimodal content of type: " + - f"{type(mm_content)}" - ) + f"{type(mm_content)}" + ) return [{"role": "user", "content": content}] def load_data(self) -> None: @@ -155,34 +155,26 @@ class BenchmarkDataset(ABC): def get_random_lora_request( self, - tokenizer: PreTrainedTokenizerBase, max_loras: Optional[int] = None, lora_path: Optional[str] = None, - ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + ) -> Optional[LoRARequest]: """ - Optionally select a random LoRA request and return its associated - tokenizer. + Optionally select a random LoRA request. This method is used when LoRA parameters are provided. It randomly - selects a LoRA based on max_loras and retrieves a cached tokenizer for - that LoRA if available. Otherwise, it returns the base tokenizer. + selects a LoRA based on max_loras. Args: - tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of LoRAs available. If `None`, LoRA is not used. lora_path (Optional[str]): Path to the LoRA parameters on disk. If `None`, LoRA is not used. Returns: - A tuple with the following elements: - - A new [LoRARequest][] (or `None` if not applicable). - - The tokenizer associated with the LoRA request - (or the base tokenizer). + A new [LoRARequest][] (or `None` if not applicable). """ if max_loras is None or lora_path is None: - return None, tokenizer + return None # Generate a random LoRA ID in the range [1, max_loras]. 
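# (Editor's sketch, not part of this patch.) get_random_lora_request now
# returns only Optional[LoRARequest]; callers keep using the base tokenizer
# instead of unpacking a (request, tokenizer) tuple, as the ShareGPT and
# BurstGPT call sites later in this diff show:
#
#     lora_request = self.get_random_lora_request(max_loras=4,
#                                                 lora_path="/path/to/adapters")
#     prompt_ids = tokenizer(prompt).input_ids   # always the base tokenizer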
lora_id = random.randint(1, max_loras) @@ -191,11 +183,7 @@ class BenchmarkDataset(ABC): lora_int_id=lora_id, lora_path=lora_path_on_disk(lora_path), ) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - # Return lora_request and the cached tokenizer if available; otherwise, - # return the base tokenizer - return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + return lora_request @abstractmethod def sample(self, tokenizer: PreTrainedTokenizerBase, @@ -213,7 +201,7 @@ class BenchmarkDataset(ABC): for processing the dataset's text. num_requests (int): The number of sample requests to generate. request_id_prefix (str) The prefix of request_id. - + Returns: list[SampleRequest]: A list of sample requests generated from the @@ -527,7 +515,7 @@ class RandomDataset(BenchmarkDataset): size=num_requests) output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests) - offsets = self._rng.integers(0, tokenizer.vocab_size, + offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests) return input_lens, output_lens, offsets @@ -555,7 +543,7 @@ class RandomDataset(BenchmarkDataset): the encoded sequence is truncated before being decoded again. """ # Build the inner sequence by sampling sequentially from the vocab - inner_seq = ((offset + index + np.arange(input_len)) + inner_seq = ((offset + index + np.arange(input_len)) % vocab_size).tolist() token_sequence = prefix_token_ids + inner_seq @@ -590,9 +578,9 @@ class RandomMultiModalDataset(RandomDataset): `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0. The maximum is further clamped to the sum of per-modality limits. 2) Each item’s modality and shape is sampled from `bucket_config`, a dict - mapping (height, width, num_frames) → probability. We treat - `num_frames`=1 as image and and `num_frames` > 1 as video. - Entries with zero probability are removed and the rest are renormalized + mapping (height, width, num_frames) → probability. We treat + `num_frames`=1 as image and `num_frames` > 1 as video. + Entries with zero probability are removed and the rest are renormalized to sum to 1. 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`. When a modality reaches its cap, all of its buckets are excluded and the @@ -600,8 +588,8 @@ Example bucket configuration: {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1} - - Two image buckets (`num_frames`=1) and one video bucket - (`num_frames`=16). + - Two image buckets (`num_frames`=1) and one video bucket + (`num_frames`=16). OBS.: Only image sampling is supported for now. """ @@ -624,9 +612,9 @@ class RandomMultiModalDataset(RandomDataset): def generate_synthetic_image(self, width: int, height: int) -> Image.Image: """Generate synthetic PIL image with random RGB values. - - NOTE: iid pixel sampling results in worst-case compression - (good for stressing I/O), but very unlike real photos. + + NOTE: iid pixel sampling results in worst-case compression + (good for stressing I/O), but very unlike real photos. We could consider a “low-freq” mode (e.g., noise blur) to emulate network realism instead of max stress. """ @@ -638,11 +626,11 @@ class RandomMultiModalDataset(RandomDataset): ) return Image.fromarray(random_pixels) - def generate_synthetic_video(self, width: int, - height: int, + def generate_synthetic_video(self, width: int, + height: int, num_frames: int) -> Any: """Generate synthetic video with random values. 
- + TODO: Finish this method. """ raise NotImplementedError("Video sampling is WIP.") @@ -656,7 +644,7 @@ class RandomMultiModalDataset(RandomDataset): else: raise ValueError(f"Invalid multimodal item configuration: {config}") - def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], + def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int], float]) -> dict[tuple[int, int, int], float]: """ Remove zero probability entries @@ -676,24 +664,24 @@ class RandomMultiModalDataset(RandomDataset): return {k: v / total for k, v in bucket_config.items()} - def generate_mm_item(self, + def generate_mm_item(self, mm_item_config: tuple[int, int, int], ) -> Mapping[str, Any]: """ - Create synthetic images and videos and + Create synthetic images and videos and apply process_image/process_video respectively. This follows the OpenAI API chat completions https://github.com/openai/openai-python """ - + if self.map_config_to_modality(mm_item_config) == "image": return process_image(self.generate_synthetic_image( mm_item_config[1], mm_item_config[0])) elif self.map_config_to_modality(mm_item_config) == "video": return process_video(self.generate_synthetic_video( - mm_item_config[1], - mm_item_config[0], + mm_item_config[1], + mm_item_config[0], mm_item_config[2])) else: raise ValueError(f"Invalid multimodal item configuration: " @@ -723,17 +711,17 @@ class RandomMultiModalDataset(RandomDataset): f"limit_mm_per_prompt: " f"{limit_mm_per_prompt.keys()}") - # Remove zero probability entries + # Remove zero probability entries # and normalize bucket config to sum to 1 bucket_config = self.normalize_bucket_config(bucket_config) logger.info( "Normalized bucket config: %s", bucket_config, ) # Only consider limit per prompt for modalities in bucket config - allowed_modalities = {self.map_config_to_modality(cfg) + allowed_modalities = {self.map_config_to_modality(cfg) for cfg in bucket_config} limit_mm_per_prompt = { - k: v for k, v in limit_mm_per_prompt.items() + k: v for k, v in limit_mm_per_prompt.items() if k in allowed_modalities} if not limit_mm_per_prompt: raise ValueError("No valid limits for modalities present in " @@ -746,19 +734,19 @@ class RandomMultiModalDataset(RandomDataset): # Get max and min num mm items and ensure # it is at most the sum of limit_mm_per_prompt for all modalities max_num_mm_items = min( - sum(limit_mm_per_prompt.values()), + sum(limit_mm_per_prompt.values()), math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)) ) # Ensure min num mm items is at least 0 min_num_mm_items = max( - 0, + 0, math.floor(base_items_per_request * (1 - num_mm_items_range_ratio)) ) # Raise error if min num mm items is greater than max num mm items if min_num_mm_items > max_num_mm_items: raise ValueError(f"Min num mm items is greater than max mm items: " f"{min_num_mm_items} > {max_num_mm_items}") - + logger.info( "Sampling number of multimodal items from [%s, %s]", min_num_mm_items, max_num_mm_items, @@ -783,8 +771,8 @@ class RandomMultiModalDataset(RandomDataset): whose size is between min_num_mm_items and max_num_mm_items. Loop over the bucket config and sample a multimodal item. - Loop until the number of multimodal items sampled is equal to - request_num_mm_items or limit of multimodal items per prompt + Loop until the number of multimodal items sampled is equal to + request_num_mm_items or limit of multimodal items per prompt for all modalities is reached. 
Note: @@ -796,19 +784,19 @@ class RandomMultiModalDataset(RandomDataset): # Get the number of multimodal items to sample request_num_mm_items = int( self._rng.integers(min_num_mm_items, max_num_mm_items + 1) - ) + ) # If request_num_mm_items is 0, yield an empty iterator if request_num_mm_items == 0: return # Initialize modality counters - modality_counter = {self.map_config_to_modality(k): 0 + modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config} # Copy the bucket config to avoid modifying the original bucket_config_copy = bucket_config.copy() # Loop over the number of multimodal items to sample while sum(modality_counter.values()) < request_num_mm_items: # Sample a multimodal item config - mm_item_config = self._rng.choice(list(bucket_config_copy.keys()), + mm_item_config = self._rng.choice(list(bucket_config_copy.keys()), p=list(bucket_config_copy.values())) modality = self.map_config_to_modality(mm_item_config) # Check that modality count is less than limit per prompt @@ -849,7 +837,7 @@ class RandomMultiModalDataset(RandomDataset): limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT, base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST, num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, - bucket_config: dict[tuple[int, int, int], float] = + bucket_config: dict[tuple[int, int, int], float] = DEFAULT_MM_ITEM_BUCKET_CONFIG, enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, **kwargs, @@ -857,7 +845,7 @@ class RandomMultiModalDataset(RandomDataset): # NOTE: Video sampling is WIP. Raise error if video is in bucket config # and probability is non-zero. - if any(self.map_config_to_modality(cfg) == "video" and p > 0 + if any(self.map_config_to_modality(cfg) == "video" and p > 0 for cfg, p in bucket_config.items()): raise NotImplementedError("Video sampling not implemented; " "set its probability to 0.") @@ -908,7 +896,7 @@ class RandomMultiModalDataset(RandomDataset): ]) if enable_multimodal_chat: - # NOTE: For now this option is only provided for completeness + # NOTE: For now this option is only provided for completeness # given that the serve.py benchmark currently does not use it. 
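# (Editor's sketch, not part of this patch.) The loop above draws each item's
# (height, width, num_frames) config from the renormalized bucket
# probabilities, conceptually:
#
#     buckets = {(256, 256, 1): 0.5, (720, 1280, 1): 0.5}  # two image buckets
#     cfg = rng.choice(list(buckets.keys()), p=list(buckets.values()))
#     # num_frames == 1 maps cfg to "image"; modality_counter enforces the caps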
mm_chat_prompt: Any = prompt mm_chat_prompt = self.apply_multimodal_chat_transformation( @@ -982,8 +970,8 @@ class ShareGPTDataset(BenchmarkDataset): entry["conversations"][1]["value"], ) - lora_request, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + lora_request = self.get_random_lora_request( + max_loras=max_loras, lora_path=lora_path) prompt_ids = tokenizer(prompt).input_ids completion_ids = tokenizer(completion).input_ids prompt_len = len(prompt_ids) @@ -994,11 +982,11 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue - if image_path := entry.get("image"): - mm_content = process_image(image_path) - elif video_path := entry.get("video"): + if image_path := entry.get("image"): + mm_content = process_image(image_path) + elif video_path := entry.get("video"): mm_content = process_video(video_path) - else: + else: mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( @@ -1013,9 +1001,9 @@ class ShareGPTDataset(BenchmarkDataset): request_id=request_id_prefix + str(ind), )) ind += 1 - self.maybe_oversample_requests(samples, - num_requests, - request_id_prefix, + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, no_oversample) return samples @@ -1024,11 +1012,11 @@ class _ValidateDatasetArgs(argparse.Action): """Argparse action to validate dataset name and path compatibility.""" def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) - + # Get current values of both dataset_name and dataset_path dataset_name = getattr(namespace, 'dataset_name', 'random') dataset_path = getattr(namespace, 'dataset_path', None) - + # Validate the combination if dataset_name == "random" and dataset_path is not None: parser.error( @@ -1053,7 +1041,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): default="random", action=_ValidateDatasetArgs, choices=[ - "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", + "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", "custom", "prefix_repetition", "spec_bench" ], help="Name of the dataset to benchmark on.", @@ -1502,7 +1490,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: # For datasets that follow a similar structure, use a mapping. 
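# (Editor's sketch, not part of this patch; exact CLI flag names are assumed.)
# The _ValidateDatasetArgs action above fails fast on incompatible flags:
#
#     parser = FlexibleArgumentParser()
#     add_dataset_parser(parser)
#     # expected to hit parser.error(...), since an explicit dataset path is
#     # meaningless for the synthetic "random" dataset:
#     parser.parse_args(["--dataset-name", "random", "--dataset-path", "x.json"])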
dataset_mapping = { "spec_bench": - lambda: SpecBench(dataset_path=args.dataset_path, + lambda: SpecBench(dataset_path=args.dataset_path, category=args.spec_bench_category).sample( num_requests=args.num_prompts, tokenizer=tokenizer, @@ -1660,7 +1648,7 @@ class CustomDataset(BenchmarkDataset): logger.info("num_requests is set to 0 or negative, " "so using all available samples: %d", num_requests) - + sampled_requests = [] for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: @@ -1686,7 +1674,7 @@ class CustomDataset(BenchmarkDataset): expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -1700,7 +1688,7 @@ class CustomDataset(BenchmarkDataset): class SpecBench(CustomDataset): """ Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench - Download the dataset using: + Download the dataset using: wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl """ # noqa: E501 @@ -1736,8 +1724,8 @@ class SpecBench(CustomDataset): # leverage CustomDataset sample kwargs["skip_chat_template"] = False return super().sample(**kwargs) - - + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- @@ -1882,8 +1870,8 @@ class BurstGPTDataset(BenchmarkDataset): for i in range(num_requests): input_len = int(data[i][2]) output_len = int(data[i][3]) - lora_req, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + lora_req = self.get_random_lora_request( + max_loras=max_loras, lora_path=lora_path) vocab_size = tokenizer.vocab_size # Generate a synthetic prompt: a list of token IDs computed as (i + # j) modulo vocab_size. 
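# (Editor's sketch, not part of this patch.) The synthetic BurstGPT prompt
# described in the comment above is a simple modular ramp over the vocabulary:
vocab_size = 32_000  # assumed here; the dataset reads tokenizer.vocab_size
i, input_len = 3, 8  # request index and its sampled input length
prompt_token_ids = [(i + j) % vocab_size for j in range(input_len)]
assert prompt_token_ids == [3, 4, 5, 6, 7, 8, 9, 10]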
@@ -1995,7 +1983,7 @@ class ConversationDataset(HuggingFaceDataset): request_id=request_id_prefix + str(ind), )) ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2055,7 +2043,7 @@ class VisionArenaDataset(HuggingFaceDataset): multi_modal_data=mm_content, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2172,7 +2160,7 @@ class InstructCoderDataset(HuggingFaceDataset): expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2234,7 +2222,7 @@ class MTBenchDataset(HuggingFaceDataset): expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2288,8 +2276,8 @@ class BlazeditDataset(HuggingFaceDataset): # compare the levenshtein distance normalized by code length if norm_distance < min_distance or norm_distance > max_distance: continue - - # template copied from + + # template copied from # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501 instruction = f"""Given a code file, please apply the change requests and generate the new file. @@ -2322,9 +2310,9 @@ Please generate the new code file in the "New file" section below.""" # noqa: E5 expected_output_len=output_len, request_id=request_id_prefix + str(i), )) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) - + return sampled_requests @@ -2376,7 +2364,6 @@ class AIMODataset(HuggingFaceDataset): expected_output_len=output_len, multi_modal_data=None, request_id=request_id_prefix + str(ind), - )) ind += 1 self.maybe_oversample_requests(sampled_requests, num_requests, @@ -2470,9 +2457,9 @@ class NextEditPredictionDataset(HuggingFaceDataset): )) if len(samples) >= num_requests: break - self.maybe_oversample_requests(samples, - num_requests, - request_id_prefix, + self.maybe_oversample_requests(samples, + num_requests, + request_id_prefix, no_oversample) return samples @@ -2562,7 +2549,7 @@ class ASRDataset(HuggingFaceDataset): " what Whisper supports.", skipped, ) - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2647,7 +2634,7 @@ class MLPerfDataset(HuggingFaceDataset): ) ind += 1 - self.maybe_oversample_requests(sampled_requests, num_requests, + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) return sampled_requests @@ -2658,7 +2645,7 @@ class MLPerfDataset(HuggingFaceDataset): class PrefixRepetitionRandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the repeated prefix + # Default values copied from benchmark_serving.py for the repeated prefix # dataset. 
DEFAULT_PREFIX_LEN = 256 DEFAULT_SUFFIX_LEN = 256 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c53ece18964cb..1ae82c9f6f6f9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -390,11 +390,8 @@ class _AsyncLLMEngine(LLMEngine): """Stop the remote worker execution loop.""" await self.model_executor.stop_remote_worker_execution_loop_async() - async def get_tokenizer_async(self, - lora_request: Optional[LoRARequest] = None - ) -> AnyTokenizer: - return await ( - self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) + async def get_tokenizer_async(self) -> AnyTokenizer: + return self.get_tokenizer() async def add_request_async( self, @@ -435,7 +432,6 @@ class _AsyncLLMEngine(LLMEngine): processed_inputs = await self.input_preprocessor.preprocess_async( prompt, - lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, ) @@ -614,11 +610,8 @@ class AsyncLLMEngine(EngineClient): async def get_input_preprocessor(self) -> InputPreprocessor: return self.engine.input_preprocessor - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return await self.engine.get_tokenizer_async(lora_request) + async def get_tokenizer(self) -> AnyTokenizer: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0fdd651425b90..c35bd20371d0a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -49,9 +49,8 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer) from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import ( - TokenizerGroup, init_tokenizer_from_configs) +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind @@ -186,7 +185,7 @@ class LLMEngine: return outputs_ - tokenizer: Optional[TokenizerGroup] + tokenizer: Optional[AnyTokenizer] def __init__( self, @@ -233,18 +232,9 @@ class LLMEngine: if self.model_config.skip_tokenizer_init: self.tokenizer = None self.detokenizer = None - tokenizer_group = None else: self.tokenizer = self._init_tokenizer() self.detokenizer = Detokenizer(self.tokenizer) - tokenizer_group = self.get_tokenizer_group() - - # Ensure that the function doesn't contain a reference to self, - # to avoid engine GC issues - def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: - assert tokenizer_group, ("tokenizer_group cannot be None, " - "make sure skip_tokenizer_init is False") - return tokenizer_group.get_lora_tokenizer(sequence.lora_request) self.seq_counter = Counter() self.generation_config_fields = ( @@ -389,10 +379,8 @@ class LLMEngine: self.detokenizer, self.scheduler, self.seq_counter, - get_tokenizer_for_seq, stop_checker=StopChecker( self.scheduler_config.max_model_len, - get_tokenizer_for_seq, self.reasoner if self.decoding_config.reasoning_backend and self.tokenizer else None, ), @@ -521,24 +509,15 @@ class LLMEngine: if model_executor := getattr(self, "model_executor", None): model_executor.shutdown() - def get_tokenizer_group(self) -> TokenizerGroup: + def 
get_tokenizer(self) -> AnyTokenizer: if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") return self.tokenizer - def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self.get_tokenizer_group().get_lora_tokenizer(lora_request) - - def _init_tokenizer(self) -> TokenizerGroup: - return init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - lora_config=self.lora_config) + def _init_tokenizer(self) -> AnyTokenizer: + return init_tokenizer_from_configs(model_config=self.model_config) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) @@ -574,11 +553,11 @@ class LLMEngine: ) return None - self._validate_model_inputs(processed_inputs, lora_request) + self._validate_model_inputs(processed_inputs) # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + eos_token_id = self.input_preprocessor.get_eos_token_id() encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) @@ -700,7 +679,6 @@ class LLMEngine: processed_inputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, ) self._add_processed_request( @@ -1739,29 +1717,22 @@ class LLMEngine: SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) - def _validate_model_inputs(self, inputs: ProcessorInputs, - lora_request: Optional[LoRARequest]): + def _validate_model_inputs(self, inputs: ProcessorInputs): encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, - lora_request, - prompt_type="encoder") + self._validate_model_input(encoder_inputs, prompt_type="encoder") - self._validate_model_input(decoder_inputs, - lora_request, - prompt_type="decoder") + self._validate_model_input(decoder_inputs, prompt_type="decoder") def _validate_model_input( self, prompt_inputs: SingletonInputs, - lora_request: Optional[LoRARequest], *, prompt_type: Literal["encoder", "decoder"], ): model_config = self.model_config - tokenizer = (None if self.tokenizer is None else - self.tokenizer.get_lora_tokenizer(lora_request)) + tokenizer = self.tokenizer prompt_ids = prompt_inputs.get("prompt_token_ids", []) if not prompt_ids: @@ -1822,7 +1793,7 @@ class LLMEngine: logits_processors = [] if (sampling_params.logit_bias or sampling_params.allowed_token_ids): - tokenizer = self.get_tokenizer(lora_request=lora_request) + tokenizer = self.get_tokenizer() processors = get_openai_logits_processors( logit_bias=sampling_params.logit_bias, @@ -1835,7 +1806,7 @@ class LLMEngine: sampling_params.allowed_token_ids = None if len(sampling_params.bad_words) > 0: - tokenizer = self.get_tokenizer(lora_request) + tokenizer = self.get_tokenizer() processors = get_bad_words_logits_processors( bad_words=sampling_params.bad_words, tokenizer=tokenizer) logits_processors.extend(processors) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4d75719c1719b..587a9221e32c8 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -2,14 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from typing import Callable, List +from typing import 
List from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput +from vllm.sequence import SequenceGroup, SequenceGroupOutput from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import Counter @@ -31,7 +30,6 @@ class SequenceGroupOutputProcessor(ABC): detokenizer: Detokenizer, scheduler: List[Scheduler], seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], stop_checker: "StopChecker", ): """Create an output processor. diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 68a63044df05e..0916f1c918c85 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,13 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable, List, Optional, Tuple +from typing import List, Optional, Tuple from vllm.lora.request import LoRARequest from vllm.reasoning import ReasoningParser from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus -from vllm.transformers_utils.tokenizer import AnyTokenizer class StopChecker: @@ -20,12 +19,10 @@ class StopChecker: def __init__( self, max_model_len: int, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], reasoner: Optional[ReasoningParser] = None, ): # Do not use it directly, but use `self._get_max_model_len`. self._max_model_len = max_model_len - self.get_tokenizer_for_seq = get_tokenizer_for_seq self.reasoner = reasoner def _get_max_model_len(self, lora_req: Optional[LoRARequest]): diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 94eacfbdfb301..808d2d0ce3d28 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -76,8 +76,7 @@ class EngineClient(ABC): include_stop_str_in_output = params.include_stop_str_in_output preprocessor = await self.get_input_preprocessor() - tokenizer_group = preprocessor.get_tokenizer_group() - tokenizer = await tokenizer_group.get_lora_tokenizer_async() + tokenizer = preprocessor.get_tokenizer() eos_token_id = tokenizer.eos_token_id if is_explicit_encoder_decoder_prompt(prompt): @@ -260,11 +259,8 @@ class EngineClient(ABC): ... @abstractmethod - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - """Get the appropriate tokenizer for the request""" + async def get_tokenizer(self) -> AnyTokenizer: + """Get the tokenizer""" ... async def get_io_processor(self) -> IOProcessor: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4b51dbcd8acb9..f2264292fa660 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -301,23 +301,17 @@ class LLM: self.io_processor = get_io_processor(self.llm_engine.vllm_config, io_processor_plugin) - def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self.llm_engine.get_tokenizer_group().get_lora_tokenizer( - lora_request) + def get_tokenizer(self) -> AnyTokenizer: + return self.llm_engine.get_tokenizer() def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: - tokenizer_group = self.llm_engine.get_tokenizer_group() - # While CachedTokenizer is dynamic, have no choice but # compare class name. 
Misjudgment will arise from # user-defined tokenizer started with 'Cached' if tokenizer.__class__.__name__.startswith("Cached"): - tokenizer_group.tokenizer = tokenizer + self.llm_engine.tokenizer = tokenizer else: - tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer) + self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) def get_default_sampling_params(self) -> SamplingParams: if self.default_sampling_params is None: @@ -707,7 +701,6 @@ class LLM: self, messages: Union[list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]], - lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, @@ -739,7 +732,7 @@ class LLM: cast(list[ChatCompletionMessageParam], messages) ] - tokenizer = self.get_tokenizer(lora_request) + tokenizer = self.get_tokenizer() model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( chat_template, @@ -872,7 +865,6 @@ class LLM: prompts = self.preprocess_chat( messages=messages, - lora_request=lora_request, chat_template=chat_template, chat_template_content_format=chat_template_content_format, add_generation_prompt=add_generation_prompt, @@ -1519,7 +1511,7 @@ class LLM: ): """ Validate that if any multi-modal data is skipped (i.e. None), - then its corresponding UUID must be set. + then its corresponding UUID must be set. """ if multi_modal_data is None: return diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6c9c1ae85f570..61d65bd8f119a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -188,7 +188,7 @@ class OpenAIServingChat(OpenAIServing): model_name = self.models.model_name(lora_request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + tokenizer = await self.engine_client.get_tokenizer() tool_parser = self.tool_parser diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 7e88424c169ce..fc56668aeb1b6 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -50,10 +50,7 @@ class ClassificationMixin(OpenAIServing): return None try: - ctx.lora_request = self._maybe_get_adapters(ctx.request) - - ctx.tokenizer = await self.engine_client.get_tokenizer( - ctx.lora_request) + ctx.tokenizer = await self.engine_client.get_tokenizer() renderer = self._get_renderer(ctx.tokenizer) ctx.engine_prompts = await renderer.render_prompt( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c2de449a96994..044f08f32b0d3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -127,8 +127,7 @@ class OpenAIServingCompletion(OpenAIServing): if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = await self.engine_client.get_tokenizer(lora_request - ) + tokenizer = await self.engine_client.get_tokenizer() renderer = self._get_renderer(tokenizer) engine_prompts = await renderer.render_prompt_and_embeds( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index c0d1fe4b6e168..647e7daed6598 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -76,8 +76,7 @@ class 
EmbeddingMixin(OpenAIServing): try: ctx.lora_request = self._maybe_get_adapters(ctx.request) - tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request - ) + tokenizer = await self.engine_client.get_tokenizer() renderer = self._get_renderer(tokenizer) if isinstance(ctx.request, EmbeddingChatRequest): @@ -394,8 +393,8 @@ class EmbeddingMixin(OpenAIServing): ) -> Optional[ErrorResponse]: """Collect and aggregate batch results with support for chunked processing. - - For chunked requests, performs online aggregation to + + For chunked requests, performs online aggregation to minimize memory usage. For regular requests, collects results normally. """ diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index cac1d1ba56839..0750c7ec3e9f1 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -103,8 +103,7 @@ class OpenAIServingPooling(OpenAIServing): if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = await self.engine_client.get_tokenizer(lora_request - ) + tokenizer = await self.engine_client.get_tokenizer() renderer = self._get_renderer(tokenizer) if getattr(request, "dimensions", None) is not None: diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 7be5e54208bd4..b81b2c7223efc 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -240,7 +240,7 @@ class OpenAIServingResponses(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) model_name = self.models.model_name(lora_request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + tokenizer = await self.engine_client.get_tokenizer() if self.use_harmony: messages, request_prompts, engine_prompts = ( diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 24767ed66fc6a..623b1c863f779 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -269,7 +269,7 @@ class ServingScores(OpenAIServing): ) -> Union[list[PoolingRequestOutput], ErrorResponse]: lora_request = self._maybe_get_adapters(request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + tokenizer = await self.engine_client.get_tokenizer() truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 1efd9678571c4..3918d08ebf81d 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -65,7 +65,7 @@ class OpenAIServingTokenization(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + tokenizer = await self.engine_client.get_tokenizer() renderer = self._get_renderer(tokenizer) if isinstance(request, TokenizeChatRequest): @@ -130,7 +130,7 @@ class OpenAIServingTokenization(OpenAIServing): lora_request = self._maybe_get_adapters(request) - tokenizer = await self.engine_client.get_tokenizer(lora_request) + tokenizer = await self.engine_client.get_tokenizer() self._log_inputs(request_id, request.tokens, diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 22287aa6f41e0..cb3a5cdb840e6 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -9,13 +9,11 @@ from typing_extensions import 
assert_never from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs, MultiModalUUIDDict) from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import TokenizerGroup from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, EncoderDecoderInputs, ProcessorInputs, PromptType, @@ -31,7 +29,7 @@ class InputPreprocessor: def __init__( self, model_config: ModelConfig, - tokenizer: Optional[TokenizerGroup], + tokenizer: Optional[AnyTokenizer], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None, ) -> None: @@ -42,32 +40,28 @@ class InputPreprocessor: self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache - def get_tokenizer_group(self) -> TokenizerGroup: + def get_tokenizer(self) -> AnyTokenizer: if self.tokenizer is None: raise ValueError("You cannot pass text prompts when " "`skip_tokenizer_init` is True") return self.tokenizer - def get_bos_token_id(self, - lora_request: Optional[LoRARequest] = None - ) -> Optional[int]: + def get_bos_token_id(self) -> Optional[int]: if self.tokenizer is None: logger.warning("Using None for BOS token id because tokenizer " "is not initialized") return None - return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id + return self.tokenizer.bos_token_id - def get_eos_token_id(self, - lora_request: Optional[LoRARequest] = None - ) -> Optional[int]: + def get_eos_token_id(self) -> Optional[int]: if self.tokenizer is None: logger.warning("Using None for EOS token id because tokenizer " "is not initialized") return None - return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id + return self.tokenizer.eos_token_id def get_decoder_start_token_id(self) -> Optional[int]: """ @@ -190,14 +184,13 @@ class InputPreprocessor: def _tokenize_prompt( self, prompt: str, - lora_request: Optional[LoRARequest], tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: """ Apply the model's tokenizer to a text prompt, returning the corresponding token IDs. """ - tokenizer = self.get_tokenizer_group() + tokenizer = self.get_tokenizer() tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) encoder_config = self.model_config.encoder_config @@ -205,50 +198,39 @@ class InputPreprocessor: if encoder_config and encoder_config.get("do_lower_case", False): prompt = prompt.lower() - return tokenizer.encode(prompt=prompt, - lora_request=lora_request, - **tokenization_kwargs) + return tokenizer.encode(prompt, **tokenization_kwargs) async def _tokenize_prompt_async( self, prompt: str, - lora_request: Optional[LoRARequest], tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: """ Async version of [`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt]. 
""" - tokenizer = self.get_tokenizer_group() + tokenizer = self.get_tokenizer() tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) - return await tokenizer.encode_async(prompt=prompt, - lora_request=lora_request, - **tokenization_kwargs) + return tokenizer.encode(prompt, **tokenization_kwargs) - def _get_mm_tokenizer( - self, - lora_request: Optional[LoRARequest], - ) -> AnyTokenizer: + def _get_mm_tokenizer(self) -> AnyTokenizer: # PrithviGeoSpatialMAE needs to be initialized without a tokenizer # while using also multi-modal input if not self.tokenizer: return cast(AnyTokenizer, object()) # Dummy - tokenizer_group = self.get_tokenizer_group() - return tokenizer_group.get_lora_tokenizer(lora_request) + tokenizer = self.get_tokenizer() + return tokenizer - async def _get_mm_tokenizer_async( - self, - lora_request: Optional[LoRARequest], - ) -> AnyTokenizer: + async def _get_mm_tokenizer_async(self) -> AnyTokenizer: # PrithviGeoSpatialMAE needs to be initialized without a tokenizer # while using also multi-modal input if not self.tokenizer: return cast(AnyTokenizer, object()) # Dummy - tokenizer_group = self.get_tokenizer_group() - return await tokenizer_group.get_lora_tokenizer_async(lora_request) + tokenizer = self.get_tokenizer() + return tokenizer def _process_multimodal( self, @@ -256,7 +238,6 @@ class InputPreprocessor: mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: @@ -264,7 +245,7 @@ class InputPreprocessor: Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. """ - tokenizer = self._get_mm_tokenizer(lora_request) + tokenizer = self._get_mm_tokenizer() mm_processor = self.mm_registry.create_processor( self.model_config, @@ -299,7 +280,6 @@ class InputPreprocessor: mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: @@ -307,7 +287,7 @@ class InputPreprocessor: Async version of [`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal]. 
""" - tokenizer = await self._get_mm_tokenizer_async(lora_request) + tokenizer = await self._get_mm_tokenizer_async() mm_processor = self.mm_registry.create_processor( self.model_config, @@ -386,7 +366,6 @@ class InputPreprocessor: self, parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: @@ -400,7 +379,6 @@ class InputPreprocessor: multi_modal_data, parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) else: @@ -415,7 +393,6 @@ class InputPreprocessor: self, parsed_content: TokensPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: @@ -429,7 +406,6 @@ class InputPreprocessor: multi_modal_data, parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) else: @@ -444,7 +420,6 @@ class InputPreprocessor: self, parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: @@ -457,13 +432,11 @@ class InputPreprocessor: multi_modal_data, parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) else: prompt_token_ids = self._tokenize_prompt( prompt_text, - lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, ) inputs = token_inputs( @@ -480,7 +453,6 @@ class InputPreprocessor: self, parsed_content: TextPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: @@ -493,13 +465,11 @@ class InputPreprocessor: multi_modal_data, parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) else: prompt_token_ids = await self._tokenize_prompt_async( prompt_text, - lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, ) inputs = token_inputs( @@ -516,7 +486,6 @@ class InputPreprocessor: self, prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: @@ -526,7 +495,6 @@ class InputPreprocessor: Arguments: * prompt: single encoder or decoder input prompt - * lora_request: this is only valid for decoder prompts Returns: @@ -539,21 +507,18 @@ class InputPreprocessor: if parsed["type"] == "tokens": return self._process_tokens( parsed["content"], - lora_request=lora_request, mm_uuids=mm_uuids, ) if parsed["type"] == "text": return self._process_text( parsed["content"], tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) if parsed["type"] == "str": return self._process_text( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) @@ -563,7 +528,6 @@ class InputPreprocessor: self, prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: 
Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: @@ -578,21 +542,18 @@ class InputPreprocessor: if parsed["type"] == "tokens": return await self._process_tokens_async( parsed["content"], - lora_request=lora_request, mm_uuids=mm_uuids, ) if parsed["type"] == "text": return await self._process_text_async( parsed["content"], tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) if parsed["type"] == "str": return await self._process_text_async( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) @@ -844,7 +805,6 @@ class InputPreprocessor: self, prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: @@ -856,7 +816,6 @@ class InputPreprocessor: Arguments: * prompt: input prompt - * lora_request Returns: @@ -866,7 +825,6 @@ class InputPreprocessor: prompt_comps = self._prompt_to_llm_inputs( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) @@ -876,7 +834,6 @@ class InputPreprocessor: self, prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: @@ -887,7 +844,6 @@ class InputPreprocessor: prompt_comps = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) @@ -897,7 +853,6 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: @@ -919,7 +874,6 @@ class InputPreprocessor: return self._process_decoder_only_prompt( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) @@ -927,7 +881,6 @@ class InputPreprocessor: self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, - lora_request: Optional[LoRARequest] = None, *, mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: @@ -952,7 +905,6 @@ class InputPreprocessor: return await self._process_decoder_only_prompt_async( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 56b01ecf78c46..e2d2846a28073 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -10,18 +10,13 @@ from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence, from .detokenizer_utils import (convert_prompt_ids_to_tokens, detokenize_incrementally) from .tokenizer import AnyTokenizer -from .tokenizer_group import TokenizerGroup class Detokenizer: """Provides methods to decode the output of a model into text.""" - def __init__(self, tokenizer_group: TokenizerGroup): - self.tokenizer_group = tokenizer_group - - def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer: - """Returns the HF tokenizer to use for a given sequence.""" - return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request) + def __init__(self, tokenizer: AnyTokenizer): + self.tokenizer = tokenizer def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, prompt_logprobs: list[Optional[dict[ @@ -32,9 +27,9 
@@ class Detokenizer: Args: seq_group: The sequence group to decode. prompt_logprobs: The logprobs to decode. - position_offset: Offset of the first index of the logprobs + position_offset: Offset of the first index of the logprobs relative to the start of the sequence (for chunked prefill). - + Returns: The prompt logprobs with the decoded tokens. """ @@ -46,7 +41,6 @@ class Detokenizer: # Only prompt, without the generated token. all_token_ids = seq.get_token_ids() prompt_token_ids = all_token_ids[:-1] - tokenizer = self.get_tokenizer_for_seq(seq) prefix_offset = 0 read_offset = 0 next_iter_prefix_offset = 0 @@ -70,7 +64,7 @@ class Detokenizer: prompt_token_ids[:token_position] + [token_id]) (new_tokens, new_text, new_prefix_offset, new_read_offset) = detokenize_incrementally( - tokenizer=tokenizer, + tokenizer=self.tokenizer, all_input_ids=prompt_token_ids_with_token, prev_tokens=prev_tokens, prefix_offset=prefix_offset, @@ -111,7 +105,6 @@ class Detokenizer: """ all_input_ids = seq.get_token_ids() token_id_generated_this_iteration = all_input_ids[-1] - tokenizer = self.get_tokenizer_for_seq(seq) # Convert prompt token IDs to tokens if necessary. # Do it here so that we don't have to repeat this @@ -119,14 +112,14 @@ class Detokenizer: if seq.tokens is None: (seq.tokens, seq.prefix_offset, seq.read_offset) = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, + tokenizer=self.tokenizer, prompt_ids=all_input_ids[:-1], skip_special_tokens=prms.skip_special_tokens, ) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( - tokenizer=tokenizer, + tokenizer=self.tokenizer, all_input_ids=all_input_ids, prev_tokens=seq.tokens, prefix_offset=seq.prefix_offset, @@ -150,7 +143,7 @@ class Detokenizer: and token_id != VLLM_INVALID_TOKEN_ID): all_input_ids_with_logprob = previous_tokens + [token_id] (_, new_text, _, _) = detokenize_incrementally( - tokenizer=tokenizer, + tokenizer=self.tokenizer, all_input_ids=all_input_ids_with_logprob, prev_tokens=seq.tokens, prefix_offset=seq.prefix_offset, diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index b3f1977f26cf4..9aaac66817394 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union import huggingface_hub from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) +from typing_extensions import assert_never from vllm import envs from vllm.logger import init_logger @@ -19,7 +20,6 @@ from vllm.transformers_utils.config import ( get_sentence_transformer_tokenizer_config) from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import make_async if TYPE_CHECKING: from vllm.config import ModelConfig @@ -274,20 +274,19 @@ def cached_tokenizer_from_config( ) -def get_lora_tokenizer(lora_request: LoRARequest, *args, - **kwargs) -> Optional[AnyTokenizer]: - if lora_request is None: - return None - try: - tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs) - except Exception as e: - # No tokenizer was found in the LoRA folder, - # use base model tokenizer - logger.warning( - "No tokenizer found in %s, using base model tokenizer instead. 
" - "(Exception: %s)", lora_request.lora_path, e) - tokenizer = None - return tokenizer +def init_tokenizer_from_configs(model_config: ModelConfig): + runner_type = model_config.runner_type + if runner_type == "generate" or runner_type == "draft": + truncation_side = "left" + elif runner_type == "pooling": + truncation_side = "right" + else: + assert_never(runner_type) - -get_lora_tokenizer_async = make_async(get_lora_tokenizer) + return get_tokenizer( + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision, + truncation_side=truncation_side, + ) diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index 20e5fea714e70..b1f84a023fc31 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -61,6 +61,11 @@ class TokenizerBase(ABC): def max_token_id(self) -> int: raise NotImplementedError() + @property + @abstractmethod + def truncation_side(self) -> str: + raise NotImplementedError() + def __len__(self) -> int: return self.vocab_size diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py deleted file mode 100644 index 6b519cccd3cc6..0000000000000 --- a/vllm/transformers_utils/tokenizer_group.py +++ /dev/null @@ -1,132 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -from typing_extensions import assert_never - -from vllm.config import ModelConfig, SchedulerConfig -from vllm.config.lora import LoRAConfig -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, - get_lora_tokenizer, - get_lora_tokenizer_async, - get_tokenizer) -from vllm.utils import LRUCache - - -class TokenizerGroup: - """A group of tokenizers that can be used for LoRA adapters.""" - - def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, - max_input_length: Optional[int], **tokenizer_config): - self.tokenizer_id = tokenizer_id - self.tokenizer_config = tokenizer_config - self.enable_lora = enable_lora - self.max_input_length = max_input_length - self.truncation_side = tokenizer_config.get("truncation_side", "left") - self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) - max_loras = tokenizer_config.get("max_loras", 0) - self.lora_tokenizers = LRUCache[int, AnyTokenizer]( - capacity=max(max_loras, max_num_seqs) if enable_lora else 0) - - def get_max_input_len(self, - lora_request: Optional[LoRARequest] = None - ) -> Optional[int]: - """Get the maximum input length for the LoRA request.""" - return self.max_input_length - - def _raise_if_input_too_long(self, - encoded_tokens: list[int], - lora_request: Optional[LoRARequest] = None): - input_length = len(encoded_tokens) - if lora_request: - max_input_length = (lora_request.long_lora_max_len - or self.max_input_length) - else: - max_input_length = self.max_input_length - if max_input_length is not None and input_length > max_input_length: - raise ValueError("Input too long.", input_length, max_input_length) - - def encode(self, - prompt: str, - max_length: Optional[int] = None, - truncation: Optional[bool] = None, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: - - tokenizer = self.get_lora_tokenizer(lora_request) - ret = encode_tokens(tokenizer, - prompt, - 
max_length=max_length, - truncation=truncation, - add_special_tokens=add_special_tokens) - self._raise_if_input_too_long(ret, lora_request) - return ret - - async def encode_async( - self, - prompt: str, - max_length: Optional[int] = None, - truncation: Optional[bool] = None, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> list[int]: - tokenizer = await self.get_lora_tokenizer_async(lora_request) - ret = encode_tokens(tokenizer, - prompt, - max_length=max_length, - truncation=truncation, - add_special_tokens=add_special_tokens) - self._raise_if_input_too_long(ret, lora_request) - return ret - - def get_lora_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (get_lora_tokenizer( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers[lora_request.lora_int_id] - - async def get_lora_tokenizer_async( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (await get_lora_tokenizer_async( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers[lora_request.lora_int_id] - - -def init_tokenizer_from_configs(model_config: ModelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig]): - runner_type = model_config.runner_type - if runner_type == "generate" or runner_type == "draft": - truncation_side = "left" - elif runner_type == "pooling": - truncation_side = "right" - else: - assert_never(runner_type) - - return TokenizerGroup( - tokenizer_id=model_config.tokenizer, - enable_lora=bool(lora_config), - max_num_seqs=scheduler_config.max_num_seqs, - max_loras=lora_config.max_loras if lora_config else 0, - max_input_length=None, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, - truncation_side=truncation_side) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index f545993a5a980..5b07327cf2b81 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -327,6 +327,10 @@ class MistralTokenizer(TokenizerBase): def max_token_id(self) -> int: return self._max_token_id + @property + def truncation_side(self) -> str: + raise NotImplementedError() + def __len__(self) -> int: return self.vocab_size diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a9ced402b974f..f17c269e4709e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -29,8 +29,8 @@ from vllm.tasks import SupportedTask from vllm.tracing import init_tracer from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, 
as_list, cancel_task_threadsafe, cdiv, deprecate_kwargs) @@ -112,9 +112,7 @@ class AsyncLLM(EngineClient): else: # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + model_config=vllm_config.model_config) # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( @@ -596,15 +594,12 @@ class AsyncLLM(EngineClient): async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: + async def get_tokenizer(self) -> AnyTokenizer: if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") - return self.tokenizer.get_lora_tokenizer(lora_request) + return self.tokenizer async def is_tracing_enabled(self) -> bool: return self.observability_config.otlp_traces_endpoint is not None diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index fca5a783bc3bf..c93bfc35f0aeb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -20,8 +20,8 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask from vllm.tracing import init_tracer -from vllm.transformers_utils.tokenizer_group import ( - TokenizerGroup, init_tokenizer_from_configs) +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.utils import Device from vllm.v1.engine.core_client import EngineCoreClient @@ -89,9 +89,7 @@ class LLMEngine: else: # Tokenizer (+ ensure liveness if running in another process). 
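        # With TokenizerGroup gone, both AsyncLLM and LLMEngine hold a plain
        # AnyTokenizer. A minimal sketch of the new initialization, assuming a
        # default generate-mode ModelConfig (truncation_side="left"):
        #
        #     from vllm.transformers_utils.tokenizer import (
        #         init_tokenizer_from_configs)
        #
        #     tokenizer = init_tokenizer_from_configs(
        #         model_config=vllm_config.model_config)
        #     token_ids = tokenizer.encode("Hello, world!")
        #
        # Every request, LoRA-backed or not, now shares this single
        # base-model tokenizer.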
self.tokenizer = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + model_config=vllm_config.model_config) # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config=vllm_config, @@ -297,7 +295,7 @@ class LLMEngine: assert self.log_stats, "Stat logging disabled" return get_metrics_snapshot() - def get_tokenizer_group(self) -> TokenizerGroup: + def get_tokenizer(self) -> AnyTokenizer: if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 14ac1e3e5afa8..5dad63988daa4 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -14,7 +14,6 @@ from vllm.sampling_params import RequestOutputKind from vllm.tracing import (SpanAttributes, SpanKind, Tracer, extract_trace_context) from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine.detokenizer import IncrementalDetokenizer from vllm.v1.engine.logprobs import LogprobsProcessor @@ -290,7 +289,7 @@ class RequestState: class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" - def __init__(self, tokenizer: TokenizerGroup, log_stats: bool): + def __init__(self, tokenizer: AnyTokenizer, log_stats: bool): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: dict[str, RequestState] = {} @@ -347,10 +346,7 @@ class OutputProcessor: if request_id in self.request_states: raise ValueError(f"Request id {request_id} already running.") - tokenizer = None if not self.tokenizer else \ - self.tokenizer.get_lora_tokenizer(request.lora_request) - - req_state = RequestState.from_new_request(tokenizer=tokenizer, + req_state = RequestState.from_new_request(tokenizer=self.tokenizer, request=request, prompt=prompt, parent_req=parent_req, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 327b4e2705485..8d9f2ba1ec825 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -9,6 +9,7 @@ from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config @@ -17,7 +18,7 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) @@ -28,13 +29,15 @@ from vllm.v1.structured_output.backend_outlines import ( from vllm.v1.structured_output.backend_xgrammar import ( validate_xgrammar_grammar) +logger = init_logger(__name__) + class Processor: def __init__( self, vllm_config: VllmConfig, - tokenizer: TokenizerGroup, + tokenizer: AnyTokenizer, mm_registry: 
MultiModalRegistry = MULTIMODAL_REGISTRY, ): @@ -90,7 +93,6 @@ class Processor: def _validate_sampling_params( self, params: SamplingParams, - lora_request: Optional[LoRARequest], ) -> None: self._validate_structured_output(params) self._validate_logit_bias(params) @@ -103,8 +105,7 @@ class Processor: # When skip_tokenizer_init=True, we can't validate token IDs # Skip validation and let the model handle invalid tokens return - tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) - vocab_size = len(tokenizer) + vocab_size = len(self.tokenizer) if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): raise ValueError( "allowed_token_ids contains out-of-vocab token id!") @@ -144,7 +145,6 @@ class Processor: def _validate_params( self, params: Union[SamplingParams, PoolingParams], - lora_request: Optional[LoRARequest], ): """ Validate supported SamplingParam. @@ -155,14 +155,14 @@ class Processor: return self._validate_logprobs(params) - self._validate_sampling_params(params, lora_request) + self._validate_sampling_params(params) self._validate_supported_sampling_params(params) def _validate_multi_modal_uuids(self, prompt: PromptType) -> None: """ Validate that user-provided multi_modal_uuids align with multi_modal_data in the incoming request prompt(s). - Only checks lengths; `None` entries are allowed and will be + Only checks lengths; `None` entries are allowed and will be auto-hashed downstream. """ @@ -202,10 +202,22 @@ class Processor: _validate_single_prompt(prompt) # type: ignore[arg-type] def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: - if lora_request is not None and not self.lora_config: + if lora_request is None: + return + + # LoRA request passed in while LoRA is not enabled + if not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") + if self.tokenizer is not None: + logger.warning_once( + "vLLM has deprecated support for supporting different " + "tokenizers for different LoRAs. By default, vLLM uses base " + "model's tokenizer. If you are using a LoRA " + "with its own tokenizer, consider specifying `--tokenizer " + "[lora_path]` to use the LoRA tokenizer.") + def _validate_structured_output(self, params: SamplingParams) -> None: if not params.guided_decoding or not self.decoding_config: return @@ -326,7 +338,7 @@ class Processor: # TODO(woosuk): Support pooling models. 
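    # Per the deprecation warning emitted in _validate_lora above, adapters no
    # longer get a per-LoRA tokenizer. A sketch of the suggested workaround for
    # an adapter that ships its own tokenizer (paths are placeholders):
    #
    #     vllm serve <base-model> --tokenizer /path/to/lora_adapter \
    #         --enable-lora --lora-modules my-lora=/path/to/lora_adapter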
self._validate_lora(lora_request) - self._validate_params(params, lora_request) + self._validate_params(params) data_parallel_size = self.vllm_config.parallel_config.data_parallel_size if data_parallel_rank is not None and not (0 <= data_parallel_rank < @@ -365,7 +377,6 @@ class Processor: processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, mm_uuids=mm_uuids, ) from vllm.platforms import current_platform @@ -375,9 +386,9 @@ class Processor: processed_inputs=processed_inputs, ) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + eos_token_id = self.input_preprocessor.get_eos_token_id() - self._validate_model_inputs(processed_inputs, lora_request) + self._validate_model_inputs(processed_inputs) encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) @@ -394,8 +405,7 @@ class Processor: sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) if self.tokenizer is not None: - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) + sampling_params.update_from_tokenizer(self.tokenizer) else: pooling_params = params.clone() @@ -436,24 +446,17 @@ class Processor: trace_headers=trace_headers, ) - def _validate_model_inputs(self, - inputs: ProcessorInputs, - lora_request: Optional[LoRARequest] = None): + def _validate_model_inputs(self, inputs: ProcessorInputs): encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, - lora_request, - prompt_type="encoder") + self._validate_model_input(encoder_inputs, prompt_type="encoder") - self._validate_model_input(decoder_inputs, - lora_request, - prompt_type="decoder") + self._validate_model_input(decoder_inputs, prompt_type="decoder") def _validate_model_input( self, prompt_inputs: SingletonInputs, - lora_request: Optional[LoRARequest], *, prompt_type: Literal["encoder", "decoder"], ): @@ -469,7 +472,7 @@ class Processor: if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) + tokenizer = self.tokenizer max_input_id = max(prompt_ids, default=0) # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 57854cc112041..1ab29dfecd9e4 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, @@ -60,10 +60,7 @@ class StructuredOutputManager: max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) self.tokenizer = init_tokenizer_from_configs( - model_config=self.vllm_config.model_config, - scheduler_config=self.vllm_config.scheduler_config, - lora_config=self.vllm_config.lora_config, - ).get_lora_tokenizer(None) + model_config=self.vllm_config.model_config) reasoning_backend = \ 
self.vllm_config.decoding_config.reasoning_backend if reasoning_backend: From 0fb2551c238c7ccbcf6f25ef4646ce6c92f684d1 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Wed, 17 Sep 2025 16:49:19 +0800 Subject: [PATCH 354/584] [Docs] Fix griffe warning in base_static_graph.py (#25018) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- vllm/compilation/base_static_graph.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py index 161d066ce9fb8..6ee82e74963d9 100644 --- a/vllm/compilation/base_static_graph.py +++ b/vllm/compilation/base_static_graph.py @@ -12,8 +12,13 @@ class AbstractStaticGraphWrapper(Protocol): to be captured as a static graph. """ - def __init__(self, runnable: Callable, vllm_config: VllmConfig, - runtime_mode: CUDAGraphMode, **kwargs): + def __init__( + self, + runnable: Callable[..., Any], + vllm_config: VllmConfig, + runtime_mode: CUDAGraphMode, + **kwargs: Any, + ) -> None: """ Initializes the StaticGraphWrapper class with graph capturing and execution-related configurations. @@ -31,7 +36,7 @@ class AbstractStaticGraphWrapper(Protocol): """ raise NotImplementedError - def __call__(self, *args, **kwargs) -> Any: + def __call__(self, *args: Any, **kwargs: Any) -> Any: """ Executes the wrapped callable. From bb58dc8c20315038ea5e14007de7269dfaec1ce4 Mon Sep 17 00:00:00 2001 From: Xinyu Chen <xinyu1.chen@intel.com> Date: Wed, 17 Sep 2025 16:57:25 +0800 Subject: [PATCH 355/584] [DP] Create placement groups by ray_device_key (#25026) Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/v1/engine/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index df2fd8d9df078..18ef25ceb6f5e 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -334,20 +334,22 @@ class CoreEngineActorManager: "No nodes with resources found in Ray cluster.") assert dp_master_ip_key in nodes[0], ( "The DP master node (ip: %s) is missing or dead", dp_master_ip) + device_str = current_platform.ray_device_key for node_resources in nodes: - if "GPU" not in node_resources: + if device_str not in node_resources: continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes - available_engine_count = int(node_resources["GPU"]) // world_size + available_engine_count = int( + node_resources[device_str]) // world_size if dp_master_ip_key in node_resources: assert available_engine_count >= local_engine_count, ( "Not enough resources to allocate DP ranks " f"on DP master node {dp_master_ip}") for i in range(local_engine_count): bundles = [{ - "GPU": 1.0, + device_str: 1.0, "node:" + dp_master_ip: 0.001 }] * world_size + [{ "CPU": 1.0 @@ -363,7 +365,7 @@ class CoreEngineActorManager: for i in range(available_engine_count): if len(placement_groups) == num_pg_to_create: break - bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] + bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}] pg = ray.util.placement_group( name=f"dp_rank_{len(placement_groups)}", strategy="STRICT_PACK", @@ -415,17 +417,18 @@ class CoreEngineActorManager: local_dp_ranks = [] num_pg_created = 0 + device_str = current_platform.ray_device_key for node in nodes: if num_pg_created >= num_pg_to_create: break node_ip = node.node_ip node_id = node.node_id - available_gpus = 
int(available_resources[node_id]["GPU"]) + available_gpus = int(available_resources[node_id][device_str]) # Get total GPUs on this node from the node's resources # Ray stores node resources with node ID as key - total_gpus = int(total_resources[node_id]["GPU"]) + total_gpus = int(total_resources[node_id][device_str]) # Calculate used GPUs and used engines on this node used_gpus = max(0, total_gpus - available_gpus) @@ -444,13 +447,13 @@ class CoreEngineActorManager: # Create bundles with node constraint for master node if node_ip == dp_master_ip: bundles = [{ - "GPU": 1.0, + device_str: 1.0, "node:" + dp_master_ip: 0.001 }] * world_size + [{ "CPU": 1.0 }] else: - bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] + bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}] pg = ray.util.placement_group( name=f"dp_rank_{rank}", From 544fe76b95aacdb6d0636c41813bee6236fb0027 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Wed, 17 Sep 2025 17:03:52 +0800 Subject: [PATCH 356/584] [Frontend] Support returning all prompt logprobs (#24956) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- tests/entrypoints/openai/test_chat_echo.py | 22 ++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 24 ++++++++++++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index de63f4ed218b6..0f459dd3d8574 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -22,6 +22,8 @@ def server(): "--enforce-eager", "--max-model-len", "4080", + "--max-logprobs", # test prompt_logprobs equal to -1 + "151936" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -77,3 +79,23 @@ async def test_chat_session_with_echo_and_continue_final_message( else: assert message.content is not None and saying not in message.content assert message.role == "assistant" + + +@pytest.mark.asyncio +async def test_prompt_logprobs(client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Beijing is the capital of which country?" 
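        # prompt_logprobs=-1 (exercised below) asks vLLM to return logprobs for
        # every prompt token rather than a fixed top-k, which is why the server
        # fixture above raises --max-logprobs to the model's full vocabulary
        # size (151936).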
+ }] + + completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body={"prompt_logprobs": -1}, + ) + + assert completion.prompt_logprobs is not None + assert len(completion.prompt_logprobs) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8ecb1a8239c35..6b4c3f531dbce 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -822,13 +822,17 @@ class ChatCompletionRequest(OpenAIBaseModel): @classmethod def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: - if data.get("stream") and prompt_logprobs > 0: + if data.get("stream") and (prompt_logprobs > 0 + or prompt_logprobs == -1): raise ValueError( "`prompt_logprobs` are not available when `stream=True`.") - if prompt_logprobs < 0: - raise ValueError("`prompt_logprobs` must be a positive value.") - + if prompt_logprobs < 0 and prompt_logprobs != -1: + raise ValueError( + "`prompt_logprobs` must be a positive value or -1.") + if prompt_logprobs == -1 and not envs.VLLM_USE_V1: + raise ValueError("`prompt_logprobs=-1` is only supported with " + "vLLM engine V1.") if (top_logprobs := data.get("top_logprobs")) is not None: if top_logprobs < 0: raise ValueError("`top_logprobs` must be a positive value.") @@ -1246,13 +1250,17 @@ class CompletionRequest(OpenAIBaseModel): @classmethod def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: - if data.get("stream") and prompt_logprobs > 0: + if data.get("stream") and (prompt_logprobs > 0 + or prompt_logprobs == -1): raise ValueError( "`prompt_logprobs` are not available when `stream=True`.") - if prompt_logprobs < 0: - raise ValueError("`prompt_logprobs` must be a positive value.") - + if prompt_logprobs < 0 and prompt_logprobs != -1: + raise ValueError( + "`prompt_logprobs` must be a positive value or -1.") + if prompt_logprobs == -1 and not envs.VLLM_USE_V1: + raise ValueError("`prompt_logprobs=-1` is only supported with " + "vLLM engine V1.") if (logprobs := data.get("logprobs")) is not None and logprobs < 0: raise ValueError("`logprobs` must be a positive value.") From 2b856970313e80a649a573879d94e9d3430ba018 Mon Sep 17 00:00:00 2001 From: Shijun Yin <shijun.yin@outlook.com> Date: Wed, 17 Sep 2025 17:21:18 +0800 Subject: [PATCH 357/584] [BugFix] enable DOTALL to match multi-line tool_call parameters in extract_tool_call_required_streaming (#24668) Signed-off-by: Shijun Yin <shijun.yin@outlook.com> --- vllm/entrypoints/openai/serving_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 61d65bd8f119a..cd85baa9ba661 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -418,7 +418,7 @@ class OpenAIServingChat(OpenAIServing): if not function_name_returned: # get partly generated arguments from the latest tool call param_match = re.search(r'.*"parameters":\s*(.*)', - current_text) + current_text, re.DOTALL) arguments = param_match.group(1) if param_match else "" arguments, _ = OpenAIServingChat._filter_delta_text( arguments, previous_text) From e120533d7ae3bc8c3ef39f215de274f1280bb454 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Wed, 17 Sep 2025 20:19:15 +0800 Subject: [PATCH 358/584] [Misc] Avoid use of deprecated `AutoModelForVision2Seq` (#25065) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- 
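Transformers has deprecated `AutoModelForVision2Seq` in favor of
`AutoModelForImageTextToText`; for the checkpoints touched below the two
auto-classes resolve to the same concrete model class, so only the loader
changes. A minimal sketch of the replacement (model id taken from the diff):

    from transformers import AutoModelForImageTextToText

    model = AutoModelForImageTextToText.from_pretrained(
        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")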
tests/models/multimodal/generation/test_common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d61b182761e44..79f9d607f3386 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -10,7 +10,7 @@ from pathlib import PosixPath import pytest from transformers import (AutoModel, AutoModelForImageTextToText, - AutoModelForTextToWaveform, AutoModelForVision2Seq) + AutoModelForTextToWaveform) from vllm.platforms import current_platform from vllm.utils import identity @@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = { video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 max_model_len=4096, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], @@ -502,7 +502,7 @@ VLM_TEST_SETTINGS = { num_video_frames=16, max_model_len=16384, hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( @@ -518,7 +518,7 @@ VLM_TEST_SETTINGS = { num_video_frames=16, max_model_len=4096, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, ), "mantis": VLMTestInfo( @@ -680,7 +680,7 @@ VLM_TEST_SETTINGS = { multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501 max_model_len=4096, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.cpu_model], @@ -784,7 +784,7 @@ VLM_TEST_SETTINGS = { test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, custom_test_opts=[CustomTestOptions( @@ -800,7 +800,7 @@ VLM_TEST_SETTINGS = { test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, custom_test_opts=[CustomTestOptions( inputs=custom_inputs.windows_attention_image_qwen2_5_vl(), From 252ada5559808783d6a23b489156b3705cea0417 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:53:30 +0300 Subject: [PATCH 359/584] Add RADIO Vision Encoder Support to vLLM (#24595) Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com> Co-authored-by: root <root@cw-dfw-h100-001-305-026.cm.cluster> --- tests/models/multimodal/pooling/test_radio.py | 86 +++ .../model_executor/models/nano_nemotron_vl.py | 114 ++-- 
vllm/model_executor/models/radio.py | 576 ++++++++++++++++++ vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/radio.py | 104 ++++ 5 files changed, 826 insertions(+), 56 deletions(-) create mode 100644 tests/models/multimodal/pooling/test_radio.py create mode 100644 vllm/model_executor/models/radio.py create mode 100644 vllm/transformers_utils/configs/radio.py diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py new file mode 100644 index 0000000000000..27b9fe369e800 --- /dev/null +++ b/tests/models/multimodal/pooling/test_radio.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +import torch.nn as nn +from huggingface_hub import snapshot_download +from transformers import AutoConfig, AutoModel, CLIPImageProcessor + +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.model_executor.models.radio import RadioModel +from vllm.transformers_utils.configs.radio import RadioConfig +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE + +from ....conftest import ImageTestAssets + +# we use snapshot_download to prevent conflicts between +# dynamic_module and trust_remote_code for hf_runner +DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] + + +@torch.inference_mode() +def run_radio_test( + image_assets: ImageTestAssets, + model_id: str, + *, + dtype: str, +): + model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] + + img_processor = CLIPImageProcessor.from_pretrained(model) + images = [asset.pil_image for asset in image_assets] + # Input resolution must be a multiple of `self.min_resolution_step`. + # Using `self.get_nearest_supported_resolution`, for assets 432x642 the + # nearest supported resolution is 432x640. 
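    # A sketch of that snapping, assuming min_resolution_step equals the
    # 16-pixel ViT patch size used by C-RADIOv2-H:
    #
    #     def nearest_supported(h: int, w: int,
    #                           step: int = 16) -> tuple[int, int]:
    #         return (h // step) * step, (w // step) * step
    #
    #     assert nearest_supported(432, 642) == (432, 640)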
+ pixel_values = [ + img_processor( + image, + return_tensors='pt').pixel_values.to(torch_dtype)[:, :, :, :640] + for image in images + ] + + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + + hf_model = AutoModel.from_pretrained( + model_id, + config=config, + torch_dtype=torch_dtype, + trust_remote_code=True, + ).to("cuda") + hf_model.eval() + + hf_outputs_per_image = [ + hf_model(pixel_value.to("cuda")).features + for pixel_value in pixel_values + ] + + radio_config = RadioConfig(model_name=config.args["model"], + reg_tokens=config.args["register_multiple"]) + vllm_model = RadioModel(radio_config) + vllm_model.load_weights(hf_model.state_dict()) + vllm_model = vllm_model.to("cuda", torch_dtype) + + vllm_outputs_per_image = [ + vllm_model(pixel_values=pixel_value.to("cuda")) + for pixel_value in pixel_values + ] + del vllm_model, hf_model + cleanup_dist_env_and_memory() + + cos_similar = nn.CosineSimilarity(dim=-1) + for vllm_output, hf_output in zip(vllm_outputs_per_image, + hf_outputs_per_image): + assert cos_similar(vllm_output, hf_output).mean() > 0.99 + + +@pytest.mark.parametrize("model_id", [ + "nvidia/C-RADIOv2-H", +]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_radio(dist_init, image_assets, model_id, dtype: str) -> None: + run_radio_test( + image_assets, + model_id, + dtype=dtype, + ) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 21765a483b8e0..4f8652c006941 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -18,8 +18,8 @@ import torch import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import (AutoModel, BatchEncoding, BatchFeature, - PretrainedConfig, TensorType) +from transformers import (BatchEncoding, BatchFeature, PretrainedConfig, + TensorType) from vllm.config import VllmConfig from vllm.model_executor.layers.activation import ReLUSquaredActivation @@ -32,6 +32,7 @@ from vllm.model_executor.models.internvl import (calculate_internvl_targets, get_internvl_target_ratios) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM +from vllm.model_executor.models.radio import RadioModel from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -48,6 +49,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.radio import RadioConfig from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -122,11 +124,6 @@ NanoNemotronVLVideoInputs = Union[NanoNemotronVLVideoPixelInputs, NanoNemotronVLVideoEmbeddingInputs] -def input_conditioner(x, norm_mean, norm_std): - y = (x - norm_mean) / norm_std - return y - - def dynamic_preprocess(image, *, image_size=512, @@ -305,8 +302,7 @@ class BaseNanoNemotronVLProcessor(ABC): images, max_num_tiles) image_inputs: dict[str, NestedTensors] = { "pixel_values_flat": - input_conditioner(torch.cat(pixel_values_lst), self.norm_mean, - self.norm_std), + torch.cat(pixel_values_lst), "image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]), } @@ -428,8 +424,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): video_inputs: dict[str, 
NestedTensors] = { "pixel_values_flat_video": - input_conditioner(torch.cat(pixel_values_lst_video), - self.norm_mean, self.norm_std), + torch.cat(pixel_values_lst_video), "video_num_patches": torch.tensor([len(item) for item in pixel_values_lst_video]), } @@ -905,18 +900,9 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) - self.vision_model = AutoModel.from_config(config.vision_config, - trust_remote_code=True) - self.vision_model.model._initialize_weights = ( - self.vision_model.model._init_weights) - # Move input normalization to processor to mirror original HF - # implementation where normalization is done in fp32 - self.vision_model.radio_model.make_preprocessor_external() - self.vision_model = self.vision_model.to( + self.vision_model = self.get_vit_model_from_radio_config(config).to( self.language_model.config.torch_dtype) - self.drop_vision_class_token = True - # Construct the vision projection. vit_hidden_size = config.vit_hidden_size vision_projection_hidden_size = config.projector_hidden_size @@ -972,7 +958,7 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, return x def extract_feature(self, pixel_values): - vit_embeds = self.vision_model(pixel_values).features + vit_embeds = self.vision_model(pixel_values) vit_embeds = vit_embeds.to(dtype=torch.bfloat16) h = w = int(vit_embeds.shape[1]**0.5) vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) @@ -1212,47 +1198,39 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, sampling_metadata) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + adapter_dict = dict(self.mlp1.named_parameters()) - def is_vision_model_weights(weight: tuple[str, torch.Tensor]): - return weight[0].startswith("vision_model") + def is_llm(name: str) -> bool: + return name.startswith("language_model") def is_adapter_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("mlp1") - # Get references to parameters for direct loading - vision_model_dict = dict(self.vision_model.named_parameters()) - vision_model_buffers = dict(self.vision_model.named_buffers()) - adapter_dict = dict(self.mlp1.named_parameters()) + def is_vision_weights(name: str) -> bool: + return name.startswith("vision_model.radio_model.") - def llm_weights_generator(): - # Single pass over weights - for name, w in weights: - if is_vision_model_weights((name, w)): - # Load vision encoder weights directly - trimmed_name = ".".join(name.split(".")[1:]) - if "input_conditioner" in trimmed_name: - continue - if trimmed_name in vision_model_buffers: - param = vision_model_buffers[trimmed_name] - else: - param = vision_model_dict[trimmed_name] - with torch.no_grad(): - default_weight_loader(param, w) - elif is_adapter_weights((name, w)): - # Load vision-language adapter weights directly - trimmed_name = ".".join(name.split(".")[1:]) - param = adapter_dict[trimmed_name] - with torch.no_grad(): - default_weight_loader(param, w) - else: - # LLM weights: yield them to be loaded - # by language_model.load_weights - assert name.startswith("language_model") - trimmed_name = ".".join(name.split(".")[1:]) - yield (trimmed_name, w) + # Separate weights by component + llm_weights = [] + vision_weights = [] - # Now we call the language model load with the generator - self.language_model.load_weights(llm_weights_generator()) + for name, w in weights: + if is_llm(name): + # Strip 'language_model.' 
prefix for LLM weights + llm_weights.append((".".join(name.split(".")[1:]), w)) + elif is_adapter_weights((name, w)): + # Load vision-language adapter weights directly + trimmed_name = ".".join(name.split(".")[1:]) + param = adapter_dict[trimmed_name] + with torch.no_grad(): + default_weight_loader(param, w) + elif is_vision_weights(name): + # Convert: vision_model.radio_model.* → radio_model.* + hf_key = name[len( + "vision_model."):] # Remove "vision_model." prefix + vision_weights.append((hf_key, w)) + + self.language_model.load_weights(llm_weights) + self.vision_model.load_weights(vision_weights) def print_architecture(self, detailed: bool = True, @@ -1370,6 +1348,30 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, }, } + def get_vit_model_from_radio_config(self, hf_config): + hf_config_vision = hf_config.vision_config + model_name = hf_config_vision.args.get("model") + if model_name is None: + raise ValueError(f'Unsupported vit model type: {model_name}') + + preferred_resolution = getattr(hf_config_vision, + "preferred_resolution", None) + image_size = preferred_resolution[0] if preferred_resolution else 224 + patch_size = getattr(hf_config_vision, "patch_size", 16) + + radio_config = RadioConfig( + model_name=model_name, + image_size=image_size, + patch_size=patch_size, + norm_mean=hf_config.norm_mean, + norm_std=hf_config.norm_std, + reg_tokens=(hf_config_vision.args.get("register_multiple") + if hasattr(hf_config_vision, "args") + and isinstance(hf_config_vision.args, dict) else None), + ) + + return RadioModel(config=radio_config) + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): return self.language_model.mamba_cache.copy_inputs_before_cuda_graphs( input_buffers, **kwargs) diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py new file mode 100644 index 0000000000000..9cbf844ae9f86 --- /dev/null +++ b/vllm/model_executor/models/radio.py @@ -0,0 +1,576 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
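Feature extraction with this module, condensed from the test above into a
sketch (the backbone name is one of the entries in VIT_TIMM_DIM_BY_NAME;
reg_tokens=8 is illustrative, and vLLM's distributed state must already be
initialized, as the test's dist_init fixture arranges):

    import torch

    from vllm.model_executor.models.radio import RadioModel
    from vllm.transformers_utils.configs.radio import RadioConfig

    config = RadioConfig(model_name="vit_huge_patch16_224", reg_tokens=8)
    model = RadioModel(config).eval()
    with torch.inference_mode():
        # (batch, 3, H, W); H and W must be multiples of the patch size
        features = model(pixel_values=torch.randn(1, 3, 432, 640))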
+ +import math +from collections.abc import Iterable +from itertools import repeat +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import PretrainedConfig + +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.intern_vit import InternVisionEncoder + +input_dim_t = Union[int, tuple[int, int]] +norm_t = Union[tuple[float, float, float], torch.Tensor] + + +def _ntuple(n): + + def parse(x): + if isinstance(x, Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +class InputConditioner(nn.Module): + + def __init__( + self, + input_scale: float, + norm_mean: norm_t, + norm_std: norm_t, + dtype: torch.dtype = None, + ): + super().__init__() + + self.dtype = dtype + + self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale) + self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale) + + def forward(self, x: torch.Tensor): + y = (x - self.norm_mean) / self.norm_std + if self.dtype is not None: + y = y.to(self.dtype) + return y + + +def _to_tensor(v: norm_t): + return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1) + + +class ClsToken(nn.Module): + + def __init__( + self, + ndim: int, + num_tokens: int = 1, + enabled: bool = True, + register_multiple: Optional[int] = None, + num_registers: Optional[int] = None, + ): + super().__init__() + + self.ndim = ndim + self.enabled = enabled + self.num_registers = 0 + self.num_tokens = num_tokens + if enabled: + if num_registers: + self.num_registers = num_registers + elif register_multiple: + self.num_registers = register_multiple - (num_tokens % + register_multiple) + + scale = ndim**-0.5 + self.token = nn.Parameter( + torch.randn(num_tokens + self.num_registers, ndim) * scale) + + else: + self.token = None + + self.num_patches = self.num_tokens + self.num_registers + + def forward(self, x: torch.Tensor): + if self.token is None: + return x + + token = self.token.unsqueeze(0).expand(x.shape[0], -1, -1) + x = torch.cat([ + token, + x, + ], dim=1) + + return x + + +class ViTPatchGenerator(nn.Module): + + def __init__( + self, + # config: PretrainedConfig, + patch_size: int, + embed_dim: int, + input_dims: input_dim_t, + abs_pos: bool = True, + normalize_patches: bool = False, + cls_token: bool = False, + max_input_dims: Optional[input_dim_t] = None, + pos_dropout: float = 0.0, + return_pos_enc: bool = False, + num_cls_tokens: int = 1, + register_multiple: Optional[int] = None, + num_registers: Optional[int] = None, + patch_bias: bool = False, + device=None, + dtype=None, + ): + super().__init__() + if isinstance(input_dims, int): + input_dims = (input_dims, input_dims) + + if max_input_dims is None: + max_input_dims = input_dims + if isinstance(max_input_dims, int): + max_input_dims = (max_input_dims, max_input_dims) + + max_input_dims = tuple( + int(math.ceil(d / patch_size) * patch_size) + for d in max_input_dims) + + self.cpe_mode = max_input_dims != input_dims + self.pos_dropout = pos_dropout + self.return_pos_enc = return_pos_enc + + factory = dict(device=device, dtype=dtype) + + self.patch_size = patch_size + self.abs_pos = abs_pos + self.embed_dim = embed_dim + + self.num_rows = max_input_dims[0] // patch_size + 
self.num_cols = max_input_dims[1] // patch_size + self.input_dims = tuple(d // patch_size for d in input_dims) + self.num_patches = self.num_rows * self.num_cols + self.max_input_dims = max_input_dims + + self.im_to_patches = Im2Patches(patch_size) + self.embedder = ViTPatchLinear(patch_size, + embed_dim, + bias=patch_bias, + **factory) + + if abs_pos: + scale = embed_dim**-0.5 + self.pos_embed = nn.Parameter( + torch.randn(1, self.num_patches, embed_dim, **factory) * scale) + + self.cls_token = ClsToken( + embed_dim, + num_tokens=num_cls_tokens, + enabled=cls_token, + register_multiple=register_multiple, + num_registers=num_registers, + ) + + self.patch_normalizer = nn.LayerNorm( + embed_dim) if normalize_patches else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + patches = self.embed_patches(x) + patches, pos_enc = self.apply_pos_enc(patches, input_size=x.shape[2:]) + patches = self.cls_token(patches) + patches = self.patch_normalizer(patches) + if self.return_pos_enc: + return patches, pos_enc + return patches + + @property + def apply_cls_token(self): + return self.cls_token.enabled + + @property + def num_cls_tokens(self): + return self.cls_token.num_tokens + + @property + def num_cls_patches(self): + return self.cls_token.num_patches + + @property + def num_registers(self): + return self.cls_token.num_registers + + @property + def num_skip(self): + return self.num_cls_tokens + self.num_registers + + def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter): + if src_embed.shape != targ_embed.shape: + src_size = int(math.sqrt(src_embed.shape[1])) + + assert src_size**2 == src_embed.shape[ + 1], 'Unable to interpolate non-square embedding' + + src_embed = rearrange(src_embed, + 'b (h w) c -> b c h w', + h=src_size, + w=src_size) + src_embed = F.interpolate(src_embed, + size=(self.num_rows, self.num_cols), + mode='bicubic', + align_corners=True, + antialias=False) + src_embed = rearrange(src_embed, 'b c h w -> b (h w) c') + targ_embed.data.copy_(src_embed) + + def _load_projection(self, src_proj_weight: torch.Tensor, + targ_proj_weight: torch.Tensor): + if src_proj_weight.shape != targ_proj_weight.shape: + src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3)) + + assert (src_patch_size**2) * 3 == src_proj_weight.shape[ + 1], 'Unable to interpolate non-square patch size' + + src_proj_weight = rearrange(src_proj_weight, + 'b (c h w) -> b c h w', + c=3, + h=src_patch_size, + w=src_patch_size) + src_proj_weight = F.interpolate(src_proj_weight, + size=(self.patch_size, + self.patch_size), + mode='bicubic', + align_corners=True, + antialias=False) + src_proj_weight = rearrange(src_proj_weight, + 'b c h w -> b (c h w)') + targ_proj_weight.data.copy_(src_proj_weight) + + def embed_patches(self, x: torch.Tensor) -> torch.Tensor: + patches = self.im_to_patches(x) + patches = self.embedder(patches) + return patches + + def apply_pos_enc( + self, + patches: torch.Tensor, + patch_idxs: Optional[torch.Tensor] = None, + input_size: Optional[tuple[int, int]] = None, + ) -> torch.Tensor: + if not self.abs_pos: + return patches + + pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size) + + if self.training and self.pos_dropout > 0: + keeps = torch.rand(patches.shape[0], + 1, + 1, + dtype=pos_enc.dtype, + device=pos_enc.device) > self.pos_dropout + pos_enc_drop = torch.where(keeps, pos_enc, 0) + else: + pos_enc_drop = pos_enc + + return patches + pos_enc_drop, pos_enc + + def get_pos_enc( + self, + batch_size: int, + patch_idxs: 
Optional[torch.Tensor] = None, + input_size: Optional[tuple[int, int]] = None, + ) -> torch.Tensor: + if input_size is None: + input_dims = self.input_dims + else: + input_dims = tuple(d // self.patch_size for d in input_size) + + pos_embed = self._get_pos_embeddings(batch_size, input_dims) + + if patch_idxs is None: + return pos_embed + + exp_patch_idxs = patch_idxs.unsqueeze(-1).expand( + -1, -1, pos_embed.shape[-1]) + + pos_embed = torch.gather(pos_embed.expand(patch_idxs.shape[0], -1, -1), + dim=1, + index=exp_patch_idxs) + return pos_embed + + def _get_pos_embeddings(self, batch_size: int, input_dims: tuple[int, + int]): + if (self.num_rows, self.num_cols) == input_dims: + return self.pos_embed + + pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, + -1).permute(0, 3, 1, 2) + + def window_select(pos_embed): + if input_dims[0] < pos_embed.shape[-2]: + pos_embed = pos_embed[..., :input_dims[0], :] + if input_dims[1] < pos_embed.shape[-1]: + pos_embed = pos_embed[..., :, :input_dims[1]] + return pos_embed + + if self.cpe_mode: + if self.training: + min_scale = math.sqrt(0.1) + scale = torch.rand(batch_size, 1, 1, device=pos_embed.device + ) * (1 - min_scale) + min_scale + aspect_min = math.log(3 / 4) + aspect_max = -aspect_min + aspect = torch.exp( + torch.rand(batch_size, 1, 1, device=pos_embed.device) * + (aspect_max - aspect_min) + aspect_min) + + scale_x = scale * aspect + scale_y = scale * (1 / aspect) + scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1) + + pos_xy = torch.rand( + batch_size, 1, 1, 2, + device=pos_embed.device) * (1 - scale_xy) + + lin_x = torch.linspace( + 0, 1, steps=input_dims[1], + device=pos_embed.device)[None, None].expand( + batch_size, input_dims[0], -1) + lin_y = torch.linspace( + 0, 1, steps=input_dims[0], + device=pos_embed.device)[None, :, None].expand( + batch_size, -1, input_dims[1]) + + lin_xy = torch.stack([lin_x, lin_y], dim=-1) + + grid_xy = lin_xy * scale_xy + pos_xy + + # Convert to [-1, 1] range + grid_xy.mul_(2).sub_(1) + + pos_embed = F.grid_sample( + pos_embed.float().expand(batch_size, -1, -1, -1), + grid=grid_xy, + mode='bilinear', + padding_mode='zeros', + align_corners=True, + ).to(pos_embed.dtype) + else: + max_dim = max(input_dims) + pos_embed = F.interpolate(pos_embed.float(), + size=(max_dim, max_dim), + align_corners=True, + mode='bilinear').to(pos_embed.dtype) + + pos_embed = window_select(pos_embed) + else: + pos_embed = window_select(pos_embed) + + if pos_embed.shape[-2:] != input_dims: + pos_embed = F.interpolate(pos_embed.float(), + size=input_dims, + align_corners=True, + mode='bilinear').to(pos_embed.dtype) + + pos_embed = pos_embed.flatten(2).permute(0, 2, 1) + + return pos_embed + + +class Im2Patches(nn.Module): + + def __init__(self, patch_size: int): + super().__init__() + self.patch_size = patch_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.patch_size == 1: + patches = x.flatten(2) + patches = patches.permute(0, 2, 1) + return patches + + py = x.shape[-2] // self.patch_size + px = x.shape[-1] // self.patch_size + patches = rearrange( + x, + 'b c (py yy) (px xx) -> b (py px) (c yy xx)', + py=py, + yy=self.patch_size, + px=px, + xx=self.patch_size, + ) + return patches + + +class ViTPatchLinear(nn.Linear): + + def __init__(self, + patch_size: int, + embed_dim: int, + bias: bool = False, + **factory): + super().__init__(3 * (patch_size**2), embed_dim, bias=bias, **factory) + self.patch_size = patch_size + + +class RadioInternVisionModel(nn.Module): + 
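    # Note on the patch generator above: position embeddings are stored once
    # for the max_img_size grid (2048 / 16 = 128 x 128 positions by default)
    # and bilinearly resampled in _get_pos_embeddings to the incoming grid.
    # For example, a 432x640 input yields a 27x40 patch grid, so pos_embed
    # goes from (1, 128*128, C) to (1, 27*40, C) before being added to the
    # patch tokens.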
packed_modules_mapping = { + "qkv": ["qkv"], + } + + def __init__( + self, + config: PretrainedConfig = None, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + num_dummy_heads: int = 0, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.img_size, self.grid_size, self.num_patches = self._init_img_size( + to_2tuple(config.patch_size), config.image_size) + max_img_size = int( + round(config.max_img_size / config.patch_size) * config.patch_size) + self.patch_generator = ViTPatchGenerator( + config.patch_size, + config.hidden_size, + input_dims=self.img_size, + max_input_dims=max_img_size, + cls_token=True, + register_multiple=config.reg_tokens) + + self.encoder = InternVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.encoder", + ) + + def _init_img_size(self, patch_size, img_size: Union[int, tuple[int, + int]]): + if img_size is None: + return None, None, None + img_size = to_2tuple(img_size) + grid_size = tuple([s // p for s, p in zip(img_size, patch_size)]) + num_patches = grid_size[0] * grid_size[1] + return img_size, grid_size, num_patches + + def get_input_embeddings(self): + return self.embeddings + + def forward(self, x: torch.Tensor) -> torch.FloatTensor: + assert self.patch_generator is not None + hidden_states = self.patch_generator(x) + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + return encoder_outputs + + +class RadioModel(nn.Module): + packed_modules_mapping = { + "qkv": ["qkv"], + } + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + num_dummy_heads: int = 0, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.input_conditioner = InputConditioner( + input_scale=1.0, + norm_mean=config.norm_mean, + norm_std=config.norm_std, + ) + self.model = RadioInternVisionModel( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + num_dummy_heads=num_dummy_heads, + prefix=prefix) + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + x = self.input_conditioner(pixel_values) + y = self.model(x) + return self._extract_final(y) + + def load_weights(self, weights) -> set[str]: + loaded_params: set[str] = set() + params_dict = dict(self.named_parameters()) + + if isinstance(weights, dict): + weights_list = list(weights.items()) + else: + weights_list = list(weights) + + for name, weight in weights_list: + if not name.startswith("radio_model."): + # Skip non-radio weights + continue + + sub = name[len("radio_model."):] # drop "radio_model." prefix + + # Skip buffers not used in vLLM + if sub in {"summary_idxs"}: + continue + + vllm_key = None + if sub.startswith("model.patch_generator."): + vllm_key = f"model.patch_generator.{sub.split('.', 2)[-1]}" + elif sub.startswith("input_conditioner."): + vllm_key = f"input_conditioner.{sub.split('.', 1)[-1]}" + elif sub.startswith("model.blocks."): + # Encoder blocks: HF 'model.blocks.{i}.' -> + # vLLM 'model.encoder.layers.{i}.' 
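                # e.g. "radio_model.model.blocks.3.attn.qkv.weight"
                #     -> "model.encoder.layers.3.attn.qkv.weight"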
+ parts = sub.split(".") + if len(parts) >= 4: + layer_idx = parts[2] + suffix = ".".join(parts[3:]) + # Skip layer-scale entries that vLLM doesn't use + if suffix in {"ls1", "ls2"} or suffix.startswith( + ("ls1.", "ls2.")): + continue + vllm_key = f"model.encoder.layers.{layer_idx}.{suffix}" + + if vllm_key and vllm_key in params_dict: + param = params_dict[vllm_key] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(vllm_key) + + return loaded_params + + def _extract_final(self, y: torch.Tensor): + # Remove CLS + REGISTERS tokens + patch_gen = getattr(self.model, "patch_generator", None) + if patch_gen is not None: + all_feat = y[:, patch_gen.num_skip:] + + return all_feat diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ca0d5def760a8..91bfeb8c55ee5 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -26,6 +26,7 @@ from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.olmo3 import Olmo3Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig +from vllm.transformers_utils.configs.radio import RadioConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, @@ -48,6 +49,7 @@ __all__ = [ "Nemotron_Nano_VL_Config", "Olmo3Config", "OvisConfig", + "RadioConfig", "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py new file mode 100644 index 0000000000000..58ad7b8187bcd --- /dev/null +++ b/vllm/transformers_utils/configs/radio.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Radio vision model configuration""" + +from typing import Optional, Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = { + "vit_small_patch16_224": (384, 12, 6, 1536), + "vit_base_patch16_224": (768, 12, 12, 3072), + "vit_large_patch16_224": (1024, 24, 16, 4096), + "vit_huge_patch16_224": (1280, 32, 16, 5120), +} + +OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711) + + +class RadioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a Radio + vision model. It is used to instantiate a Radio model according to the + specified arguments, defining the model architecture. + + Args: + model_name (`str`, *optional*, defaults to "vit_base_patch16_224"): + Name of the vision transformer model (e.g., "vit_base_patch16_224"). + Used to determine architecture dimensions from + `VIT_TIMM_DIM_BY_NAME`. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + qkv_bias (`bool`, *optional*, defaults to True): + Whether to add a bias to the queries, keys and values. + qk_normalization (`bool`, *optional*, defaults to False): + Whether to apply normalization to queries and keys. 
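            (For orientation, a sketch of the dimension lookup this config
            performs, with an assumed model name:
            VIT_TIMM_DIM_BY_NAME["vit_base_patch16_224"] yields
            hidden_size=768, num_hidden_layers=12, num_attention_heads=12,
            intermediate_size=3072.)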
+ norm_type (`str`, *optional*, defaults to "layer_norm"): + The normalization type to use. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices. + hidden_act (`str`, *optional*, defaults to "gelu"): + The non-linear activation function in the encoder. + max_img_size (`int`, *optional*, defaults to 2048): + Maximum image size for position embeddings. + norm_mean (`tuple` or `list`, *optional*, + defaults to (0.48145466, 0.4578275, 0.40821073)): + Mean values for image normalization (RGB channels). + norm_std (`tuple` or `list`, *optional*, + defaults to (0.26862954, 0.26130258, 0.27577711)): + Standard deviation values for image normalization (RGB channels). + reg_tokens (`int`, *optional*): + Number of register tokens to use. + """ + + model_type = "radio" + + def __init__( + self, + model_name: str, + image_size: int = 224, + patch_size: int = 16, + qkv_bias: bool = True, + qk_normalization: bool = False, + norm_type: str = "layer_norm", + layer_norm_eps: float = 1e-6, + initializer_factor: float = 1.0, + hidden_act: str = "gelu", + max_img_size: int = 2048, + norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN, + norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD, + reg_tokens: Optional[int] = None, + **kwargs, + ): + self.model_name = model_name + ( + self.hidden_size, + self.num_hidden_layers, + self.num_attention_heads, + self.intermediate_size, + ) = VIT_TIMM_DIM_BY_NAME[model_name] + self.image_size = image_size + self.patch_size = patch_size + self.qkv_bias = qkv_bias + self.qk_normalization = qk_normalization + self.norm_type = norm_type + self.layer_norm_eps = layer_norm_eps + self.initializer_factor = initializer_factor + self.hidden_act = hidden_act + self.max_img_size = max_img_size + self.norm_mean = list(norm_mean) if isinstance(norm_mean, + (tuple, + list)) else norm_mean + self.norm_std = list(norm_std) if isinstance(norm_std, + (tuple, + list)) else norm_std + self.reg_tokens = reg_tokens + super().__init__(**kwargs) From 9fccd04e308b0b8a625dd78b7dfa4feed8131102 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Wed, 17 Sep 2025 20:54:02 +0800 Subject: [PATCH 360/584] [Bugfix] Fix Stream usage in CPU model runner and OneDNN kernel check (#25046) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- csrc/cpu/dnnl_kernels.cpp | 2 +- vllm/platforms/cpu.py | 5 +++++ vllm/v1/worker/cpu_model_runner.py | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp index 9a3af4ac9d8a6..1c42a75bc2d61 100644 --- a/csrc/cpu/dnnl_kernels.cpp +++ b/csrc/cpu/dnnl_kernels.cpp @@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major CPU_KERNEL_GUARD_IN(onednn_mm) TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.stride(-1) == 1); - TORCH_CHECK(c.is_contiguous()); + TORCH_CHECK(c.stride(-1) == 1); MatMulPrimitiveHandler* ptr = reinterpret_cast<MatMulPrimitiveHandler*>(handler); diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c5b6d91a62b6d..544e091491bf5 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -185,6 +185,11 @@ class CpuPlatform(Platform): parallel_config.distributed_executor_backend = "mp" if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker" + # Disable DBO + if parallel_config.enable_dbo: + 
logger.warning( + "Dual-Batch Overlap is not supported on CPU, disabled.") + parallel_config.enable_dbo = False # Note: workaround for v1 gpu_model_runner from vllm.config import CompilationLevel diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index ccdbeac64bce0..cd0f0af43e7e7 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -145,12 +145,20 @@ def _torch_cuda_wrapper(): self.record = lambda: None self.synchronize = lambda: None + class _StreamPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + pass + cuda_event = torch.cuda.Event + cuda_stream = torch.cuda.Stream try: torch.cuda.Event = _EventPlaceholder + torch.cuda.Stream = _StreamPlaceholder yield finally: torch.cuda.Event = cuda_event + torch.cuda.Stream = cuda_stream @contextmanager From bfe93801614b73ee5b4ac8ff65f977686a674bf2 Mon Sep 17 00:00:00 2001 From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:15:42 +0400 Subject: [PATCH 361/584] Apply fixes for CUDA 13 (#24599) Signed-off-by: Aidyn-A <aidyn.b.aitzhan@gmail.com> --- CMakeLists.txt | 10 ++++++++++ csrc/cub_helpers.h | 17 +++++++++++++++++ csrc/layernorm_kernels.cu | 13 ++++--------- csrc/layernorm_quant_kernels.cu | 13 ++++--------- csrc/moe/topk_softmax_kernels.cu | 16 +++------------- .../compressed_tensors/int8_quant_kernels.cu | 11 ++--------- csrc/quantization/fp8/common.cu | 9 ++------- .../fused_kernels/layernorm_utils.cuh | 14 +++++--------- 8 files changed, 47 insertions(+), 56 deletions(-) create mode 100644 csrc/cub_helpers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 009c224dc7735..c48da948a0298 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() +# +# Set CUDA include flags for CXX compiler. +# +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include") + if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl") + endif() +endif() + # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. diff --git a/csrc/cub_helpers.h b/csrc/cub_helpers.h new file mode 100644 index 0000000000000..470a63a22cab0 --- /dev/null +++ b/csrc/cub_helpers.h @@ -0,0 +1,17 @@ +#pragma once + +#ifndef USE_ROCM + #include <cub/cub.cuh> + #if CUB_VERSION >= 200800 + #include <cuda/std/functional> +using CubAddOp = cuda::std::plus<>; +using CubMaxOp = cuda::maximum<>; + #else // if CUB_VERSION < 200800 +using CubAddOp = cub::Sum; +using CubMaxOp = cub::Max; + #endif // CUB_VERSION +#else + #include <hipcub/hipcub.hpp> +using CubAddOp = cub::Sum; +using CubMaxOp = cub::Max; +#endif // USE_ROCM diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 05be023de0f28..93c73d58390e1 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -1,15 +1,10 @@ #include "type_convert.cuh" #include "dispatch_utils.h" +#include "cub_helpers.h" #include <torch/cuda.h> #include <c10/cuda/CUDAGuard.h> -#ifndef USE_ROCM - #include <cub/cub.cuh> -#else - #include <hipcub/hipcub.hpp> -#endif - namespace vllm { // TODO(woosuk): Further optimize this kernel. 
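// Illustrative only: how the CubAddOp alias from csrc/cub_helpers.h is meant
// to be used. The 256-thread block size here is an assumption for the
// sketch, not a value taken from this patch.
__global__ void block_sum_demo(const float* __restrict__ in,
                               float* __restrict__ out) {
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  float v = in[blockIdx.x * blockDim.x + threadIdx.x];
  // Same call shape on CUDA 12 and 13: the alias hides cub::Sum vs
  // cuda::std::plus<>.
  float sum = BlockReduce(tmp).Reduce(v, CubAddOp{}, blockDim.x);
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}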
@@ -30,7 +25,7 @@ __global__ void rms_norm_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -85,7 +80,7 @@ fused_add_rms_norm_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -126,7 +121,7 @@ fused_add_rms_norm_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index 0fd5849d9626c..be134089bd6d4 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -8,16 +8,11 @@ #include "type_convert.cuh" #include "quantization/fp8/common.cuh" #include "dispatch_utils.h" +#include "cub_helpers.h" #include <torch/cuda.h> #include <c10/cuda/CUDAGuard.h> -#ifndef USE_ROCM - #include <cub/cub.cuh> -#else - #include <hipcub/hipcub.hpp> -#endif - namespace vllm { // TODO(woosuk): Further optimize this kernel. @@ -39,7 +34,7 @@ __global__ void rms_norm_static_fp8_quant_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -100,7 +95,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -149,7 +144,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index cd80bfda7dfde..53573ada86ba9 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -20,17 +20,7 @@ #include <ATen/cuda/CUDAContext.h> #include <c10/cuda/CUDAGuard.h> #include "../cuda_compat.h" - -#ifndef USE_ROCM - #include <cub/util_type.cuh> - #include <cub/cub.cuh> - #include <cuda/std/functional> - using AddOp = cuda::std::plus<float>; -#else - #include <hipcub/util_type.hpp> - #include <hipcub/hipcub.hpp> - using AddOp = cub::Sum; -#endif +#include 
"../cub_helpers.h" #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -79,7 +69,7 @@ __launch_bounds__(TPB) __global__ threadData = max(static_cast<float>(input[idx]), threadData); } - const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max()); + const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp()); if (threadIdx.x == 0) { float_max = maxElem; @@ -94,7 +84,7 @@ __launch_bounds__(TPB) __global__ threadData += exp((static_cast<float>(input[idx]) - float_max)); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp()); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp()); if (threadIdx.x == 0) { diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index d8369108d0bd3..bcfde9fbcbbef 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -7,17 +7,10 @@ #include <cmath> +#include "../../cub_helpers.h" #include "../../dispatch_utils.h" #include "../vectorization_utils.cuh" -#ifndef USE_ROCM - #include <cub/cub.cuh> - #include <cub/util_type.cuh> -#else - #include <hipcub/hipcub.hpp> - #include <hipcub/util_type.hpp> -#endif - static inline __device__ int8_t float_to_int8_rn(float x) { #ifdef USE_ROCM static constexpr auto i8_min = @@ -173,7 +166,7 @@ __global__ void dynamic_scaled_int8_quant_kernel( }); using BlockReduce = cub::BlockReduce<float, 256>; __shared__ typename BlockReduce::TempStorage tmp; - float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); + float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x); __shared__ float absmax; if (tid == 0) { absmax = block_max; diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 5fe5dd04bd891..45d6d5082ce49 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,15 +1,10 @@ #include "common.cuh" #include "dispatch_utils.h" +#include "../../cub_helpers.h" #include "../vectorization_utils.cuh" #include <c10/cuda/CUDAGuard.h> #include <ATen/cuda/Exceptions.h> -#ifndef USE_ROCM - #include <cub/cub.cuh> -#else - #include <hipcub/hipcub.hpp> -#endif - namespace vllm { template <typename scalar_t, typename fp8_type> @@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided( using BlockReduce = cub::BlockReduce<float, 256>; __shared__ typename BlockReduce::TempStorage tmp; const float block_max = - BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x); + BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x); __shared__ float token_scale; if (tid == 0) { diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index 3f188872d80d3..2d2fd771205c7 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -8,11 +8,7 @@ #include "quantization/utils.cuh" #include "quant_conversions.cuh" -#ifndef USE_ROCM - #include <cub/cub.cuh> -#else - #include <hipcub/hipcub.hpp> -#endif +#include "../../cub_helpers.h" namespace vllm { @@ -36,7 +32,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + ss = 
BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x); __shared__ float s_rms; if (threadIdx.x == 0) { @@ -73,7 +69,7 @@ __device__ void compute_dynamic_per_token_scales( __shared__ typename BlockReduce::TempStorage reduceStore; block_absmax_val_maybe = BlockReduce(reduceStore) - .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); __shared__ float s_token_scale; if (threadIdx.x == 0) { @@ -169,7 +165,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, using BlockReduce = cub::BlockReduce<float, 1024>; __shared__ typename BlockReduce::TempStorage reduceStore; - ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x); __shared__ float s_rms; if (threadIdx.x == 0) { @@ -240,7 +236,7 @@ __device__ void compute_dynamic_per_token_scales( __shared__ typename BlockReduce::TempStorage reduceStore; block_absmax_val_maybe = BlockReduce(reduceStore) - .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); __shared__ float s_token_scale; if (threadIdx.x == 0) { From 1b962e24577dddc0d7441ae0b06392e1f9262a51 Mon Sep 17 00:00:00 2001 From: dolpm <34420038+dolpm@users.noreply.github.com> Date: Wed, 17 Sep 2025 06:22:25 -0700 Subject: [PATCH 362/584] [fix] lora benchmarks pass no_lora_flag_cpu (#23774) Signed-off-by: Dylan Maloy <34420038+dolpm@users.noreply.github.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- benchmarks/kernels/benchmark_lora.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 89309c79f0991..debb29744bfaa 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -464,7 +464,11 @@ class BenchmarkTensors: for field_name in LoRAKernelMeta.__dataclass_fields__: field = getattr(self.lora_kernel_meta, field_name) assert isinstance(field, torch.Tensor) - setattr(self.lora_kernel_meta, field_name, to_device(field)) + setattr( + self.lora_kernel_meta, + field_name, + to_device(field) if field_name != "no_lora_flag_cpu" else field, + ) def metadata(self) -> tuple[int, int, int]: """ @@ -512,6 +516,7 @@ class BenchmarkTensors: "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, "lora_ids": self.lora_kernel_meta.active_lora_ids, "scaling": 1.0, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: @@ -552,6 +557,7 @@ class BenchmarkTensors: "lora_ids": self.lora_kernel_meta.active_lora_ids, "offset_start": 0, "add_inputs": add_inputs, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def bench_fn_kwargs( From dd6a910aac6504ec7fc3e50df36b08e476f1b80f Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Wed, 17 Sep 2025 21:59:09 +0800 Subject: [PATCH 363/584] [Bugfix][Qwen3-Next] fixes the varlen issue in qwen3-next's MTP implementation. 
(#24957) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> --- .../layers/mamba/ops/causal_conv1d.py | 132 +++++++++++++++--- vllm/model_executor/models/qwen3_next.py | 10 +- vllm/v1/attention/backends/gdn_attn.py | 31 ++-- 3 files changed, 139 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index a0478a359f91b..7e3ea561fd293 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -626,6 +626,7 @@ def _causal_conv1d_update_kernel( cache_seqlens_ptr, # circular buffer conv_state_indices_ptr, num_accepted_tokens_ptr, + query_start_loc_ptr, # (batch + 1) o_ptr, # (batch, dim, seqlen) # Matrix dimensions batch: int, @@ -652,6 +653,7 @@ def _causal_conv1d_update_kernel( HAS_BIAS: tl.constexpr, KERNEL_WIDTH: tl.constexpr, SILU_ACTIVATION: tl.constexpr, + IS_VARLEN: tl.constexpr, IS_CONTINUOUS_BATCHING: tl.constexpr, IS_SPEC_DECODING: tl.constexpr, NP2_STATELEN: tl.constexpr, @@ -678,6 +680,25 @@ def _causal_conv1d_update_kernel( # not processing as this is not the actual sequence return + if IS_VARLEN: + query_start_index = tl.load(query_start_loc_ptr + idx_seq).to(tl.int64) + query_end_index = tl.load(query_start_loc_ptr + (idx_seq + 1)).to( + tl.int64) + # revise state_len and seqlen + state_len = state_len - (seqlen - + (query_end_index - query_start_index)) + seqlen = query_end_index - query_start_index + x_offset = query_start_index * stride_x_token + o_offset = query_start_index * stride_o_token + else: + query_start_index = idx_seq * seqlen + query_end_index = query_start_index + seqlen + x_offset = idx_seq * stride_x_seq + o_offset = idx_seq * stride_o_seq + + if query_start_index == query_end_index: + return + if IS_SPEC_DECODING: # The rolling of conv state: # @@ -692,8 +713,8 @@ def _causal_conv1d_update_kernel( # - accept 1 tokens: [history2, ..., historyM, draft1] # - accept 2 tokens: [history3, ..., historyM, draft1, draft2] # - and so on. 
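# A pure-Python sketch of the rolling described above, assuming a state of
# length M=4 and two draft tokens; the names are illustrative only.
#     state = ["h1", "h2", "h3", "h4"]
#     drafts = ["d1", "d2"]
#     num_accepted = 1
#     state[num_accepted:] + drafts[:num_accepted]
#     # -> ["h2", "h3", "h4", "d1"]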
- conv_state_token_offset = (tl.load(num_accepted_tokens_ptr + idx_seq) - - 1) + conv_state_token_offset = ( + tl.load(num_accepted_tokens_ptr + idx_seq).to(tl.int64) - 1) else: conv_state_token_offset = 0 @@ -713,9 +734,12 @@ def _causal_conv1d_update_kernel( if KERNEL_WIDTH >= 4: conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok # [BLOCK_N] col2 = tl.load(conv_states_ptrs, mask_w, 0.0) - if KERNEL_WIDTH == 5: + if KERNEL_WIDTH >= 5: conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok # [BLOCK_N] col3 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 6: + conv_states_ptrs = prior_tokens + 4 * stride_conv_state_tok # [BLOCK_N] + col4 = tl.load(conv_states_ptrs, mask_w, 0.0) # STEP 2: assume state_len > seqlen idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] @@ -735,8 +759,7 @@ def _causal_conv1d_update_kernel( conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0) VAL = state_len - seqlen - x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim - ) # [BLOCK_N] + x_base = x_ptr + x_offset + (idx_feats * stride_x_dim) # [BLOCK_N] x_ptrs = x_base[None, :] + ( (idx_tokens - VAL) * stride_x_token)[:, None] # [BLOCK_M, BLOCK_N] @@ -782,12 +805,18 @@ def _causal_conv1d_update_kernel( if KERNEL_WIDTH >= 4: w_ptrs = w_base + (3 * stride_w_width) # [BLOCK_N] tensor w_col3 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 5: + w_ptrs = w_base + (4 * stride_w_width) # [BLOCK_N] tensor + w_col4 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 6: + w_ptrs = w_base + (5 * stride_w_width) # [BLOCK_N] tensor + w_col5 = tl.load(w_ptrs, mask_w, other=0.0) x_base_1d = x_base # starting of chunk [BLOCK_N] mask_x_1d = idx_feats < dim # STEP 5: compute each token - for idx_token in tl.static_range(seqlen): + for idx_token in tl.range(seqlen): acc = acc_preload matrix_w = w_col0 @@ -817,6 +846,37 @@ def _causal_conv1d_update_kernel( matrix_w = w_col3 x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 5: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + matrix_x = col3 + elif j == 4: + matrix_w = w_col4 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 6: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + matrix_x = col3 + elif j == 4: + matrix_w = w_col4 + matrix_x = col4 + elif j == 5: + matrix_w = w_col5 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) acc += matrix_x * matrix_w # [BLOCK_N] @@ -829,14 +889,24 @@ def _causal_conv1d_update_kernel( col0 = col1 col1 = col2 col2 = matrix_x + elif KERNEL_WIDTH == 5: + col0 = col1 + col1 = col2 + col2 = col3 + col3 = matrix_x + elif KERNEL_WIDTH == 6: + col0 = col1 + col1 = col2 + col2 = col3 + col3 = col4 + col4 = matrix_x if SILU_ACTIVATION: acc = acc / (1 + tl.exp(-acc)) mask_1d = (idx_token < seqlen) & (idx_feats < dim ) # token-index # feature-index - o_ptrs = o_ptr + ( - idx_seq) * stride_o_seq + idx_token * stride_o_token + ( - idx_feats * stride_o_dim) + o_ptrs = o_ptr + o_offset + idx_token * stride_o_token + (idx_feats * + stride_o_dim) tl.store(o_ptrs, acc, mask=mask_1d) @@ -850,14 +920,18 @@ def causal_conv1d_update( cache_seqlens: Optional[torch.Tensor] = None, conv_state_indices: 
Optional[torch.Tensor] = None, num_accepted_tokens: Optional[torch.Tensor] = None, + query_start_loc: Optional[torch.Tensor] = None, + max_query_len: int = -1, pad_slot_id: int = PAD_SLOT_ID, metadata=None, validate_data=False, ): """ - x: (batch, dim) or (batch, dim, seqlen) + x: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim) [shape=2: single token prediction] [shape=3: single or multiple tokens prediction] + [shape=2 with num_tokens: continuous batching, where num_tokens is the + total tokens of all sequences in that batch] conv_state: (..., dim, state_len), where state_len >= width - 1 weight: (dim, width) bias: (dim,) @@ -870,13 +944,24 @@ def causal_conv1d_update( If not None, the conv_state is a larger tensor along the batch dim, and we are selecting the batch coords specified by conv_state_indices. Useful for a continuous batching scenario. + num_accepted_tokens: (batch,), dtype int32 + If not None, it indicates the number of accepted tokens for each + sequence in the batch. + This is used in speculative decoding, where the conv_state is updated + in a sliding window manner. + query_start_loc: (batch + 1,) int32 + If not None, the inputs is given in a varlen fashion and this indicates + the starting index of each sequence in the batch. + max_query_len: int + If query_start_loc is not None, this indicates the maximum query + length in the batch. pad_slot_id: int if cache_indices is passed, lets the kernel identify padded entries that will not be processed, for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id] in this case, the kernel will not process entries at indices 0 and 3 - out: (batch, dim) or (batch, dim, seqlen) + out: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim), same shape as `x` """ if validate_data: assert cache_seqlens is None # not implemented yet - ok for vLLM @@ -886,11 +971,17 @@ def causal_conv1d_update( activation = "silu" if activation is True else None elif activation is not None: assert activation in ["silu", "swish"] - unsqueeze = x.dim() == 2 + unsqueeze = query_start_loc is None and x.dim() == 2 if unsqueeze: # make it (batch, dim, seqlen) with seqlen == 1 x = x.unsqueeze(-1) - batch, dim, seqlen = x.shape + if query_start_loc is None: + batch, dim, seqlen = x.shape + else: + assert conv_state_indices is not None + batch = conv_state_indices.size(0) + dim = x.size(1) + seqlen = max_query_len _, width = weight.shape # conv_state: (..., dim, state_len), where state_len >= width - 1 num_cache_lines, _, state_len = conv_state.size() @@ -916,10 +1007,17 @@ def causal_conv1d_update( out = x stride_w_dim, stride_w_width = weight.stride() - stride_x_seq, stride_x_dim, stride_x_token = x.stride( - ) # X (batch, dim, seqlen) + if query_start_loc is None: + # X (batch, dim, seqlen) + stride_x_seq, stride_x_dim, stride_x_token = x.stride() + stride_o_seq, stride_o_dim, stride_o_token = out.stride() + else: + # X (dim, cu_seqlen) + stride_x_token, stride_x_dim = x.stride() + stride_x_seq = 0 + stride_o_token, stride_o_dim = out.stride() + stride_o_seq = 0 - stride_o_seq, stride_o_dim, stride_o_token = out.stride() stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride( ) stride_state_indices = conv_state_indices.stride( @@ -945,6 +1043,7 @@ def causal_conv1d_update( cache_seqlens, conv_state_indices, num_accepted_tokens, + query_start_loc, out, # Matrix dimensions batch, @@ -971,6 +1070,7 @@ def causal_conv1d_update( HAS_BIAS=bias is not None, KERNEL_WIDTH=width, SILU_ACTIVATION=activation in ["silu", "swish"], + 
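# Illustration of the varlen layout these arguments describe, assuming a
# hypothetical batch of three sequences with 2, 1 and 3 tokens:
#     x.shape         == (6, dim)        # num_tokens = 2 + 1 + 3
#     query_start_loc == [0, 2, 3, 6]    # batch + 1 boundaries
#     max_query_len   == 3
# Sequence i occupies rows query_start_loc[i]:query_start_loc[i + 1] of x.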
IS_VARLEN=query_start_loc is not None, IS_CONTINUOUS_BATCHING=conv_state_indices is not None, IS_SPEC_DECODING=num_accepted_tokens is not None, NP2_STATELEN=np2_statelen, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 3c5407916c0bd..fe63e93032352 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -417,9 +417,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): self_kv_cache = self.kv_cache[forward_context.virtual_engine] conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] - num_actual_tokens = (attn_metadata.num_prefill_tokens + - attn_metadata.num_decode_tokens + - attn_metadata.num_spec_decode_tokens) + num_actual_tokens = attn_metadata.num_actual_tokens num_accepted_tokens = attn_metadata.num_accepted_tokens # 1. Set up dimensions for reshapes later @@ -458,9 +456,6 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # 2.1: process the mutli-query part if spec_sequence_masks is not None: - mixed_qkv_spec = mixed_qkv_spec.view( - attn_metadata.num_spec_decodes, -1, mixed_qkv_spec.size(-1)) - mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b l d -> b d l') mixed_qkv_spec = causal_conv1d_update( mixed_qkv_spec, conv_state, @@ -470,9 +465,10 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): conv_state_indices=spec_state_indices_tensor[:, 0] [:attn_metadata.num_spec_decodes], num_accepted_tokens=num_accepted_tokens, + query_start_loc=spec_query_start_loc, + max_query_len=spec_state_indices_tensor.size(-1), validate_data=False, ) - mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b d l -> (b l) d') # 2.2: process the remaining part if attn_metadata.num_prefills > 0: diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 74eb9ae9d3254..ba89f93e8b56d 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -31,6 +31,7 @@ class GDNAttentionMetadata: num_decode_tokens: int num_spec_decodes: int num_spec_decode_tokens: int + num_actual_tokens: int has_initial_state: Optional[torch.Tensor] = None @@ -74,8 +75,8 @@ class GDNAttentionMetadataBuilder( self.use_full_cuda_graph = \ self.compilation_config.cudagraph_mode.has_full_cudagraphs() self.decode_cudagraph_max_bs = min( - self.vllm_config.scheduler_config.max_num_seqs, - self.compilation_config.max_capture_size) + self.vllm_config.scheduler_config.max_num_seqs * + (self.num_spec + 1), self.compilation_config.max_capture_size) self.spec_state_indices_tensor = torch.empty( (self.decode_cudagraph_max_bs, self.num_spec + 1), @@ -194,9 +195,8 @@ class GDNAttentionMetadataBuilder( dim=0, out=non_spec_query_start_loc[1:]) - num_spec_decode_tokens = min( - num_spec_decodes * (self.num_spec + 1), - spec_token_masks.size(0)) + num_spec_decode_tokens = (query_lens.sum().item() - + num_prefill_tokens - num_decode_tokens) assert num_accepted_tokens is not None num_accepted_tokens = num_accepted_tokens[spec_sequence_masks] @@ -206,14 +206,22 @@ class GDNAttentionMetadataBuilder( has_initial_state = has_initial_state[~spec_sequence_masks] else: has_initial_state = None + num_actual_tokens = num_prefill_tokens + num_decode_tokens + \ + num_spec_decode_tokens # prepare tensors for cudagraph + # + # With speculative decoding, the xgrammar backend may rollback tokens + # and causing some sequences has less draft tokens than self.num_spec. + # + # In above cases, the max possible batch size for n tokens, can be + # min(n, cudagraph_max_bs). 
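# Worked example of the sizing above (hypothetical values): with num_spec=2,
# max_num_seqs=8 and max_capture_size=16,
#     decode_cudagraph_max_bs = min(8 * (2 + 1), 16) = 16
# and for a padded run of n=10 tokens the batch size becomes
#     min(decode_cudagraph_max_bs, 10) = 10.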
if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0 and num_spec_decodes <= self.decode_cudagraph_max_bs - and m.num_actual_tokens <= self.decode_cudagraph_max_bs): - num_total_tokens = self.vllm_config.pad_for_cudagraph( + and num_spec_decode_tokens <= self.decode_cudagraph_max_bs): + num_actual_tokens = self.vllm_config.pad_for_cudagraph( m.num_actual_tokens) - batch_size = num_total_tokens // (self.num_spec + 1) + batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens) self.spec_state_indices_tensor[:num_spec_decodes].copy_( spec_state_indices_tensor, non_blocking=True) @@ -229,7 +237,7 @@ class GDNAttentionMetadataBuilder( assert spec_token_masks is not None self.spec_token_masks[:spec_token_masks.size(0)].copy_( spec_token_masks, non_blocking=True) - spec_token_masks = self.spec_token_masks[:m.num_actual_tokens] + spec_token_masks = self.spec_token_masks[:num_actual_tokens] spec_token_masks[spec_token_masks.size(0):].fill_(False) self.spec_query_start_loc[:num_spec_decodes + 1].copy_( @@ -248,9 +256,9 @@ class GDNAttentionMetadataBuilder( if (self.use_full_cuda_graph and num_prefills == 0 and num_spec_decodes == 0 and num_decodes <= self.decode_cudagraph_max_bs): - num_total_tokens = self.vllm_config.pad_for_cudagraph( + num_actual_tokens = self.vllm_config.pad_for_cudagraph( m.num_actual_tokens) - batch_size = num_total_tokens + batch_size = num_actual_tokens self.non_spec_state_indices_tensor[:num_decodes].copy_( non_spec_state_indices_tensor, non_blocking=True) @@ -274,6 +282,7 @@ class GDNAttentionMetadataBuilder( num_decode_tokens=num_decode_tokens, num_spec_decodes=num_spec_decodes, num_spec_decode_tokens=num_spec_decode_tokens, + num_actual_tokens=num_actual_tokens, has_initial_state=has_initial_state, spec_query_start_loc=spec_query_start_loc, non_spec_query_start_loc=non_spec_query_start_loc, From 47f670b03b7dfb4e1149eb8b14ba9edcfc297255 Mon Sep 17 00:00:00 2001 From: samzong <samzong.lu@gmail.com> Date: Wed, 17 Sep 2025 22:31:20 +0800 Subject: [PATCH 364/584] [Docs] improve code formatting and comments for eliminate griffe build warning. (#25010) Signed-off-by: samzong <samzong.lu@gmail.com> --- vllm/benchmarks/serve.py | 2 +- vllm/distributed/eplb/eplb_state.py | 9 +++++---- vllm/distributed/eplb/rebalance_algo.py | 23 ++++++++++++++--------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 33e831e54bbc9..1aeef0fd5bd85 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -139,7 +139,7 @@ async def get_request( A lower burstiness value (0 < burstiness < 1) results in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): + ramp_up_strategy (optional): The ramp-up strategy. Can be "linear" or "exponential". If None, uses constant request rate (specified by request_rate). ramp_up_start_rps (optional): diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 8f8baa7d59db7..3e318d7848326 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -337,11 +337,12 @@ class EplbState: Args: model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load - metrics recorded in this forward pass will not count. Defaults - to `False`. + metrics recorded in this forward pass will not count. + Defaults to `False`. 
is_profile (bool): If `True`, perform a dummy rearrangement - with maximum communication cost. This is used in `profile_run` - to reserve enough memory for the communication buffer. + with maximum communication cost. This is used in + `profile_run` to reserve enough memory + for the communication buffer. log_stats (bool): If `True`, log the expert load metrics. # Stats diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 3564a10dfc684..fc43dbe3b6533 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -109,13 +109,16 @@ def rebalance_experts_hierarchical( num_physical_experts: number of physical experts after replication num_groups: number of expert groups num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster + (e.g., NVLink) is faster num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: - physical_to_logical_map: [num_moe_layers, num_physical_experts] - logical_to_physical_map: [num_moe_layers, num_logical_experts, X] - logical_count: [num_moe_layers, num_logical_experts] + physical_to_logical_map (torch.Tensor): + [num_moe_layers, num_physical_experts] + logical_to_physical_map (torch.Tensor): + [num_moe_layers, num_logical_experts, X] + logical_count (torch.Tensor): + [num_moe_layers, num_logical_experts] """ num_layers, num_logical_experts = weight.shape assert num_logical_experts % num_groups == 0 @@ -197,11 +200,13 @@ def rebalance_experts( num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: - physical_to_logical_map: [layers, num_replicas], the expert index of - each replica - logical_to_physical_map: [layers, num_logical_experts, X], the replica - indices for each expert - expert_count: [layers, num_logical_experts], number of physical + physical_to_logical_map: + [layers, num_replicas], the expert index of each replica + logical_to_physical_map: + [layers, num_logical_experts, X], the replica indices for each + expert + expert_count: + [layers, num_logical_experts], number of physical replicas for each logical expert """ num_layers, num_logical_experts = weight.shape From 8f3616f422e34ccb0e79f1f00d72366c4dab24f1 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni001@gmail.com> Date: Wed, 17 Sep 2025 10:31:43 -0400 Subject: [PATCH 365/584] Remove old cutlass mla (#23961) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> --- CMakeLists.txt | 2 - csrc/attention/mla/cutlass_mla_entry.cu | 38 --- csrc/attention/mla/cutlass_mla_kernels.cu | 225 ------------------ csrc/torch_bindings.cpp | 7 - vllm/_custom_ops.py | 9 - vllm/v1/attention/backends/mla/cutlass_mla.py | 74 +----- 6 files changed, 10 insertions(+), 345 deletions(-) delete mode 100644 csrc/attention/mla/cutlass_mla_entry.cu delete mode 100644 csrc/attention/mla/cutlass_mla_kernels.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index c48da948a0298..180b896a7abac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" - "csrc/attention/mla/cutlass_mla_entry.cu" "csrc/quantization/fp8/per_token_group_quant.cu") set_gencode_flags_for_srcs( @@ -595,7 +594,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND 
MLA_ARCHS) set(SRCS - "csrc/attention/mla/cutlass_mla_kernels.cu" "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu deleted file mode 100644 index 0319d1daf302f..0000000000000 --- a/csrc/attention/mla/cutlass_mla_entry.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <torch/all.h> - -#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA -void cutlass_mla_decode_sm100a(torch::Tensor const& out, - torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale); -#endif - -void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale) { -#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA - return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale); -#endif - TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA"); -} diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu deleted file mode 100644 index 9d05d910dd81f..0000000000000 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <torch/all.h> - -#include <ATen/cuda/CUDAContext.h> -#include <c10/cuda/CUDAGuard.h> - -#include "cute/tensor.hpp" - -#include "cutlass/cutlass.h" -#include "cutlass/kernel_hardware_info.h" - -#include "cutlass_extensions/common.hpp" - -#include "device/sm100_mla.hpp" -#include "kernel/sm100_mla_tile_scheduler.hpp" - -using namespace cute; -using namespace cutlass::fmha::kernel; - -template <typename T, bool PersistenceOption = true> -struct MlaSm100 { - using Element = T; - using ElementAcc = float; - using ElementOut = T; - - using TileShape = Shape<_128, _128, Shape<_512, _64>>; - using TileShapeH = cute::tuple_element_t<0, TileShape>; - using TileShapeD = cute::tuple_element_t<2, TileShape>; - - // H K (D_latent D_rope) B - using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>; - - using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B - using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B - using StrideO = StrideK; // H D B - using StrideLSE = cute::tuple<_1, int>; // H B - - using TileScheduler = - std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler, - Sm100MlaIndividualTileScheduler>; - - using FmhaKernel = - cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized< - TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler, - /*kIsCpAsync=*/true>; - using Fmha = cutlass::fmha::device::MLA<FmhaKernel>; -}; - -template <typename T> -typename T::Fmha::Arguments args_from_options( - at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe, - at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens, - at::Tensor const& page_table, double scale) { - cutlass::KernelHardwareInfo hw_info; - hw_info.device_id = q_nope.device().index(); - hw_info.sm_count = - cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - hw_info.device_id); - - int batches = q_nope.sizes()[0]; - int page_count_per_seq = page_table.sizes()[1]; - int page_count_total = kv_c_and_k_pe_cache.sizes()[0]; - int page_size = kv_c_and_k_pe_cache.sizes()[1]; - int max_seq_len = page_size * page_count_per_seq; - using TileShapeH = typename T::TileShapeH; - using TileShapeD = typename T::TileShapeD; - auto problem_shape = - cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches); - - auto [H, K, D, B] = problem_shape; - auto [D_latent, D_rope] = D; - - using StrideQ = typename T::StrideQ; - using StrideK = typename T::StrideK; - using StrideO = typename T::StrideO; - using StrideLSE = typename T::StrideLSE; - - StrideQ stride_Q_latent = cute::make_tuple( - static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent)); - StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{}, - static_cast<int64_t>(H * D_rope)); - StrideK stride_C = - cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{}, - static_cast<int64_t>(page_size * (D_latent + D_rope))); - StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq); - StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H)); - StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{}, - static_cast<int64_t>(H * D_latent)); - - using Element = typename T::Element; - using ElementOut = typename T::ElementOut; - using ElementAcc = typename T::ElementAcc; - auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr()); - auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr()); - auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr()); - auto scale_f = static_cast<float>(scale); - typename 
T::Fmha::Arguments arguments{ - problem_shape, - {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr, - stride_C, C_ptr + D_latent, stride_C, - static_cast<int*>(seq_lens.data_ptr()), - static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total, - page_size}, - {static_cast<ElementOut*>(out.data_ptr()), stride_O, - static_cast<ElementAcc*>(nullptr), stride_LSE}, - hw_info, - 1, // split_kv - nullptr, // is_var_split_kv - }; - // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute - // split_kv automatically based on batch size and sequence length to balance - // workload across available SMs. Consider using var_split_kv for manual - // control if needed. - T::Fmha::set_split_kv(arguments); - return arguments; -} - -template <typename Element> -void runMla(at::Tensor const& out, at::Tensor const& q_nope, - at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, - at::Tensor const& seq_lens, at::Tensor const& page_table, - float scale, cudaStream_t stream) { - using MlaSm100Type = MlaSm100<Element>; - typename MlaSm100Type::Fmha fmha; - auto arguments = args_from_options<MlaSm100Type>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale); - size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - CUTLASS_CHECK(fmha.can_implement(arguments)); - - CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream)); - - CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream)); -} - -void cutlass_mla_decode_sm100a(torch::Tensor const& out, - torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale) { - TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA"); - TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor"); - TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor"); - TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3, - "kv_c_and_k_pe_cache must be a 3D tensor"); - TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor"); - TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor"); - TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor"); - - auto B_q_nope = q_nope.size(0); - auto H_q_nope = q_nope.size(1); - auto D_q_nope = q_nope.size(2); - auto B_q_pe = q_pe.size(0); - auto H_q_pe = q_pe.size(1); - auto D_q_pe = q_pe.size(2); - auto B_pt = page_table.size(0); - auto PAGE_NUM = page_table.size(1); - auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1); - auto D_ckv = kv_c_and_k_pe_cache.size(2); - auto B_o = out.size(0); - auto H_o = out.size(1); - auto D_o = out.size(2); - - TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512"); - TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64"); - TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576"); - TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128, - "H_q_nope, H_q_pe, and H_o must be equal to 128"); - TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0, - "PAGE_SIZE must be a power of 2"); - TORCH_CHECK( - B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o, - "Batch dims must be same for page_table, q_nope and q_pe, and out"); - TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0, - "PAGE_NUM must be divisible by 128 / PAGE_SIZE"); - TORCH_CHECK(D_o == 512, "D_o must be equal 
to 512"); - - TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half || - q_nope.dtype() == at::ScalarType::BFloat16 || - q_nope.dtype() == at::ScalarType::Float8_e4m3fn, - "q_nope must be a half, bfloat16, or float8_e4m3fn tensor"); - TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() && - q_nope.dtype() == q_pe.dtype(), - "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type"); - TORCH_CHECK(seq_lens.dtype() == torch::kInt32, - "seq_lens must be a 32-bit integer tensor"); - TORCH_CHECK(page_table.dtype() == torch::kInt32, - "page_table must be a 32-bit integer tensor"); - - auto in_dtype = q_nope.dtype(); - const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope)); - const cudaStream_t stream = - at::cuda::getCurrentCUDAStream(q_nope.get_device()); - if (in_dtype == at::ScalarType::Half) { - runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, - page_table, scale, stream); - } else if (in_dtype == at::ScalarType::BFloat16) { - runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale, stream); - } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { - runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale, stream); - } else { - TORCH_CHECK(false, "Unsupported input data type of MLA"); - } -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index f22e23519831f..bc096406c51ae 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -510,13 +510,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]"); ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress); - // CUTLASS MLA decode - ops.def( - "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," - " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," - " Tensor page_table, float scale) -> ()"); - ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); - // SM100 CUTLASS MLA decode ops.def( "sm100_cutlass_mla_decode(Tensor! out, Tensor! 
lse, Tensor q_nope," diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 456c6b3ba9234..712295aa92886 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1823,15 +1823,6 @@ def flash_mla_with_kvcache( return out, softmax_lse -def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, - q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, - seq_lens: torch.Tensor, page_table: torch.Tensor, - scale: float) -> torch.Tensor: - torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale) - return out - - def sm100_cutlass_mla_decode(out: torch.Tensor, lse: torch.Tensor, q_nope: torch.Tensor, q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 78af8d28f8892..21be17a750df4 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from typing import ClassVar, Optional +from typing import ClassVar, Optional, Union import torch @@ -109,12 +109,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): "are not implemented for " "CutlassMLAImpl") - self._use_old_cutlass_mla = False - force_old_cutlass = os.environ.get("FORCE_OLD_CUTLASS_MLA", None) - if force_old_cutlass: - logger.warning_once("Forcing old cutlass mla kernel") - self._use_old_cutlass_mla = True - # TODO: Currently, num_kv_splits is limited to 16 to avoid hanging # issues. In case the code hangs, use: # FORCE_NUM_KV_SPLITS=1 @@ -219,16 +213,22 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): return out, returned_lse - def _sm100_forward_decode( + def _forward_decode( self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, + q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, + layer: AttentionLayer, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None + if type(q) is tuple: + q_nope, q_pe = q + else: + q_nope, q_pe = torch.split( + q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + # Adjust workspace size (if necessary) self._workspace.ensure_size(attn_metadata, self._num_kv_splits) @@ -245,57 +245,3 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) return o, (lse if self.need_to_return_lse_for_decode else None) - - # TODO: Currently we leave it here only for backup in case something is - # wrong with the new SM100 CUTLASS MLA kernel - def _old_forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - assert attn_metadata.decode is not None - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "FP8 Cutlass MLA not supported with FORCE_OLD_CUTLASS_MLA") - - B = q_nope.shape[0] - - o = torch.empty((B, self.num_heads, self.kv_lora_rank), - dtype=q_nope.dtype, - device=q_nope.device) - - # Run MLA - # Clone q_nope and q_pe to make sure strides computation is correct. 
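# For reference, a sketch of the split performed by the retained
# _forward_decode above; the batch size and head count are assumptions, while
# 512/64 match the MLA latent and rope widths checked in the removed kernel:
#     q = torch.randn(4, 128, 512 + 64)  # (B, H, kv_lora_rank + rope_dim)
#     q_nope, q_pe = torch.split(q, [512, 64], dim=-1)
#     # q_nope: (4, 128, 512), q_pe: (4, 128, 64)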
- q_nope = q_nope.clone() - q_pe = q_pe.clone() - - ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata.decode.seq_lens, - attn_metadata.decode.block_table, self.scale) - - return o - - def _forward_decode( - self, - q: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - layer: AttentionLayer, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if type(q) is tuple: - q_nope, q_pe = q - else: - q_nope, q_pe = torch.split( - q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - if self._use_old_cutlass_mla: - # TODO: Remove the old cutlass MLA kernel after more extensive - # testing - return self._old_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata), None - - return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache, - attn_metadata) From 4a2d33e3718c57a2789da8b621728965a8a1787a Mon Sep 17 00:00:00 2001 From: samzong <samzong.lu@gmail.com> Date: Wed, 17 Sep 2025 23:11:51 +0800 Subject: [PATCH 366/584] [Docs] vllm/benchmarks/datasets.py fix docstring param format. (#24970) Signed-off-by: samzong <samzong.lu@gmail.com> --- vllm/benchmarks/datasets.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index a38090edb0b42..1831539a6adbe 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -104,9 +104,9 @@ class BenchmarkDataset(ABC): Args: dataset_path (Optional[str]): Path to the dataset. If None, it - indicates that a default or random dataset might be used. + indicates that a default or random dataset might be used. random_seed (int): Seed value for reproducible shuffling or - sampling. Defaults to DEFAULT_SEED. + sampling. Defaults to DEFAULT_SEED. """ self.dataset_path = dataset_path # Set the random seed, ensuring that a None value is replaced with the @@ -200,8 +200,7 @@ class BenchmarkDataset(ABC): tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for processing the dataset's text. num_requests (int): The number of sample requests to generate. - request_id_prefix (str) The prefix of request_id. - + request_id_prefix (str): The prefix of request_id. Returns: list[SampleRequest]: A list of sample requests generated from the @@ -224,7 +223,8 @@ class BenchmarkDataset(ABC): requests (List[SampleRequest]): The current list of sampled requests. num_requests (int): The target number of requests. - request_id_prefix (str) The prefix of the request ids. + request_id_prefix (str): The prefix applied to generated request + identifiers. 
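        Example (illustrative): if sampling produced 3 requests but
        num_requests is 5, two additional requests are drawn at random from
        the existing samples so the list reaches the target length; with
        no_oversample=True the shorter list is kept as-is.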
""" if no_oversample: From 087c6ffc9202599f438f1f7e0d6449020a958ac1 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 17 Sep 2025 11:28:58 -0400 Subject: [PATCH 367/584] [CI Bugfix] Fix failing test_invalid_env (#25078) Signed-off-by: mgoin <mgoin64@gmail.com> --- tests/kernels/attention/test_attention_selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 4d969cf992d23..190c92e1251c2 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -368,4 +368,4 @@ def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): # Should raise ValueError for invalid backend with pytest.raises(ValueError) as exc_info: get_attn_backend(32, torch.float16, None, 16, False) - assert "Invalid attention backend: 'INVALID'" in str(exc_info.value) + assert "Invalid value 'INVALID'" in str(exc_info.value) From 4b946d693e0af15740e9ca9c0e059d5f333b1083 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 09:32:42 -0700 Subject: [PATCH 368/584] [V0 Deprecation] Remove V0 Core tests (#25082) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .buildkite/test-pipeline.yaml | 11 - tests/core/__init__.py | 0 tests/core/block/__init__.py | 0 tests/core/block/conftest.py | 15 - tests/core/block/e2e/__init__.py | 0 tests/core/block/e2e/conftest.py | 71 - tests/core/block/e2e/test_correctness.py | 479 ------ .../e2e/test_correctness_sliding_window.py | 185 --- tests/core/block/test_block_manager.py | 341 ----- tests/core/block/test_block_table.py | 577 ------- tests/core/block/test_common.py | 45 - .../block/test_cpu_gpu_block_allocator.py | 96 -- tests/core/block/test_naive_block.py | 148 -- tests/core/block/test_prefix_caching_block.py | 1035 ------------- tests/core/conftest.py | 12 - tests/core/test_chunked_prefill_scheduler.py | 858 ----------- tests/core/test_num_computed_tokens_update.py | 67 - tests/core/test_scheduler.py | 1338 ----------------- tests/core/test_serialization.py | 36 - tests/core/utils.py | 392 ----- 20 files changed, 5706 deletions(-) delete mode 100644 tests/core/__init__.py delete mode 100644 tests/core/block/__init__.py delete mode 100644 tests/core/block/conftest.py delete mode 100644 tests/core/block/e2e/__init__.py delete mode 100644 tests/core/block/e2e/conftest.py delete mode 100644 tests/core/block/e2e/test_correctness.py delete mode 100644 tests/core/block/e2e/test_correctness_sliding_window.py delete mode 100644 tests/core/block/test_block_manager.py delete mode 100644 tests/core/block/test_block_table.py delete mode 100644 tests/core/block/test_common.py delete mode 100644 tests/core/block/test_cpu_gpu_block_allocator.py delete mode 100644 tests/core/block/test_naive_block.py delete mode 100644 tests/core/block/test_prefix_caching_block.py delete mode 100644 tests/core/conftest.py delete mode 100644 tests/core/test_chunked_prefill_scheduler.py delete mode 100644 tests/core/test_num_computed_tokens_update.py delete mode 100644 tests/core/test_scheduler.py delete mode 100644 tests/core/test_serialization.py delete mode 100644 tests/core/utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b5ea4407ef5bd..133ba792680de 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -91,17 +91,6 @@ steps: - pytest -v -s basic_correctness/test_cpu_offload.py - 
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py -- label: Core Test # 22min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: true - source_file_dependencies: - - vllm/core - - vllm/distributed - - tests/core - commands: - - pytest -v -s core - - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" diff --git a/tests/core/__init__.py b/tests/core/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/core/block/__init__.py b/tests/core/block/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py deleted file mode 100644 index 6afe98d78ce81..0000000000000 --- a/tests/core/block/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - - -@pytest.fixture() -def should_do_global_cleanup_after_test() -> bool: - """Disable the global cleanup fixture for tests in this directory. This - provides a ~10x speedup for unit tests that don't load a model to GPU. - - This requires that tests in this directory clean up after themselves if they - use the GPU. - """ - return False diff --git a/tests/core/block/e2e/__init__.py b/tests/core/block/e2e/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py deleted file mode 100644 index e2c6c66b259c8..0000000000000 --- a/tests/core/block/e2e/conftest.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Iterable -from typing import Callable, Optional - -import pytest - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.model_executor.utils import set_random_seed - - -@pytest.fixture -def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed) - - -@pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed) - - -def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - distinct_llm_kwargs, seed): - kwargs = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **distinct_llm_kwargs, - } - - def generator_inner(): - llm = LLM(**kwargs) - - set_random_seed(seed) - - yield llm - del llm - cleanup_dist_env_and_memory() - - for llm in generator_inner(): - yield llm - del llm - - -def get_text_from_llm_generator(llm_generator: Iterable[LLM], - prompts, - sampling_params, - llm_cb: Optional[Callable[[LLM], - None]] = None): - for llm in llm_generator: - if llm_cb: - llm_cb(llm) - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - text = [output.outputs[0].text for output in outputs] - del llm - - return text - - -def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): - for llm in llm_generator: - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - token_ids = [output.outputs[0].token_ids for output in outputs] - del llm - - return token_ids diff --git a/tests/core/block/e2e/test_correctness.py 
b/tests/core/block/e2e/test_correctness.py
deleted file mode 100644
index 8de48ef59a013..0000000000000
--- a/tests/core/block/e2e/test_correctness.py
+++ /dev/null
@@ -1,479 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from itertools import cycle
-
-import pytest
-
-from vllm import SamplingParams
-
-from .conftest import get_token_ids_from_llm_generator
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-
-        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 16,
-        "num_gpu_blocks_override": 5 * (64 + 1),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "preemption_mode": "swap"
-}, {
-    "preemption_mode": "recompute"
-}])
-@pytest.mark.parametrize("batch_size", [10])
-@pytest.mark.parametrize("seed", [1])
-def test_block_manager_with_preemption(baseline_llm_generator,
-                                       test_llm_generator, batch_size):
-    """Verify the block manager produces the same outputs even with
-    preemption.
-
-    This constructs two LLMs, each with a limited number of GPU blocks. The
-    limit is decided such that as the sequences in the batch grow, sequences
-    must be preempted and removed from cache.
-
-    If the output token ids are equivalent, then we have confidence that the
-    KV cache is not corrupted.
-
-    NOTE: We want a significant number of generated tokens so that any
-    incorrect KV mapping has time to build up error.
-
-    NOTE(Kuntai): Though we have removed block manager v1, this test is still
-    useful as it asserts the behavior of block manager v2 (now it is called
-    SelfAttnBlockSpaceManager) is the same under swapping / preemption, so we
-    keep this test.
-    """
-    output_len = 1024
-    temperature = 0.0
-
-    # We want to ensure equality even with preemption.
-    # We force the total block size to be 1 + cdiv(output_len, block_size)
-    # so that only one sequence can fit at a time (once the sequences grow).
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-
-    baseline_token_ids = get_token_ids_from_llm_generator(
-        baseline_llm_generator, prompts, sampling_params)
-
-    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
-                                                      prompts, sampling_params)
-
-    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
-                                                    test_token_ids):
-        assert expected_token_ids == actual_token_ids
-
-    assert baseline_token_ids == test_token_ids
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-
-        # Our prompts will generate 128 tokens; since the prompts themselves
-        # are small, we don't need much KV space beyond 128.
-        "max_model_len": 160,
-
-        # skip cuda graph creation for fast test.
-        "enforce_eager": True,
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            "block_size": 16,
-
-            # Allow only 2 sequences of ~128 tokens in worst case.
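# Worked example, assuming block_size=16: each ~128-token sequence needs
# cdiv(128, 16) = 8 blocks, and the override grants one spare block per
# sequence on top of that, i.e. 2 * (8 + 1) = 18 blocks in total.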
- # Note 8 = 128/block_size - "num_gpu_blocks_override": 2 * (8 + 1), - }, - { - "block_size": 8, - - # Allow only 2 sequences of ~128 tokens in worst case. - # Note 16 = 128/block_size - "num_gpu_blocks_override": 2 * (16 + 2), - } - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "num_lookahead_slots": 0, -}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - # We run one test with block_size < lookahead_slots, one test with - # block_size > lookahead_slots - "num_lookahead_slots": 10, - "preemption_mode": "swap", - }, - { - "num_lookahead_slots": 10, - "preemption_mode": "recompute", - } - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, - test_llm_generator, - batch_size): - """Verify vLLM produces the same output with greedy sampling, when lookahead - scheduling is used vs. not. - - Lookahead scheduling is not expected to modify the output, as it simply - allocates empty slots ahead of the known token ids in a sliding fashion. - - This test constrains the total number of blocks to force preemption. It also - varies the block size so that the lookahead size is less than and greater - than the block size. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids without lookahead scheduling') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with lookahead scheduling') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [ - { - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "enable_chunked_prefill": True, - }, - ]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", - [{ - "block_size": 16, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 3, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 256, - "max_num_seqs": 10, - }]) -@pytest.mark.parametrize("baseline_llm_kwargs", [ - {}, -]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "num_lookahead_slots": 0, - }, - { - "num_lookahead_slots": 5, - }, -]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify that chunked prefill works with SelfAttnBlockSpaceManager, - with and without lookahead scheduling. - """ - output_len = 32 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - ("1 + " * 50) + " 1 = ", # Longer prompt. 
- "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with BlockManager') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with BlockManager, with lookahead slots.') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - - # Enable prefill cache - "enable_prefix_caching": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "preemption_mode": "swap" -}, { - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_block_manager_prefix_caching_enabled_with_preemption( - baseline_llm_generator, test_llm_generator, batch_size): - """Verify block manager produces same outputs even when there is preemption. - - This constructs two LLM, each with limited number of GPU blocks. The limit - is decided such that as the sequences in the batch grow, sequences must be - preempted and removed from cache. - - If the output token ids are equivalent, then we have confidence that the KV - cache is not corrupted. - - NOTE: We want a significant number of generated tokens so that any incorrect - KV mapping has time to build up error. - - NOTE(Kuntai): Though we have removed block manager v1, this test is still - useful as it asserts the behavior of block manager v2 (now it is called - SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we - keep this test. - """ - output_len = 1024 - temperature = 0.0 - - # We want to ensure equality even with preemption. - # We force the total block size to be 1 + cdiv(output_len, block_size) - # so that only one sequence can fit at a time (once the sequences grow). - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids from block manager') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids from block manager, with preemption') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. 
- "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, - "preemption_mode": "swap" -}, { - "enable_prefix_caching": True, - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify block manager v2 with auto prefix caching enabled produces same - outputs as auto prefix caching disabled, even when there is preemption. - - This constructs two LLM, each with limited number of GPU blocks. The limit - is decided such that as the sequences in the batch grow, sequences must be - preempted and removed from cache. - - If the output token ids are equivalent, then we have confidence that auto - prefix caching itself at least don't cause result error. - """ - output_len = 1024 - temperature = 0.0 - - # We want to ensure equality even with preemption. - # We force the total block size to be 1 + cdiv(output_len, block_size) - # so that only one sequence can fit at a time (once the sequences grow). - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with APC disabled') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # we keep the blocks small, so that hit eviction quickly - "max_model_len": 48, - "block_size": 16, - "num_gpu_blocks_override": 3, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, -}]) -@pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, - test_llm_generator): - """Verify block manager v2 with auto prefix caching could work normally - even when eviction started. - With APC enabled, all blocks are held by native block at the beginning. - Then blocks are managed by evictor instead. If cache hit at the evictor's - block, then it could be reused, or we need to recompute its kv cache. - """ - output_len = 10 - temperature = 0.0 - - prompts = [ - "You are a helpful assistant. 
Please answer truthfully and write " - "out your thinking step by step to be sure you get the right answer. " - "If you make a mistake, attempt to correct it. who are you?", - "You are a helpful assistant. Please answer truthfully and write out " - "your thinking step by step to be sure you get the right answer. You " - "are helpful and harmless and you follow ethical guidelines. " - "who are you?" - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with APC disabled') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py deleted file mode 100644 index 27fe27a880e3d..0000000000000 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ /dev/null @@ -1,185 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest - -from tests.kernels.utils import override_backend_env_variable -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -from .conftest import get_text_from_llm_generator - -# relatively small model with 4k sliding window -MODEL = "bigcode/starcoder2-3b" -BLOCK_SIZE = 16 - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008 - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [5]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"]) -def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, - batch_size, seed, backend, monkeypatch): - """ - The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then - asks for value of one of them (which is outside the sliding window). - If we tell it upfront which we are going to be looking for, then - it answers correctly (mostly). - - Additionally, we compare the results of the v1 and v2 managers. 
- """ - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - - override_backend_env_variable(monkeypatch, backend) - - sampling_params = SamplingParams( - max_tokens=1024, - ignore_eos=True, - temperature=0.0, - ) - - prompts, answer, indices = prep_prompts(batch_size) - - baseline_texts = get_text_from_llm_generator(baseline_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) - - check_answers(indices, answer, baseline_texts) - - print('Getting token ids from block manager v2') - test_texts = get_text_from_llm_generator(test_llm_generator, prompts, - sampling_params) - check_answers(indices, answer, test_texts) - - cmp = [ - expected_text == actual_text - for expected_text, actual_text in zip(baseline_texts, test_texts) - ] - print(cmp) - # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768 - # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290 - # states that xformers and flash_attn have different ideas about the window - # size anyways - assert sum(cmp) > 0.7 * len(cmp) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) -@pytest.mark.parametrize("batch_size", [5]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"]) -def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, - backend, monkeypatch): - """ - This is similar to test_sliding_window_retrieval, however, it doesn't - compare against the v1 block manager since v1 doesn't support - chunked prefill with sliding window. - - The results with and without chunked prefill are not the same due to - numerical instabilities. - """ - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) - - sampling_params = SamplingParams( - max_tokens=10, - ignore_eos=True, - temperature=0.0, - ) - - prompts, answer, indices = prep_prompts(batch_size) - - # We don't compare with the baseline model here, since the results - # slightly different due to different tailing in attention. - test_texts = get_text_from_llm_generator(test_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) - check_answers(indices, answer, test_texts) - - -def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): - """ - Generate prompts which a bunch of assignments, - then asking for the value of one of them. - The prompt is just under 10k tokens; sliding window is 4k - so the answer is outside sliding window, but should still be correct. 
- - Args: - batch_size: number of prompts to generate - ln_range: an argument to control the length of the prompt - """ - prompts: list[str] = [] - answer: list[int] = [] - indices: list[int] = [] - random.seed(1) - for _ in range(batch_size): - idx = random.randint(30, 90) - indices.append(idx) - prompt = "```python\n# We set a number of variables, " + \ - f"x{idx} will be important later\n" - ln = random.randint(*ln_range) - for k in range(30, ln): - v = random.randint(10, 99) - if k == idx: - answer.append(v) - prompt += f"x{k} = {v}\n" - prompt += f"# Now, we check the value of x{idx}:\n" - prompt += f"assert x{idx} == " - prompts.append(prompt) - return prompts, answer, indices - - -def check_answers(indices: list[int], - answer: list[int], - outputs: list[str], - accept_rate: float = 0.7): - answer2 = [int(text[0:2].strip()) for text in outputs] - print(list(zip(indices, zip(answer, answer2)))) - numok = 0 - for a1, a2 in zip(answer, answer2): - if a1 == a2: - numok += 1 - frac_ok = numok / len(answer) - print(f"Num OK: {numok}/{len(answer)} {frac_ok}") - assert frac_ok >= accept_rate - - -def check_window(prompts: list[str]): - - def inner(llm: LLM): - sliding_window = llm.llm_engine.model_config.get_sliding_window() - assert sliding_window and sliding_window > 0 - assert any( - len(llm.get_tokenizer().tokenize(prompt)) > sliding_window - for prompt in prompts) - - return inner diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py deleted file mode 100644 index 24499b9ad4e9c..0000000000000 --- a/tests/core/block/test_block_manager.py +++ /dev/null @@ -1,341 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block_manager import SelfAttnBlockSpaceManager -from vllm.core.interfaces import AllocStatus -from vllm.sequence import Logprob, SequenceStatus -from vllm.utils import chunk_list - -from ..utils import create_dummy_prompt, create_seq_group - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) -@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, - num_gpu_blocks: int, watermark: float): - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - ) - num_watermark_blocks = int(watermark * num_gpu_blocks) - - num_output_blocks_per_seq = 1 - - # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but - # the current implementation assumes all seqs are new prompts / don't have - # different output lens. 
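# A runnable sketch of the expectation encoded by the assertions in this
# test; an assumed simplification for illustration, not the real
# SelfAttnBlockSpaceManager.can_allocate implementation:
def expected_alloc_status(num_required_blocks: int, num_gpu_blocks: int,
                          num_watermark_blocks: int) -> str:
    # NEVER: the request cannot fit without dipping below the watermark.
    if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
        return "NEVER"
    # OK: it fits within the total budget; LATER: it could fit once other
    # sequences release their blocks.
    return "OK" if num_gpu_blocks >= num_required_blocks else "LATER"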
-    num_output_blocks = num_output_blocks_per_seq
-
-    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
-        seq_group = create_seq_group(
-            seq_prompt_len=block_size * num_prompt_blocks,
-            seq_output_lens=[
-                block_size * num_output_blocks_per_seq
-                for _ in range(num_seqs_per_group)
-            ],
-        )
-
-        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
-
-        can_allocate_result = block_manager.can_allocate(seq_group)
-
-        num_required_blocks = num_prompt_blocks + num_output_blocks
-
-        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
-            assert can_allocate_result == AllocStatus.NEVER
-        elif num_gpu_blocks >= num_required_blocks:
-            assert can_allocate_result == AllocStatus.OK
-        else:
-            assert can_allocate_result == AllocStatus.LATER
-
-
-@pytest.mark.parametrize("block_size", [1, 8])
-@pytest.mark.parametrize("prompt_len", [1, 7, 8])
-@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
-@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
-def test_append_slots(block_size, prompt_len, num_slots_to_append,
                      num_lookahead_slots):
-    """Verify append_slots consumes the correct number of blocks from the
-    block table.
-    """
-
-    num_gpu_blocks = 1024
-    watermark = 0.1
-    block_manager = SelfAttnBlockSpaceManager(
-        block_size=block_size,
-        num_gpu_blocks=num_gpu_blocks,
-        num_cpu_blocks=0,
-        watermark=watermark,
-    )
-
-    seq_group = create_seq_group(
-        seq_prompt_len=prompt_len,
-        seq_output_lens=[0],
-    )
-
-    # Allocate seq
-    assert block_manager.can_allocate(seq_group)
-    block_manager.allocate(seq_group)
-
-    # Set seq to RUNNING
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-
-    # Append tokens to the sequence
-    for token_id in range(num_slots_to_append):
-        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Append slots for new tokens and lookahead slots.
-    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
-    block_manager.append_slots(seq, num_lookahead_slots)
-    num_consumed_blocks = (free_blocks_before_append -
                           block_manager.get_num_free_gpu_blocks())
-
-    # Expect consumed blocks to be new blocks required to support the new slots.
-    expected_consumed_blocks = len(
-        list(
-            chunk_list(
-                list(
-                    range(prompt_len + num_slots_to_append +
                          num_lookahead_slots)),
-                block_size))) - len(
-                    list(chunk_list(list(range(prompt_len)), block_size)))
-    assert num_consumed_blocks == expected_consumed_blocks
-
-
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("num_cpu_blocks", [4])
-@pytest.mark.parametrize("num_gpu_blocks", [4])
-@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
-@pytest.mark.parametrize("enable_caching", [False, True])
-def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
              enable_caching):
-    """Verify the number of blocks on the src/dst devices is correct after
-    swapping a sequence group in/out (no missing or extra blocks).
-    """
-    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap seq group from GPU -> CPU.
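# Worked example of the swap-out accounting asserted next, using the
# parametrization above (block_size=8, a 7-token prompt plus one appended
# token): the sequence occupies a single block, so swap_out moves exactly
# one block, the CPU free count drops by 1, and the GPU free count rises
# by 1.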
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-
-    # Swap seq group from CPU -> GPU.
-    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    cpu_blocks = block_manager.get_block_table(prompt)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == [cpu_blocks[0]]
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("num_gpu_blocks", [4])
-@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
-@pytest.mark.parametrize("enable_caching", [True, False])
-def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                  enable_caching):
-    """Verify the block manager can correctly determine if a sequence group
-    can be swapped in/out.
-    """
-    num_cpu_blocks = num_gpu_blocks
-    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
-    prompt, seq_group = create_dummy_prompt(
-        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-    prompt.status = SequenceStatus.RUNNING
-
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-
-    # At this moment, we still have enough free blocks to swap in the seq group.
-    if num_lookahead_slots <= block_size:
-        assert block_manager.can_swap_in(seq_group,
                                         num_lookahead_slots) == AllocStatus.OK
-    else:
-        assert block_manager.can_swap_in(
-            seq_group, num_lookahead_slots) == AllocStatus.NEVER
-
-    # During swap-out, 2 cached blocks were evicted from the GPU,
-    # so prompt1 can't be swapped back in.
-    prompt2_len = 2 * block_size - 1
-    prompt2, seq_group2 = create_dummy_prompt(
-        "2",
-        prompt_length=prompt2_len,
-        prompt_tokens=[10000 + i for i in range(prompt2_len)])
-    prompt2.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group2)
-
-    # Swap seq group from CPU -> GPU.
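# The expectations that follow, summarized: with prompt2 now holding GPU
# blocks, swapping the first group back in can at best succeed LATER (once
# blocks free up), and a lookahead window larger than a block can NEVER be
# satisfied under this 4-block budget.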
- if num_lookahead_slots <= block_size: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.LATER - else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER - - -@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) -@pytest.mark.parametrize("enable_caching", [False, True]) -def test_swap_in_infeasible(num_lookahead_slots, enable_caching): - """Verifies that swapping fails if there is not enough free blocks - to account for unseen tokens and lookahead_slots. - """ - block_size = 8 - num_cpu_blocks = 1 - num_gpu_blocks = 1 - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) - prompt_length = block_size - 3 - assert prompt_length > 0 - prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length) - prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - # Emulate a forward pass by appending a single token. - # The block manager then knows how many unprocessed - # tokens will be written in the next forward pass. - token_id = 0 - prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Swap seq group from GPU -> CPU. - assert block_manager.can_swap_out(seq_group) - block_manager.swap_out(seq_group) - prompt.status = SequenceStatus.SWAPPED - - # Swap seq group from CPU -> GPU. - # The number of unseen tokens is 1. If the number of existing - # tokens plus the unseen ones and number of lookahead slots exceeds - # the total number of available GPU blocks then the swap - # should fail. - num_unseen_tokens = 1 - if (num_lookahead_slots + num_unseen_tokens + - prompt_length) <= (block_size * num_gpu_blocks): - assert block_manager.can_swap_in(seq_group, - num_lookahead_slots) == AllocStatus.OK - else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER - - -# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level. - - -@pytest.mark.parametrize("block_size", [8, 16]) -@pytest.mark.parametrize("prompt_len", [10, 300, 1000]) -@pytest.mark.parametrize("num_slots_to_append", [50]) -@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512]) -def test_sliding_window(block_size, prompt_len, num_slots_to_append, - sliding_window): - """Verify append_slots consumes the correct number of blocks from the block - table. 
- """ - - num_gpu_blocks = 1024 - watermark = 0.1 - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - watermark=watermark, - sliding_window=sliding_window, - ) - - def check_used(min_n, max_n=None): - if max_n is None: - max_n = min_n - used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks() - assert min_n <= used - assert used <= max_n - - def num_blocks(num_tokens): - return (num_tokens + block_size - 1) // block_size - - check_used(0) - - seq_group = create_seq_group( - seq_prompt_len=prompt_len, - seq_output_lens=[0], - ) - - check_used(0) - - # Allocate seq - assert block_manager.can_allocate(seq_group) - block_manager.allocate(seq_group) - - check_used(num_blocks(prompt_len)) - - # Seq seq to RUNNING - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - seq.data.update_num_computed_tokens(prompt_len) - check_used(num_blocks(prompt_len)) - - # this is how we compute it in SelfAttnBlockSpaceManager.__init__ - sliding_blocks = (sliding_window // block_size) + 2 - # plus one block for null block - sliding_blocks += 1 - - # Append tokens to the sequeqnce - for token_id in range(num_slots_to_append): - seq.append_token_id(token_id, {token_id: Logprob(0.0)}) - seq.data.update_num_computed_tokens(1) - block_manager.append_slots(seq, num_lookahead_slots=0) - if prompt_len < sliding_window + 10: - check_used(0, sliding_blocks + 1) - else: - check_used(sliding_blocks, sliding_blocks + 1) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py deleted file mode 100644 index ba085001136be..0000000000000 --- a/tests/core/block/test_block_table.py +++ /dev/null @@ -1,577 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block.block_table import BlockTable -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.utils import Device, cdiv, chunk_list - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -def test_allocate_naive(block_size: int, sequence_len: int): - """Test the allocation of blocks using the naive allocator. - - This test creates a CpuGpuBlockAllocator with the specified block size and - number of blocks. It then allocates multiple BlockTables with varying - sequence lengths and verifies that the number of free blocks decreases as - expected after each allocation. - """ - assert block_size > 1 - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type="naive", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - - block_tables: list[BlockTable] = [] - for i in range(5): - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc - - block_tables.append( - BlockTable( - block_size=block_size, - block_allocator=allocator, - )) - block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -def test_allocate_prefix_caching(block_size: int, sequence_len: int): - """Test the allocation of blocks using the prefix caching allocator. 
- - This test creates a CpuGpuBlockAllocator with the specified block size and - number of blocks, using the prefix caching allocator. It then allocates - multiple BlockTables with varying sequence lengths and verifies that the - number of free blocks decreases as expected after each allocation. - - The test expects all sequences to share allocations, except for their last - block, which may be mutable. It calculates the expected number of immutable - and mutable blocks per allocation based on the sequence length and block - size. - """ - assert block_size > 1 - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - chunked_tokens = list(chunk_list(token_ids, block_size)) - num_mutable_blocks_per_alloc = 0 if len( - chunked_tokens[-1]) == block_size else 1 - num_immutable_blocks_per_alloc = len( - chunked_tokens) - num_mutable_blocks_per_alloc - - block_tables: list[BlockTable] = [] - for alloc_i in range(1, 6): - - block_tables.append( - BlockTable( - block_size=block_size, - block_allocator=allocator, - )) - block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) - - # Expect all sequences to share allocations, except for their last block - # (which may be mutable). - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - ( - num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc * - (alloc_i)) - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -@pytest.mark.parametrize("device", ["cpu", "gpu"]) -def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str, - device: str): - """Test the allocation and freeing of blocks using different allocators and - devices. - - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, allocator type, and device. It then allocates a BlockTable - multiple times with the same sequence and verifies that the number of free - blocks remains consistent after each allocation and freeing. - """ - device = Device[device.upper()] - - num_device_blocks = 1024 - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_device_blocks, - num_cpu_blocks=num_device_blocks, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - for i in range(5): - block_table.allocate(token_ids=token_ids, device=device) - assert allocator.get_num_free_blocks( - device) == num_device_blocks - num_blocks_per_alloc - assert all(block_id is not None - for block_id in block_table.physical_block_ids) - - block_table.free() - assert allocator.get_num_free_blocks(device) == num_device_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_allocation(block_size: int, sequence_len: int, - append_len: int, allocator_type: str): - """Test the allocation behavior when appending token IDs to a BlockTable. 
- - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, and allocator type. It then allocates a BlockTable with an - initial sequence and appends additional token IDs to it. The test verifies - that the number of allocated blocks before and after appending matches the - expected values. - """ - - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + token_ids_to_append, - block_size))) - num_expected_blocks_before_append - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append - block_table.append_token_ids(token_ids_to_append) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("num_empty_slots", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int, - num_empty_slots: int, - allocator_type: str): - """Test the allocation behavior when ensuring a certain number of empty - slots in a BlockTable. - - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, and allocator type. It then allocates a BlockTable with an - initial sequence and ensures a certain number of empty slots. The test - verifies that the number of allocated blocks before and after ensuring empty - slots matches the expected values. It also checks that filling up the empty - slots does not consume additional blocks. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + [-1] * num_empty_slots, - block_size))) - num_expected_blocks_before_append - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Assert that the empty slots consume the expected number of additional - # blocks. - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append - block_table.ensure_num_empty_slots(num_empty_slots) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks - - # Now, ensure no additional blocks consumed as we fill up the empty slots. 
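# Worked example with values from the parametrization above: block_size=8
# and sequence_len=1 leave 7 free slots in the tail block, so
# ensure_num_empty_slots(1) reserves nothing new (cdiv(1 + 1, 8) is still
# 1 block), and appending one token merely fills an already-reserved slot.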
- num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU) - block_table.append_token_ids(token_ids=list(range(num_empty_slots))) - assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU) - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 9]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("append_size", [1, 4, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_correct_content(block_size: int, sequence_len: int, - append_len: int, allocator_type: str, - append_size: int): - """Verify token ids are correctly appended. Appends various amounts of - token ids in various append sizes, and verifies the final sequence is - correct. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - appended_so_far: list[int] = [] - for append in chunk_list(token_ids_to_append, append_size): - block_table.append_token_ids(append) - appended_so_far.extend(append) - - assert block_table._get_all_token_ids() == token_ids + appended_so_far - - assert block_table._get_all_token_ids() == token_ids + token_ids_to_append - - -@pytest.mark.parametrize("seq_len", [1, 9, 129]) -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_fork(seq_len: int, block_size: int, allocator_type: str): - """Create a sequence using the specified allocator. - 1. Assert that after forking the sequence, the free block count is the - same. - 2. Assert that the forked sequence has the same physical mappings. - 3. Then free the original sequence; verify that the free block count is - the same. - 4. Finally, free the forked sequence and verify that the free block - count drops to zero. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(seq_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - block_table.allocate(token_ids) - - num_free_blocks_before_fork = allocator.get_num_free_blocks( - device=Device.GPU) - - forked_block_table = block_table.fork() - - # Expect physical_block_ids and token_ids to match. - assert (block_table.physical_block_ids == - forked_block_table.physical_block_ids) - assert block_table._get_all_token_ids( - ) == forked_block_table._get_all_token_ids() - - # Do not expect any additional allocations. - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork - - # Free the original blocks. Assert num free blocks does not change, since - # refcount is nonzero. - block_table.free() - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork - - # Expect the forked block table to be unaffected by the free. - assert all(block_id is not None - for block_id in forked_block_table.physical_block_ids) - - # Free the forked blocks. Assert num free blocks does change, since - # refcount is now zero. 
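# A minimal model of the fork/free refcounting verified here; an assumed
# simplification sketched for illustration, not the allocator's real API:
def free_refcounted_block(refcounts: dict[int, int], block_id: int,
                          free_pool: set[int]) -> None:
    # Each free() decrements the block's refcount.
    refcounts[block_id] -= 1
    # Only the last owner actually returns the block to the free pool.
    if refcounts[block_id] == 0:
        free_pool.add(block_id)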
- forked_block_table.free() - assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("appender", ["forked", "original"]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow(block_size: int, sequence_len: int, append_len: int, - allocator_type: str, appender: str): - """Fork a sequence; append to the forked sequence; verify there's a CoW. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - original_block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_non_cow_blocks = cdiv(sequence_len, block_size) - num_expected_cow_blocks = cdiv(sequence_len + append_len, - block_size) - (sequence_len // block_size) - - original_block_table.allocate(token_ids=token_ids, device=Device.GPU) - original_block_ids = original_block_table.physical_block_ids[:] - - print("original_block_ids = {}".format(original_block_ids)) - forked_block_table = original_block_table.fork() - - # Expect no additional allocation (copy on _write_). - assert allocator.get_num_free_blocks( - Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks) - - if appender == "forked": - appender_block_table = forked_block_table - static_block_table = original_block_table - elif appender == "original": - appender_block_table = original_block_table - static_block_table = forked_block_table - else: - raise ValueError(f"unknown test config {appender=}") - - # Write tokens. - appender_block_table.append_token_ids(token_ids_to_append) - - # Expect the non-appending block table to have no change. - assert static_block_table.physical_block_ids == original_block_ids - assert appender_block_table.physical_block_ids != original_block_ids - - # Expect the blocks changed during append to have a CoW. - assert allocator.get_num_free_blocks( - Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks + - num_expected_cow_blocks) - - cows = allocator.clear_copy_on_writes() - if sequence_len % block_size > 0: - # If the last block in the sequence is not full, then when appending we - # expect a CoW. - assert cows - - cow_block_id = sequence_len // block_size - expected_src = static_block_table.physical_block_ids[cow_block_id] - expected_dst = appender_block_table.physical_block_ids[cow_block_id] - - assert (expected_src, expected_dst) in cows - else: - # Otherwise, there should be no copy-on-write. - assert not cows - - static_block_table.free() - appender_block_table.free() - - # After free, expect all blocks to be freed. - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("lookahead_slots", [1, 16, 129]) -@pytest.mark.parametrize("appender", ["forked", "original"]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow_lookahead_simple(block_size: int, sequence_len: int, - append_len: int, lookahead_slots: int, - allocator_type: str, appender: str): - """Similar to test_cow, except with lookahead allocation. 
The assertions are - less rigorous due to the complexity of the property under test. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - original_block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - original_block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Allocate lookahead slots. - original_block_table.ensure_num_empty_slots(lookahead_slots) - original_block_ids = original_block_table.physical_block_ids[:] - - forked_block_table = original_block_table.fork() - - if appender == "forked": - appender_block_table = forked_block_table - static_block_table = original_block_table - elif appender == "original": - appender_block_table = original_block_table - static_block_table = forked_block_table - else: - raise ValueError(f"unknown test config {appender=}") - - # Write tokens. - appender_block_table.append_token_ids(token_ids_to_append) - - # Expect the non-appending block table to have no change. - assert static_block_table.physical_block_ids == original_block_ids - assert appender_block_table.physical_block_ids != original_block_ids - - cows = allocator.clear_copy_on_writes() - - # Always expect copy-on-write - assert cows - - if sequence_len % block_size > 0: - # If the last block in the sequence is not full, then when appending we - # expect a CoW. - assert cows - - cow_block_id = sequence_len // block_size - expected_src = static_block_table.physical_block_ids[cow_block_id] - expected_dst = appender_block_table.physical_block_ids[cow_block_id] - - assert (expected_src, expected_dst) in cows - - static_block_table.free() - appender_block_table.free() - - # After free, expect all blocks to be freed. - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("num_new_tokens", [1, 16, 129]) -@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int, - num_new_tokens: int, - num_lookahead_slots: int, - allocator_type: str): - """Verify correct calculation of get_num_blocks_touched_by_append_slots. - - This is done by using copy-on-write, which requires any modified block to - be copied before write if the refcount > 1. We set the refcount>1 by forking - a sequence, then measure the free blocks before and after an append. If the - number of consumed blocks equals what `get_num_blocks_touched_by_append_ - slots` returns, then the calculation is correct. - """ - - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(num_new_tokens)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Add lookahead before fork so both sequences have the same lookahead - # blocks. - block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots) - - # Fork sequence so that every block has refcount > 1. 
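# Worked example with values from the parametrization above: block_size=8
# and sequence_len=16 end block-aligned (16 // 8 = 2 full blocks), so
# appending num_new_tokens=1 with num_lookahead_slots=1 needs
# cdiv(16 + 1 + 1, 8) - 2 = 1 fresh block; with lookahead > 0 the blocks
# actually consumed may be fewer, as the TODO below explains.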
- _ = block_table.fork() - - # Determine how many blocks should be touched. - expected_num_touched_blocks = ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=token_ids_to_append, - num_lookahead_slots=num_lookahead_slots)) - - # Measure how many blocks are touched by measuring num_free_blocks before - # and after the append. - # - # We expect append_token_ids to CoW all mutated blocks that have refcount>1. - num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU) - block_table.append_token_ids(token_ids_to_append, num_lookahead_slots) - num_consumed_blocks = (num_free_blocks_before_append - - allocator.get_num_free_blocks(Device.GPU)) - - # TODO(cade) ensure equality when num_lookahead_slots > 0. - # The reason we have < is because lookahead blocks are not copied eagerly; - # they are copied on first write. This will cause issues for beam search + - # speculative decoding. This is acceptable for now as it is a large effort - # to combine the two. To fix this, we can ensure single sequence ownership - # of lookahead blocks by appending empty slots to each block, which will - # trigger the CoW. - # - # Until then, we can accept that the consumed tokens are <= the expected - # tokens when appending with lookahead. - if num_lookahead_slots > 0: - assert num_consumed_blocks <= expected_num_touched_blocks - else: - assert num_consumed_blocks == expected_num_touched_blocks diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py deleted file mode 100644 index 65400899b811c..0000000000000 --- a/tests/core/block/test_common.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest - -from vllm.core.block.common import RefCounter - - -@pytest.mark.parametrize("seed", list(range(20))) -@pytest.mark.parametrize("num_incrs", [1, 100]) -@pytest.mark.parametrize("num_blocks", [1024]) -def test_incr(seed: int, num_incrs: int, num_blocks: int): - random.seed(seed) - - all_block_indices = list(range(num_blocks)) - counter = RefCounter(all_block_indices=all_block_indices) - - block_id = random.randint(0, num_blocks - 1) - for i in range(num_incrs): - value = counter.incr(block_id) - assert value == i + 1 - - -@pytest.mark.parametrize("seed", list(range(20))) -@pytest.mark.parametrize("num_incrs", [1, 100]) -@pytest.mark.parametrize("num_blocks", [1024]) -def test_incr_decr(seed: int, num_incrs: int, num_blocks: int): - random.seed(seed) - - all_block_indices = list(range(num_blocks)) - counter = RefCounter(all_block_indices=all_block_indices) - - block_id = random.randint(0, num_blocks - 1) - for i in range(num_incrs): - value = counter.incr(block_id) - assert value == i + 1 - - for i in range(num_incrs): - value = counter.decr(block_id) - assert value == num_incrs - (i + 1) - - with pytest.raises(AssertionError): - counter.decr(block_id) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py deleted file mode 100644 index 795eef6743fd1..0000000000000 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ /dev/null @@ -1,96 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.utils import Device, chunk_list - - -@pytest.mark.parametrize("num_cpu_blocks", [0, 512]) -@pytest.mark.parametrize("num_gpu_blocks", 
[1024]) -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - cpu_blocks = [ - allocator.allocate_mutable_block(prev_block=None, device=Device.CPU) - for _ in range(num_cpu_blocks) - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - gpu_blocks = [ - allocator.allocate_mutable_block(prev_block=None, device=Device.GPU) - for _ in range(num_gpu_blocks) - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in cpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in gpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("num_cpu_blocks", [0, 512]) -@pytest.mark.parametrize("num_gpu_blocks", [1024]) -@pytest.mark.parametrize("block_size", [2]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - unique_token_ids = list( - range((num_cpu_blocks + num_gpu_blocks) * block_size)) - gpu_token_ids = list( - chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size)) - cpu_token_ids = list( - chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size)) - - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - cpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.CPU) - for token_ids in cpu_token_ids - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - gpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.GPU) - for token_ids in gpu_token_ids - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in cpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in gpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py deleted file mode 100644 index a31d1c46b37f0..0000000000000 --- a/tests/core/block/test_naive_block.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest - -from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator - - -class TestNaiveBlockAllocator: - - @staticmethod - def create_allocate_lambda(allocate_type: str, - allocator: NaiveBlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): - if allocate_type == "immutable": - allocate_block = lambda: allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) - elif allocate_type == "mutable": - allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) - else: - raise ValueError() - - return allocate_block - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_ooms(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - [allocate_block() for _ in range(num_blocks)] - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_free_prevents_oom(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - blocks = [allocate_block() for _ in range(num_blocks)] - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - block_to_free = blocks.pop() - - for _ in range(100): - block_id = block_to_free.block_id - allocator.free(block_to_free) - assert block_to_free.block_id is None - - new_block = allocate_block() - assert new_block.block_id == block_id - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - block_to_free = new_block - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - def test_get_num_free_blocks(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - assert allocator.get_num_free_blocks() == num_blocks - - blocks = [allocate_block() for _ in range(num_blocks)] - - for i, block in enumerate(blocks): - assert allocator.get_num_free_blocks() == i - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [4]) - @pytest.mark.parametrize("block_size", [8]) - def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size): - """ Verify the allocator can correctly return the number of - full blocks touched. 
- """ - allocator_src = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - - # Create a chain of cacheable blocks in the dst - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - "immutable", - allocator_src, - prev_block=None, - token_ids=list(range(block_size))) - src_blocks = [allocate_block() for _ in range(num_blocks - 1)] - - # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 - - # Insert one non-full block in the src - allocate_non_full_block = \ - TestNaiveBlockAllocator.create_allocate_lambda( - "mutable", allocator_src, - prev_block=src_blocks[-1],token_ids=[] - ) - src_blocks.append(allocate_non_full_block()) - src_blocks[-1].append_token_ids([0]) - - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 - # Fill up the last source block and then invoke - # get_num_blocks_touched - src_blocks[-1].append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py deleted file mode 100644 index 46e224c6f53b2..0000000000000 --- a/tests/core/block/test_prefix_caching_block.py +++ /dev/null @@ -1,1035 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -import random -from typing import Optional -from unittest.mock import MagicMock - -import pytest - -from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - PrefixCachingBlock, - PrefixCachingBlockAllocator) -from vllm.sequence import Logprob -from vllm.utils import Device - - -class TestPrefixCachingBlock: - - @staticmethod - @pytest.mark.parametrize("seed", list(range(10))) - @pytest.mark.parametrize("block_size", [1, 16]) - @pytest.mark.parametrize("is_curr_block_full", [True, False]) - def test_first_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool): - """Verify a block which is first in the sequence has the correct hash. - """ - random.seed(seed) - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) - token_ids = list(range(num_to_fill)) - mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - block_with_prev = PrefixCachingBlock(prev_block=None, - token_ids=token_ids, - block_size=block_size, - allocator=mock_allocator) - - if is_curr_block_full: - # Expect hash since block is full. - assert block_with_prev.content_hash == ( - PrefixCachingBlock.hash_block_tokens( - is_first_block=True, - prev_block_hash=None, - cur_block_token_ids=token_ids)) - else: - # Do not expect hash since block is not full. 
- assert block_with_prev.content_hash is None - - @staticmethod - @pytest.mark.parametrize("seed", list(range(10))) - @pytest.mark.parametrize("block_size", [1, 16]) - @pytest.mark.parametrize("is_curr_block_full", [True, False]) - @pytest.mark.parametrize("prev_block_has_hash", [True, False]) - def test_nth_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool, - prev_block_has_hash: bool): - """Verify a block which is not first in the sequence has the correct - hash. - """ - - random.seed(seed) - - previous_block = MagicMock(spec=PrefixCachingBlock) - prev_block_hash = random.randint(0, 1000) - previous_block.content_hash = (prev_block_hash if prev_block_has_hash - else hash('None')) - - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) - token_ids = list(range(num_to_fill)) - mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - block_with_prev = PrefixCachingBlock( - prev_block=previous_block, - token_ids=token_ids, - block_size=block_size, - allocator=mock_allocator, - ) - - if is_curr_block_full and prev_block_has_hash: - # Expect hash since block is full and previous block has hash. - assert (block_with_prev.content_hash == - PrefixCachingBlock.hash_block_tokens( - is_first_block=False, - prev_block_hash=prev_block_hash, - cur_block_token_ids=token_ids)) - else: - # Do not expect hash since block is not full or the previous block - # does not have a hash. - assert block_with_prev.content_hash is None - - @staticmethod - @pytest.mark.parametrize("block_size", [1, 2, 16]) - @pytest.mark.parametrize("num_tokens", list(range(3))) - @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10]) - def test_blocks_have_correct_hash_in_chain(block_size: int, - num_tokens: int, - num_empty_trailing_blocks: int): - """Create two chains of logical blocks with the same contents. - Assert the hashes are equal. - """ - random.seed(0) - - token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - - first_chain, second_chain = (TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2)) - - for first_chain_block, second_chain_block in zip( - first_chain, second_chain): - assert (first_chain_block.content_hash == - second_chain_block.content_hash) - - if not first_chain or not second_chain: - assert first_chain == second_chain - assert num_tokens == 0 - - @staticmethod - def create_chain(block_size: int, - token_ids: list[int], - num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. 
- """ - blocks: list[PrefixCachingBlock] = [] - num_blocks = math.ceil( - len(token_ids) / block_size) + num_empty_trailing_blocks - - if num_blocks == 0: - return [] - - allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - prev_block = None - for block_number in range(0, num_blocks): - prev_block = PrefixCachingBlock( - prev_block=prev_block, - token_ids=[], - block_size=block_size, - allocator=allocator, - ) - - tokens_to_append = token_ids[block_number * - block_size:(block_number + 1) * - block_size] - if tokens_to_append: - prev_block.append_token_ids(tokens_to_append) - - blocks.append(prev_block) - - return blocks - - -class TestPrefixCachingBlockAllocator: - - @staticmethod - def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): - if allocate_type == "immutable": - allocate_block = lambda: allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) - elif allocate_type == "mutable": - allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) - else: - raise ValueError() - - return allocate_block - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_mutable_ooms(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( - allocate_type="mutable", - allocator=allocator, - prev_block=None, - token_ids=list(range(block_size)), - ) - - [allocate_block() for _ in range(num_blocks)] - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_immutable_does_not_oom_single_hash( - num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( - allocate_type="immutable", - allocator=allocator, - prev_block=None, - token_ids=list(range(block_size)), - ) - - blocks = [allocate_block() for _ in range(num_blocks)] - - # Expect no OOM. If these were mutable blocks, this would OOM. - non_oom_block = allocate_block() - - # Expect all blocks to have same physical block index. - for block in blocks: - assert (block.block_id == non_oom_block.block_id) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_immutable_ooms_many_hash(num_blocks: int, - block_size: int): - """Consume all blocks using many different hashes/block content. - - Do this by creating a sequence that is very long. - Expect next block to OOM. - """ - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect allocation with unseen hash to fail. - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_immutable_block(prev_block=chain[-1], - token_ids=list( - range(block_size))) - - # Expect mutable allocation to fail. 
- with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=chain[-1]) - - # Expect allocation of exact same chain to pass. - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect physical block indices to be the same in both chains. - assert chain and second_chain - for first_chain_block, second_chain_block in zip(chain, second_chain): - assert (first_chain_block.block_id == second_chain_block.block_id) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_free_prevents_oom(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect mutable allocation to fail. - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=None) - - block_to_free = chain[-1] - - # Expect free/allocate loop to succeed many times. - for i in range(100): - block_id = block_to_free.block_id - allocator.free(block_to_free) - assert block_to_free.block_id is None, i - - new_block = allocator.allocate_mutable_block(prev_block=None) - assert new_block.block_id == block_id, i - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=None) - - block_to_free = new_block - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in chain, assert num free blocks includes new free - # block. - for i, block in enumerate(chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [4]) - @pytest.mark.parametrize("block_size", [8]) - def test_prefix_caching_block_get_num_full_blocks_touched( - num_blocks, block_size): - """ Verify the allocator can correctly return the number of - blocks touched, when there are cached prefixes. 
- """ - allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks except the last - token_ids = list(range((num_blocks - 1) * block_size)) - - # Create a chain of cacheable blocks in the dst - cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator_dst, - ) - - # Create a chain of the same blocks in the src - blocks_to_swap_in = \ - TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator_src, - ) - # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 0 - - # Free the first block in the dst - allocator_dst.free(cached_blocks[0]) - - # Now the first block becomes dangling, the swapped blocks need - # to reclaim the first block in the dst - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 - - # Insert one non-full block in the src - non_full_block = allocator_src.allocate_mutable_block( - blocks_to_swap_in[-1]) - non_full_block.append_token_ids([0]) - blocks_to_swap_in.append(non_full_block) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 - # Fill up the last mutable block and invoke get_num_blocks_touched. - # Note: The last block is not cached so it will be touched. - non_full_block.append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 2 - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_num_free_blocks_shared(num_blocks: int, block_size: int, - seed: int): - """Verify sharing occurs by allocating two sequences that share prefixes - and incrementally freeing blocks. - """ - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in the first chain. Since all blocks are shared, the - # free count should stay constant. - for i, block in enumerate(first_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume) - allocator.free(block) - - # Free each block in the second chain. Since the refcount is now zero, - # the free count should increment with each free. 
- for i, block in enumerate(second_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_common_computed_block_ids(num_blocks: int, block_size: int, - seed: int): - """Verify get_common_computed_block_ids could get correct result - by create two immutable chain sharing prefix at specified pos, - and compare whether we also could get right result - from get_common_computed_block_ids. - """ - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # After zero_point, second_chain's token_ids would be set -1, which - # make it different from here comparing with first_chain - zero_point = random.randint(1, len(token_ids) - 1) - zero_point_blocks = zero_point // block_size - token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point) - - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - first_computed_ids = [ - first_chain[i].block_id for i in range(num_blocks_to_consume) - ] - second_computed_ids = [ - second_chain[i].block_id for i in range(num_blocks_to_consume) - ] - res = allocator.get_common_computed_block_ids( - [first_computed_ids, second_computed_ids]) - - assert (len(res) == zero_point_blocks) - - # Test case that assume those prompted block after first immutable would - # be freed into hashless allocator, while first immutable block get ref - # increased. 
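Put differently, promotion deduplicates: when a mutable block fills up and its content hash is already cached, the duplicate's physical slot is released back to the hashless pool and the cached block gains a reference. A hypothetical sketch of that step (not the allocator's real method):

def promote(block_id: int, content_hash: int,
            cached: dict[int, int], refcounts: dict[int, int],
            hashless_free: list[int]) -> int:
    """Return the block id the caller should use after promotion."""
    if content_hash in cached:
        # Duplicate content: release this slot, share the cached block.
        winner = cached[content_hash]
        refcounts[winner] += 1
        hashless_free.append(block_id)
        return winner
    cached[content_hash] = block_id
    return block_id

cached, refcounts, hashless_free = {7: 0}, {0: 1, 1: 1}, []
assert promote(1, content_hash=7, cached=cached,
               refcounts=refcounts, hashless_free=hashless_free) == 0
assert refcounts[0] == 2 and hashless_free == [1]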
- @staticmethod - @pytest.mark.parametrize("num_blocks", [3]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_alloc_promotion(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(block_size)) - - block = allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - - assert allocator._refcounter.get(block.block_id) == 1 - m = allocator.allocate_mutable_block(prev_block=None) - - block_id = m.block_id - for i in range(block_size): - m.append_token_ids([i]) - - # After block get promoted to immutable from mutable, if there is - # already same content hash block, then it shall be released into - # hashless_allocator - # And first immutable block's ref get increased by 1 - assert m.block_id == block.block_id - assert block_id in allocator._hashless_allocator._free_block_indices - assert allocator._refcounter.get(block.block_id) == 2 - - # Test case when eviction and allocation are mixed, - # make sure they work as expected - @staticmethod - @pytest.mark.parametrize("num_blocks", [3]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - - all_blocks_list = [i for i in range(num_blocks)] - zero_ref = {i: 0 for i in range(num_blocks)} - one_ref = {i: 1 for i in range(num_blocks)} - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(num_blocks * block_size)) - - # Verify initial/pre-alloc state - - # Ensure all blocks are free inside hashless allocator - assert list(allocator._hashless_allocator._free_block_indices - ) == all_blocks_list - # Ensure no tracked blocks - assert len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert not allocator._block_tracker[block_id].active - # Ensure no cached blocks - assert len(allocator._cached_blocks.values()) == 0 - # Ensure no evicted blocks - assert len(allocator.evictor.free_table.keys()) == 0 - # Ensure 0s ref counts for all blocks - assert allocator._refcounter._refcounts == zero_ref - - # Allocate immutable chains with only one block residuled in - new_block = [] - for i in range(num_blocks): - block = allocator.allocate_immutable_block( - prev_block=None, - token_ids=token_ids[block_size * i:block_size * (i + 1)]) - new_block.append(block) - - # Verify post-alloc state - - # Ensure no blocks are free inside hashless allocator - assert (len(allocator._hashless_allocator._free_block_indices) == 0) - # Ensure all blocks are tracked - assert len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert allocator._block_tracker[block_id].active - # Ensure all blocks are cached (all promoted) - assert len(allocator._cached_blocks.values()) == num_blocks - # Ensure no evicted blocks - assert len(allocator.evictor.free_table.keys()) == 0 - # Ensure 1s ref counts for all blocks - assert allocator._refcounter._refcounts == one_ref - - # Free all blocks, and now all blocks shall be in the evictor - # there shall be no tracking data left in _block_tracker - # all blocks shall be tracked in _cached_blocks - # all blocks' ref shall be zero - for block in new_block: - allocator.free(block) - - # Verify post-free state - - # Ensure no tracked blocks - assert 
len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert not allocator._block_tracker[block_id].active - # Ensure no blocks in hashless allocator (all promoted) - assert len(allocator._hashless_allocator._free_block_indices) == 0 - # Ensure all blocks are cached - assert list(allocator._cached_blocks.values()) == all_blocks_list - # Ensure all blocks are inside the evictor - assert list(allocator.evictor.free_table.keys()) == all_blocks_list - # Ensure 0s refcounts - assert allocator._refcounter._refcounts == zero_ref - - # Allocate a mutable block, and the first block shall be evicted - # and set its content hash into None, ref to 1 - mutable = allocator.allocate_mutable_block(prev_block=None) - - assert mutable.block_id == 0 - assert mutable.content_hash is None - assert allocator._block_tracker[0].active - assert allocator._refcounter.get(0) == 1 - assert 0 not in allocator._cached_blocks - assert 0 not in allocator.evictor - - # Since this mutable block has no hash yet, it shall be released into - # hashless allocator - allocator.free(mutable) - - assert not allocator._block_tracker[0].active - assert allocator._refcounter._refcounts == zero_ref - assert 0 not in allocator._cached_blocks - assert 0 not in allocator.evictor - assert 0 in allocator._hashless_allocator._free_block_indices - - # When allocate immutable with first block_size tokens, we - # shall get free block from hashless allocator, thus no block left - # in hashless - block = allocator.allocate_immutable_block( - prev_block=None, token_ids=token_ids[:block_size]) - - assert block.block_id == 0 - assert len(allocator._hashless_allocator._free_block_indices) == 0 - assert allocator._block_tracker[0].active - assert 0 in allocator._cached_blocks.values() - assert allocator._refcounter.get(0) == 1 - assert 0 not in allocator.evictor - - # allocate mutable block again, it shall be popped from evictor - mutable = allocator.allocate_mutable_block(prev_block=None) - assert len(allocator._hashless_allocator._free_block_indices) == 0 - assert mutable.block_id not in allocator.evictor.free_table - assert allocator._refcounter.get(mutable.block_id) == 1 - - # Test case where two last accessed times are equal - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_eviction_order(num_blocks: int, block_size: int, seed: int): - """This test case simulate the two chain created and free in order, - and together they would exhaust the initial freed blocks. - - So the next block created after those two chain shall use the block - from the first chain as that block has long access time. - While first chain has two blocks, it shall pick up the last one, as - it has larger token number. 
- """ - - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = num_blocks + 1 - - token_ids = list(range(num_blocks_to_consume * block_size)) - - num_blocks_in_first_chain = 2 - num_tokens_in_first_chain = block_size * num_blocks_in_first_chain - # First chain takes the first block - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[:num_tokens_in_first_chain], - allocator=allocator, - ) - # There should only be one block allocated at this point - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_in_first_chain) - - # Set the last accessed time of the first block to 1 - blocks_ids = [block.block_id for block in first_chain] - allocator.mark_blocks_as_accessed(blocks_ids, 1) - - # Second chain takes the rest of the blocks - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[num_tokens_in_first_chain:-block_size], - allocator=allocator, - ) - - # There shouldn't be any blocks left at this point - assert allocator.get_num_free_blocks() == (0) - - assert len(first_chain) == num_blocks_in_first_chain - last_block_id = first_chain[-1].block_id - # Free each block in the first chain. - for i, block in enumerate(first_chain): - allocator.free(block) - - # Set the last accessed time on all of the blocks in the second chain - # to 2 - blocks_ids = [block.block_id for block in second_chain] - allocator.mark_blocks_as_accessed(blocks_ids, 2) - - # Free each block in the second chain. - for i, block in enumerate(second_chain): - allocator.free(block) - - # Allocate a new block and check that it's the least recently used block - # from the first chain. - new_block = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[-block_size:], - allocator=allocator, - ) - - assert new_block[0].block_id == last_block_id - - # Test case for cache mertics - @staticmethod - def test_metric(): - block_size = 16 - allocator = PrefixCachingBlockAllocator(num_blocks=4, - block_size=block_size) - # Test when no query (0/0) - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - token_ids = list(range(block_size)) - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - # Test 0/1 hit rate - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - # Test 1/2 hit rate - assert allocator.get_prefix_cache_hit_rate() == 0.5 - - # Test more than one block - for _ in range(2, 1005): - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - assert allocator.get_prefix_cache_hit_rate() > 0.99 - - # Test case for marking cache hit blocks as computed right after - # a batch of prefill sequences are scheduled. - @staticmethod - def test_touch_block(): - block_size = 16 - common_blocks = 4 - allocator = PrefixCachingBlockAllocator(num_blocks=8, - block_size=block_size) - - common_token_ids = list(range(block_size * common_blocks)) - - # Mimic the behavior of allocating the same block chain - # (i.e., common prefix) for a batch of 3 different prefill sequences. 
- for _ in range(3): - blocks = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=common_token_ids, - allocator=allocator, - ) - block_hashes = [block.content_hash for block in blocks] - # The allocated blocks should be marked as touched - # but not computed. - computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes) - assert len(computed_block_ids) == 0 - - allocator.mark_blocks_as_computed([]) - computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes) - assert len(computed_block_ids) == common_blocks - - @staticmethod - def test_find_cached_blocks_prefix(): - """ - This test verifies the behavior of find_cached_blocks_prefix. - """ - block_size = 4 - num_blocks = 8 - total_test_blocks = 12 - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - token_ids = list(range(total_test_blocks * block_size)) - block_tokens_seq1 = token_ids[:num_blocks * block_size] - blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=block_tokens_seq1, - allocator=allocator, - ) - block_hashes_seq1 = [block.content_hash for block in blocks_seq1] - allocator.mark_blocks_as_computed([]) - - # All blocks should be cached. - cached_blocks_seq1 = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks_seq1) == num_blocks - - # Free the first sequence. - for block in blocks_seq1: - allocator.free(block) - - # All blocks should be still be cached if not required to be allocated. - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks) == num_blocks - - block_tokens_seq2 = token_ids[num_blocks * block_size:] - blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=block_tokens_seq2, - allocator=allocator, - ) - block_hashes_seq2 = [block.content_hash for block in blocks_seq2] - allocator.mark_blocks_as_computed([]) - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq2) - assert len(cached_blocks) == len(blocks_seq2) - - # Half of the blocks from seq1 should still be cached. - num_evicted_blocks = len(blocks_seq2) - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks - - # Test reset prefix cache - @staticmethod - @pytest.mark.parametrize("num_blocks", [10]) - @pytest.mark.parametrize("block_size", [16]) - def test_reset_prefix_cache(num_blocks: int, block_size: int): - """This test case simulates the case of resetting the prefix cache.""" - - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(3 * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in the first chain. - for block in first_chain: - allocator.free(block) - - # Failed to reset prefix cache because some blocks are not freed yet. - assert not allocator.reset_prefix_cache() - assert allocator.get_prefix_cache_hit_rate() > 0.0 - - # Free each block in the second chain. - for block in second_chain: - allocator.free(block) - - # Reset prefix cache. 
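The contract checked here is that resetting the prefix cache is refused while any block still holds a live reference, and wipes the cached-hash table otherwise. A compact model of that rule (assumed semantics, simplified from the allocator):

def reset_prefix_cache(refcounts: dict[int, int],
                       cached: dict[int, int]) -> bool:
    if any(count > 0 for count in refcounts.values()):
        return False           # in-use blocks: refuse to reset
    cached.clear()             # drop all cached hashes
    return True

cached = {101: 0, 102: 1}
assert not reset_prefix_cache({0: 1, 1: 0}, cached)  # block 0 still held
assert reset_prefix_cache({0: 0, 1: 0}, cached)
assert cached == {}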
- assert allocator.reset_prefix_cache() - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - @staticmethod - def create_immutable_chain( - block_size: int, - token_ids: list[int], - allocator: PrefixCachingBlockAllocator, - extra_hash: Optional[int] = None, - ) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. - """ - blocks: list[Block] = [] - num_blocks = math.ceil(len(token_ids) / block_size) - - if num_blocks == 0: - return [] - - prev_block = None - for block_number in range(0, num_blocks): - block_token_ids = token_ids[block_number * - block_size:(block_number + 1) * - block_size] - prev_block = allocator.allocate_immutable_block( - prev_block=prev_block, - token_ids=block_token_ids, - extra_hash=extra_hash) - blocks.append(prev_block) - - return blocks - - -class TestComputedBlocksTracker: - - @staticmethod - def _get_mock_allocator(): - return MagicMock(spec=PrefixCachingBlockAllocator) - - @staticmethod - def test_get_num_cached_tokens(): - """ - Test it correctly computes the number of cached tokens for a given - sequence: - - - The cache token count is derived from the number of cached blocks. - - The cache token count is updated when the allocator is updated. - - When a sequence is removed, the cache token count should be updated - accordingly. - - # TODO(rickyx): This behaviour for prefill sequence is a hack until - we fix the computed blocks tracking. - - The cache token count for prefill sequence doesn't change while - the sequence is in continuous prefill (chunked prefill). - """ - block_size = 4 - mock_allocator = TestComputedBlocksTracker._get_mock_allocator() - tracker = ComputedBlocksTracker( - allocator=mock_allocator, - block_size=block_size, - enable_caching=True, - ) - - # Not yet allocated. - tokens = [0, 1, 2, 3, 4, 5] - seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - mock_allocator.find_cached_blocks_prefix.return_value = [] - assert tracker.get_num_cached_tokens(seq1) == 0 - - mock_allocator.find_cached_blocks_prefix.return_value = [ - None - ] # 1 block cached. - # Result is cached for prefill sequence. - assert tracker.get_num_cached_tokens(seq1) == 0 - - # Mark the sequence as non-prefill. - seq1.data.update_num_computed_tokens(len(tokens)) # 6 tokens computed. - assert not seq1.is_prefill() - - # Recomputes for decoding sequence. - assert tracker.get_num_cached_tokens(seq1) == 4 - - # Append new tokens to the sequence. - num_new_tokens = 3 - for i in range(num_new_tokens): - seq1.append_token_id(i, {i: Logprob(logprob=0.0)}) - - assert tracker.get_num_cached_tokens(seq1) == 4 - - # Update the allocator. - mock_allocator.find_cached_blocks_prefix.return_value = [ - None - ] * 2 # 2 blocks cached. - assert tracker.get_num_cached_tokens(seq1) == 8 - - # Remove the sequence. - tracker.remove_seq(seq1.seq_id) - - # Re-create the sequence with the same request id to simulate recompute. - seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - mock_allocator.find_cached_blocks_prefix.return_value = [ - ] # no cached block - assert tracker.get_num_cached_tokens(seq1) == 0 - - @staticmethod - def test_correct_block_hash(): - """ - Test that the block hash is correctly computed for a sequence (should - match the underlying block allocator's block hash). So the number of - cached tokens is correctly retrieved. 
- """ - block_size = 4 - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=16, - num_cpu_blocks=16, - block_size=block_size, - ) - gpu_allocator = allocator._allocators[Device.GPU] - - tracker = ComputedBlocksTracker( - allocator=allocator, - block_size=block_size, - enable_caching=True, - ) - - tokens = list(range(block_size * 4)) # 4 blocks. - seq = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - _ = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=tokens, - allocator=gpu_allocator, - ) - allocator.mark_blocks_as_computed([]) - - assert tracker.get_num_cached_tokens(seq) == len(tokens) - - @staticmethod - def test_correct_extra_hash(): - """ - Test that the block hash is correctly computed based on the extra hash, - ensuring it matches the allocator's block hash, specifically for the - LoRA case, and that the correct number of cached tokens is retrieved. - """ - block_size = 4 - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=16, - num_cpu_blocks=16, - block_size=block_size, - ) - gpu_allocator = allocator._allocators[Device.GPU] - - tracker = ComputedBlocksTracker( - allocator=allocator, - block_size=block_size, - enable_caching=True, - ) - - tokens = list(range(block_size * 4)) - - # Create a dummy LoRA sequence with a specific LoRA ID. - lora_seq = create_dummy_lora_sequence(request_id=0, - token_ids=tokens, - block_size=block_size, - lora_int_id=1) - - _ = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=tokens, - allocator=gpu_allocator, - extra_hash=lora_seq.extra_hash(), - ) - - allocator.mark_blocks_as_computed([]) - - # Create different dummy sequences that have the same token IDs - # but different LoRA IDs. - seq = create_dummy_sequence(request_id=1, - token_ids=tokens, - block_size=block_size) - - different_lora_seq = create_dummy_lora_sequence(request_id=2, - token_ids=tokens, - block_size=block_size, - lora_int_id=2) - - # Due to the different LoRA IDs, corresponding blocks are not cached. - assert tracker.get_num_cached_tokens(seq) == 0 - assert tracker.get_num_cached_tokens(different_lora_seq) == 0 - - # The number of cached tokens matches the length of the tokens - # for the cached LoRA sequence. - assert tracker.get_num_cached_tokens(lora_seq) == len(tokens) diff --git a/tests/core/conftest.py b/tests/core/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/core/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py deleted file mode 100644 index ce1fe189b3ca1..0000000000000 --- a/tests/core/test_chunked_prefill_scheduler.py +++ /dev/null @@ -1,858 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock - -import pytest # noqa - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, SequenceGroup - -from .utils import create_dummy_prompt - - -def get_sequence_groups(scheduler_output): - return [s.seq_group for s in scheduler_output.scheduled_seq_groups] - - -def append_new_token(seq_group: SequenceGroup, token_id: int): - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def schedule_and_update_computed_tokens(scheduler): - metas, out, _ = scheduler.schedule() - for s, meta in zip(out.scheduled_seq_groups, metas): - s.seq_group.update_num_computed_tokens(meta.token_chunk_size) - return metas, out - - -def test_simple(): - """Verify basic scheduling works.""" - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - num_seq_group, - max_model_len, - enable_chunked_prefill=True) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prompts. - num_tokens = block_size * num_seq_group - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_tokens - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - for s in running: - append_new_token(s, 1) - - # Schedule seq groups generation. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_seq_group - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - - -def test_chunk(): - """Verify prefills are chunked properly.""" - block_size = 4 - max_seqs = 60 - max_model_len = 80 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. 
- for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Verify the second request is chunked. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - print() - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 60 - # Verify it is chunked. - assert seq_group_meta[1].token_chunk_size == 4 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - # Only the first seq group has a new token appended. - append_new_token(running[0], 1) - - # One chunked prefill, and one decoding. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - # The first one is prefill. Scheduler guarantees ordering. - assert seq_group_meta[0].token_chunk_size == 56 - # The second one is a chunked prefill. - assert seq_group_meta[1].token_chunk_size == 1 - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 57 - - -def test_concurrent_chunking(): - """Verify prefills are chunked properly when - --max-num-partial-prefills is > 1""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Verify both requests are chunked with half of max_num_batched_tokens each - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 32 - assert seq_group_meta[1].token_chunk_size == 32 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - - # After one iteration, both should have 60 - 32 = 28 tokens left to prefill - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 28 - assert seq_group_meta[1].token_chunk_size == 28 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 56 - - -def test_concurrent_chunking_large_requests(): - """Verify large prefill requests are run one at a time""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests - cache_config.num_gpu_blocks = 3200 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add seq groups to scheduler. 
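With max_num_partial_prefills=2 the budget is split evenly across concurrent prefills (32 plus 32 above), yet this test expects a single 1200-token request to receive all 64 tokens. One admission rule consistent with both assertions, sketched under those assumptions (the real scheduler's heuristic for large requests is more involved):

def admit_prefills(prompt_lens: list[int], budget: int,
                   max_partial: int) -> list[int]:
    """Return per-request chunk sizes for one scheduling step."""
    share = budget // max_partial
    chunks: list[int] = []
    for length in prompt_lens:
        if not chunks and length >= budget:
            return [budget]    # a huge request takes the whole budget alone
        take = min(length, share, budget)
        if take == 0:
            break
        chunks.append(take)
        budget -= take
    return chunks

assert admit_prefills([60, 60], budget=64, max_partial=2) == [32, 32]
assert admit_prefills([1200, 1200], budget=64, max_partial=2) == [64]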
- for i in range(2): - _, seq_group = create_dummy_prompt( - str(i), - prompt_length=1200, # Very large prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - - # Verify only a single request is chunked, and it gets all 64 tokens - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 1 - assert seq_group_meta[0].token_chunk_size == 64 - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 64 - - -def test_short_prompts_jump_long_prompts_in_queue(): - """Verify large prefill requests are punted behind smaller ones if - another large prefill request is already running""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests - cache_config.num_gpu_blocks = 3200 - scheduler = Scheduler(scheduler_config, cache_config, None) - long_seqs: list[SequenceGroup] = [] - short_seqs: list[SequenceGroup] = [] - - # Add 2 large seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt( - str(i), - prompt_length=1200, # Very large prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - long_seqs.append(seq_group) - assert seq_group.is_prefill() - - # Add 2 small seq groups behind them - for i in range(2): - _, seq_group = create_dummy_prompt( - str(i + 2), - prompt_length=40, # Very small prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - short_seqs.append(seq_group) - assert seq_group.is_prefill() - - # Verify one large req and 1 small req chunked - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert seq_group_meta[0].token_chunk_size == 32 # large req gets 32 tokens - assert seq_group_meta[1].token_chunk_size == 32 # small req gets 32 tokens - - # all 4 are prefilling - assert long_seqs[0].is_prefill() - assert long_seqs[1].is_prefill() - assert short_seqs[0].is_prefill() - assert short_seqs[1].is_prefill() - # First short and first long sequences have been scheduled - assert long_seqs[0].first_seq.get_num_computed_tokens() == 32 - assert long_seqs[1].first_seq.get_num_computed_tokens() == 0 - assert short_seqs[0].first_seq.get_num_computed_tokens() == 32 - assert short_seqs[1].first_seq.get_num_computed_tokens() == 0 - - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - - # in the second iteration, - # the first small request had only 8 tokens left - # so it went to decode - # The other small req is scheduled - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - # the new small req got 64 - (32+8) tokens - assert seq_group_meta[0].token_chunk_size == 24 - assert seq_group_meta[1].token_chunk_size == 32 # large req still got 32 - # the other small request had only 8 tokens left - assert seq_group_meta[2].token_chunk_size == 8 # 40-32 - - # The first small request got to decode now - assert long_seqs[0].is_prefill() - assert long_seqs[1].is_prefill() - assert not short_seqs[0].is_prefill() - assert short_seqs[1].is_prefill() - # Both small requests have started in front of the second long request - assert long_seqs[0].first_seq.get_num_computed_tokens() == 64 - assert long_seqs[1].first_seq.get_num_computed_tokens() == 
-    assert short_seqs[0].first_seq.get_num_computed_tokens() == 40
-    assert short_seqs[1].first_seq.get_num_computed_tokens() == 24
-
-    assert out.num_prefill_groups == 3
-    assert out.num_batched_tokens == 64
-    # the first small seq group has a new token appended.
-    append_new_token(short_seqs[0], 1)
-
-    # in the third iteration,
-    # the first small request is already decoding
-    # the second small request only has 16 tokens left and will enter decoding
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == 32  # large still got 32
-    # small req finished prefilling 40-24=16 tokens
-    assert seq_group_meta[1].token_chunk_size == 16
-    assert seq_group_meta[2].token_chunk_size == 1  # decode
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 49  # (32+16+1 decode)
-
-    # both small requests have now reached decode
-    assert long_seqs[0].is_prefill()
-    assert long_seqs[1].is_prefill()
-    assert not short_seqs[0].is_prefill()
-    assert not short_seqs[1].is_prefill()
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 96
-    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
-    assert short_seqs[0].first_seq.get_num_computed_tokens() == 41
-    assert short_seqs[1].first_seq.get_num_computed_tokens() == 40
-
-    # both the small seq groups have a new token appended
-    append_new_token(short_seqs[0], 1)
-    append_new_token(short_seqs[1], 1)
-
-    # in the fourth iteration, both small requests are decoding
-    # so large request gets all the budget
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-
-    # large req gets 62 tokens (minus 2 for decode)
-    assert seq_group_meta[0].token_chunk_size == 62
-    assert seq_group_meta[1].token_chunk_size == 1  # decode
-    assert seq_group_meta[2].token_chunk_size == 1  # decode
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 64
-
-    assert long_seqs[0].first_seq.get_num_computed_tokens() == 158
-
-    # assert long_seqs[0].is_prefill()
-    # assert long_seqs[1].is_prefill()
-    # assert not short_seqs[0].is_prefill()
-    # assert not short_seqs[1].is_prefill()
-
-    # # both the small seq groups have a new token appended
-    # append_new_token(short_seqs[0], 1)
-    # append_new_token(short_seqs[1], 1)
-
-    # # in the fifth iteration, large request gets all the budget
-    # # while both small requests are decoding
-    # seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    # assert seq_group_meta[0].token_chunk_size == 62
-    # assert seq_group_meta[1].token_chunk_size == 1  # decode
-    # assert seq_group_meta[2].token_chunk_size == 1  # decode
-    # assert out.num_prefill_groups == 1
-    # assert out.num_batched_tokens == 64
-
-
-def test_complex():
-    block_size = 4
-    max_seqs = 60
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 64
-    cache_config.num_gpu_blocks = 64
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-        assert seq_group.is_prefill()
-
-    # Verify the second request is chunked.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 60
-    # Verify it is chunked.
-    assert seq_group_meta[1].token_chunk_size == 4
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # Only the first seq group has a new token appended.
-    append_new_token(running[0], 1)
-
-    # Add 2 more requests.
-    for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-
-    # Decoding & chunked prefill & first chunk of 3rd request is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 3
-    # The first one is the first chunked prefill.
-    assert seq_group_meta[0].token_chunk_size == 7
-    # The second one is the second new chunked prefill.
-    assert seq_group_meta[1].token_chunk_size == 56
-    # The last one is decode.
-    assert seq_group_meta[2].token_chunk_size == 1
-    # Two of them are in chunked prefill.
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 64
-    # The first 2 requests are now in the decoding phase.
-    append_new_token(running[0], 1)
-    assert not running[0].is_prefill()
-    append_new_token(running[1], 1)
-    assert not running[1].is_prefill()
-    # The third request is still in prefill stage.
-    assert running[2].is_prefill()
-
-
-def test_maximal_decoding():
-    """Verify decoding requests are prioritized."""
-    block_size = 4
-    max_seqs = 2
-    max_model_len = 8
-    max_num_batched_tokens = 2
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=2,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-        assert seq_group.is_prefill()
-
-    # The first prefill is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 1
-    assert seq_group_meta[0].token_chunk_size == 2
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    # Only the first seq group has a new token appended.
-    append_new_token(running[0], 1)
-
-    # Create one more seq_group.
-    _, seq_group = create_dummy_prompt("3",
-                                       prompt_length=2,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-    # The first decoding + second chunk is scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    assert running[2].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-
-    # Decoding + running prefill is prioritized.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-    append_new_token(running[1], 1)
-
-    # Only decoding is prioritized.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-    assert out.num_prefill_groups == 0
-    assert out.num_batched_tokens == 2
-    append_new_token(running[0], 1)
-    append_new_token(running[1], 1)
-
-    # After aborting the decoding request, the FCFS new prefill is prioritized.
-    scheduler.abort_seq_group(running[0].request_id)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 2
-    assert seq_group_meta[0].token_chunk_size == 1
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert not running[1].is_prefill()
-    assert running[2].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 2
-
-
-def test_prompt_limit():
-    """Verify max_num_batched_tokens < max_model_len is possible."""
-    block_size = 4
-    max_seqs = 32
-    max_model_len = 64
-    max_num_batched_tokens = 32
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=48,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-
-    # The prompt length > max_num_batched_tokens should still be scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(get_sequence_groups(out)) == 1
-    assert seq_group_meta[0].token_chunk_size == 32
-    assert running[0].is_prefill()
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == 32
-
-
-def test_prompt_limit_exceed():
-    block_size = 4
-    max_seqs = 64
-    max_model_len = 32
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig("generate",
-                                       max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True)
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-    _, seq_group = create_dummy_prompt("2",
-                                       prompt_length=48,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    assert seq_group.is_prefill()
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.ignored_seq_groups) == 1
-    assert out.ignored_seq_groups[0] == seq_group
-
-
-def test_chunked_prefill_preempt():
-    """Verify preempt works with chunked prefill requests"""
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-
-    # The request should be preempted.
-    scheduler.block_manager.can_append_slots = MagicMock()
-
-    def cannot_append_second_group1(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "1"
-
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group1)
-
-    # The running prefill is now preempted.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 0
-    assert out.num_batched_tokens == 0
-    assert out.blocks_to_swap_out == []
-    assert out.blocks_to_swap_in == []
-
-    # Make sure we can reschedule preempted request.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-    assert seq_group.get_num_uncomputed_tokens() == 30
-
-    # We should be able to run prefill twice as it is chunked.
-    def cannot_append_second_group2(seq_group, num_lookahead_slots):
-        return True
-
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group2)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert not seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-
-
-def test_chunked_prefill_spec_prefill():
-    """Verify that the num_lookahead_slots is set appropriately for an all
-    prefill batch."""
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    num_lookahead_slots = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-        num_lookahead_slots=num_lookahead_slots,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=30,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert out.num_batched_tokens == max_num_batched_tokens
-    print(out.num_lookahead_slots)
-    assert out.num_lookahead_slots == 0
-
-
-def test_chunked_prefill_max_seqs():
-    block_size = 4
-    max_seqs = 2
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 128
-    cache_config.num_gpu_blocks = 128
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=65,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    running.append(seq_group)
-    # The first prefill is chunked.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens
-    assert len(get_sequence_groups(out)) == 1
-
-    # Add new requests.
-    for i in range(4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=65,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-
-    # Make sure only 2 requests are scheduled.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_batched_tokens == max_num_batched_tokens
-    assert len(get_sequence_groups(out)) == 2
-    assert not running[0].is_prefill()
-    assert running[1].is_prefill()
-    append_new_token(running[0], 1)
-
-    # Although we have enough token budget, we can only schedule max_seqs.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert seq_group_meta[0].token_chunk_size == 2
-    assert seq_group_meta[1].token_chunk_size == 1
-    assert out.num_batched_tokens == 3
-    assert len(get_sequence_groups(out)) == max_seqs
-    assert not running[0].is_prefill()
-    assert not running[1].is_prefill()
-
-
-def test_prefix_caching():
-    """Verify allocating full blocks when prefix caching is enabled."""
-    block_size = 4
-    max_seqs = 10
-    max_model_len = 80
-    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size,
-                               1.0,
-                               1,
-                               "auto",
-                               enable_prefix_caching=True)
-    cache_config.num_cpu_blocks = 0
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size=block_size,
-                                           prompt_length=50)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert seq_group_meta[0].token_chunk_size == 50
-    # Verify it is chunked. Note that although the budget is 64-50=14,
-    # we only allocate full blocks for prefix caching, so only 4*(14//4)=12
-    # tokens are allocated.
-    assert seq_group_meta[1].token_chunk_size == 12
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 62
-
-
-def test_prefix_caching_with_concurrent_partial_prefills():
-    """Verify allocating full blocks when prefix caching is enabled with
-    --max-num-partial-prefills > 1."""
-    block_size = 4
-    max_seqs = 10
-    max_model_len = 8000
-    max_num_batched_tokens = 60  # With two slots, each slot will get 30 tokens
-    scheduler_config = SchedulerConfig("generate",
-                                       max_num_batched_tokens,
-                                       max_seqs,
-                                       max_model_len,
-                                       enable_chunked_prefill=True,
-                                       max_num_partial_prefills=2)
-    cache_config = CacheConfig(block_size,
-                               1.0,
-                               1,
-                               "auto",
-                               enable_prefix_caching=True)
-    cache_config.num_cpu_blocks = 0
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    # Add seq groups to scheduler.
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size=block_size,
-                                           prompt_length=50)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    # To partially prefill both sequences, both can chunk up to 30 tokens
-    # But the next lowest multiple of the block size (4) is 28
-    assert seq_group_meta[0].token_chunk_size == 28
-    assert seq_group_meta[1].token_chunk_size == 28
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 56
-
-    # On the next iteration, both sequences should finish prefill
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    # Both sequences have 50 - 28 = 22 tokens left to prefill.
-    # This is not a multiple of the block size, but we don't care since we
-    # don't cache the final partial block of prefix sequences
-    assert seq_group_meta[0].token_chunk_size == 22
-    assert seq_group_meta[1].token_chunk_size == 22
-    assert out.num_prefill_groups == 2
-    assert out.num_batched_tokens == 44
-
-
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
-def test_chunked_prefill_with_actual_engine(model: str,
-                                            max_num_partial_prefills: int):
-    """Make sure the model can actually sample with concurrent
-    partial prefills
-    """
-
-    prompt = "hello" * 40
-
-    engine_args = EngineArgs(
-        model=model,
-        max_num_partial_prefills=max_num_partial_prefills,
-        max_num_batched_tokens=40,
-        max_num_seqs=8,
-        enable_chunked_prefill=True,
-        gpu_memory_utilization=0.8,
-    )
-
-    engine = LLMEngine.from_engine_args(engine_args)
-    sampling_params = SamplingParams(temperature=0)
-
-    for req_num in range(max_num_partial_prefills):
-        engine.add_request(f"{req_num}", prompt, sampling_params)
-    # first step
-    request_outputs = engine.step()
-    # means all are prefilling
-    assert len(request_outputs) == 0
-    assert len(engine.scheduler[0].running) == max_num_partial_prefills
diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py
deleted file mode 100644
index 131a7b3a6299b..0000000000000
--- a/tests/core/test_num_computed_tokens_update.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from tests.conftest import VllmRunner
-from tests.core.utils import create_dummy_prompt
-from vllm.engine.llm_engine import LLMEngine
-from vllm.sequence import SequenceGroup
-
-MODEL = "JackFram/llama-160m"
-
-
-def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
-    scheduler = engine.scheduler[0]
-    scheduler.add_seq_group(seq_group)
-
-
-@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-def test_num_computed_tokens_update(enable_chunked_prefill: bool,
-                                    enforce_eager: bool):
-
-    # Make a vllm engine
-    runner = VllmRunner(model_name=MODEL,
-                        gpu_memory_utilization=0.7,
-                        enable_chunked_prefill=enable_chunked_prefill,
-                        enforce_eager=enforce_eager)
-    engine: LLMEngine = runner.llm.llm_engine
-
-    num_prompt_steps = 1
-
-    num_output_tokens_list = [4, 8, 12, 15, 16, 17]
-
-    # Create sequence and add to engine
-    prompt_len = 10
-
-    for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
-        seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
-                                             prompt_length=prompt_len,
-                                             min_tokens=num_output_tokens,
-                                             max_tokens=num_output_tokens)
-        add_seq_group_to_engine(engine, seq_group)
-
-        assert seq.data.get_num_computed_tokens() == 0
-
-        for _ in range(num_prompt_steps):
-            # prompt steps
-            engine.step()
-
-        if not seq.is_finished():
-            prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
-            # Test correctness of num_computed_tokens after the prompt steps
-            assert prompt_num_computed_tokens == \
-                prompt_len + num_prompt_steps - 1
-
-            decode_step_counter = 0
-            while not seq.is_finished():
-                # Test correctness of num_computed_tokens after the decode steps
-                assert seq.data.get_num_computed_tokens(
-                ) == prompt_num_computed_tokens + decode_step_counter
-                engine.step()
-                decode_step_counter += 1
-
-        # Test correctness of num_computed_tokens after the sequence finish.
-        assert seq.data.get_num_computed_tokens(
-        ) == prompt_len + num_output_tokens - 1
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
deleted file mode 100644
index 86e08328c43b0..0000000000000
--- a/tests/core/test_scheduler.py
+++ /dev/null
@@ -1,1338 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import time
-from collections import deque
-from typing import Optional
-from unittest.mock import MagicMock
-
-import pytest  # noqa
-import torch
-from torch import Use  # noqa
-
-from vllm.config import CacheConfig, SchedulerConfig
-from vllm.config.lora import LoRAConfig
-from vllm.core.interfaces import AllocStatus
-from vllm.core.scheduler import Scheduler, SchedulingBudget
-from vllm.lora.request import LoRARequest
-from vllm.sequence import SequenceGroup, SequenceStatus
-
-from .utils import (append_new_token, append_new_token_seq,
-                    append_new_token_seq_group, create_dummy_prompt,
-                    get_sequence_groups, schedule_and_update_computed_tokens)
-
-
-def test_scheduler_add_seq_group():
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=1,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
-    cache_config.num_cpu_blocks = 4
-    cache_config.num_gpu_blocks = 4
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    # Add seq group to scheduler.
-    num_seq_group = 4
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           block_size,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        assert scheduler.get_num_unfinished_seq_groups() == i + 1
-
-
-def test_scheduler_abort_seq_group():
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=1,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 4
-    cache_config.num_gpu_blocks = 4
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    # Add multiple seq groups to scheduler.
-    num_seq_group = 4
-    request_ids: set[str] = set()
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
-        scheduler.add_seq_group(seq_group)
-        request_ids.add(str(i))
-
-    # Abort all added seq groups.
-    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
-    scheduler.abort_seq_group(request_ids)
-    assert scheduler.get_num_unfinished_seq_groups() == 0
-
-
-def test_scheduler_schedule_simple():
-    block_size = 4
-    num_seq_group = 4
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=num_seq_group,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    running: list[SequenceGroup] = []
-
-    # Add seq groups to scheduler.
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=block_size,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-        running.append(seq_group)
-
-    # Schedule seq groups prompts.
-    num_tokens = block_size * num_seq_group
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == num_tokens
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == num_seq_group
-    append_new_token(out, 1)
-
-    # Schedule seq groups generation.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set(running)
-    assert out.num_batched_tokens == num_seq_group
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == num_seq_group
-    append_new_token(out, 1)
-
-
-def test_scheduler_prefill_prioritized():
-    """Verify running batched tokens are not applied to prefill requests."""
-    block_size = 4
-    max_model_len = 30
-    max_batched_num_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=max_batched_num_tokens,
-        max_num_seqs=2,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    # Add seq groups to scheduler.
-    _, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
-    scheduler.add_seq_group(seq_group_a)
-
-    # Schedule seq groups prompts.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a]
-
-    # Add a new prefill request B.
-    _, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
-    scheduler.add_seq_group(seq_group_b)
-
-    # Verify prefill requests are prioritized. Since max_batched_num_tokens
-    # is 30, the new prefill request has to be scheduled first.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_b]
-
-
-def test_scheduler_schedule_preempt_abort():
-    block_size = 4
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=2,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 2
-    cache_config.num_gpu_blocks = 2
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    # Add seq groups to scheduler.
-    seq_a, seq_group_a = create_dummy_prompt("1",
-                                             block_size,
-                                             block_size=block_size)
-    seq_b, seq_group_b = create_dummy_prompt("2",
-                                             block_size,
-                                             block_size=block_size)
-    scheduler.add_seq_group(seq_group_a)
-    scheduler.add_seq_group(seq_group_b)
-
-    # Schedule seq groups prompts.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
-    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 2
-    assert scheduler.get_num_unfinished_seq_groups() == 2
-
-    # Append "generated" tokens, allowing the sequence to mark prompt tokens as
-    # processed.
-    append_new_token(out, 1)
-
-    # Schedule seq groups generation and preempt seq group b.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_a]
-    assert out.num_batched_tokens == 1
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 1
-    assert scheduler.get_num_unfinished_seq_groups() == 2
-    assert out.preempted == 1
-
-    # Abort seq group a. Re-schedule seq group b prompt with recomputation.
-    scheduler.abort_seq_group("1")
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert get_sequence_groups(out) == [seq_group_b]
-    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
-    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
-            and not out.blocks_to_swap_out)
-    assert len(seq_group_meta) == 1
-    assert scheduler.get_num_unfinished_seq_groups() == 1
-
-
-def test_scheduler_max_seqs():
-    block_size = 4
-    num_seq_group = 4
-    max_seq_group = 2
-    max_model_len = 16
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=64,
-        max_num_seqs=max_seq_group,
-        max_model_len=max_model_len,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    all_seq_groups: list[SequenceGroup] = []
-    # Add seq groups to scheduler.
-    for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=block_size,
-                                           block_size=block_size)
-        all_seq_groups.append(seq_group)
-
-    # Append 1 seq group
-    scheduler.add_seq_group(all_seq_groups[0])
-
-    # Schedule seq groups prompts.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
-    append_new_token(out, 1)
-
-    # Schedule seq groups generation.
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
-    append_new_token(out, 1)
-
-    # Append 2 more seq group
-    scheduler.add_seq_group(all_seq_groups[1])
-    scheduler.add_seq_group(all_seq_groups[2])
-
-    # Schedule seq groups prompts.
-    # Only 1 seq group should be scheduled since max_seq_group is 2
-    # and one is already running.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
-
-
-def test_scheduler_delay_factor():
-    block_size = 4
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=100,
-        max_num_seqs=64,
-        max_model_len=16,
-        delay_factor=0.5,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-
-    # schedule first prompt
-    seq_group_meta, seq_group = create_dummy_prompt("0",
-                                                    prompt_length=block_size,
-                                                    block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups > 0
-    assert seq_group_meta[0].request_id == '0'
-    append_new_token(out, 1)
-
-    # wait for a second before scheduling next prompt
-    time.sleep(1)
-    seq_group_meta, seq_group = create_dummy_prompt("1",
-                                                    prompt_length=block_size,
-                                                    block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-
-    # second prompt should *not* be scheduled
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups == 0
-    assert seq_group_meta[0].request_id == '0'
-    append_new_token(out, 1)
-
-    # wait for more than 0.5 second and try again
-    time.sleep(0.6)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert out.num_prefill_groups > 0
-    assert seq_group_meta[0].request_id == '1'
-    append_new_token(out, 1)
-
-
-def initialize_scheduler(
-    *,
-    max_num_seqs=1000,
-    max_token_budget=1000,
-    max_model_len=1000,
-    lora_config=None,
-    block_size=4,
-    num_cpu_blocks=8,
-    num_gpu_blocks=8,
-    enable_prefix_caching=False,
-    enable_chunked_prefill=False,
-):
-    block_size = block_size
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens=max_token_budget,
-        max_num_seqs=max_num_seqs,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
-    cache_config = CacheConfig(
-        block_size,
-        1.0,
-        1,
-        "auto",
-        enable_prefix_caching=enable_prefix_caching,
-    )
-    cache_config.num_cpu_blocks = num_cpu_blocks
-    cache_config.num_gpu_blocks = num_gpu_blocks
-    scheduler = Scheduler(scheduler_config, cache_config, lora_config)
-    return scheduler
-
-
-def create_token_budget(token_budget: int = 10000,
-                        max_num_seqs: int = 10000) -> SchedulingBudget:
-    return SchedulingBudget(
-        token_budget=token_budget,
-        max_num_seqs=max_num_seqs,
-    )
-
-
-def add_token_budget(budget: SchedulingBudget,
-                     num_batched_tokens: int = 0,
-                     num_curr_seqs: int = 0):
-    mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1]
-    budget.add_num_batched_tokens(mock_seq_group.request_id,
-                                  num_batched_tokens)
-    budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
-
-
-def test_prefill_schedule_max_prompt_len():
-    """
-    Test prompt longer than max_prompt_len is aborted.
- """ - block_size = 4 - scheduler = initialize_scheduler(max_model_len=30, block_size=block_size) - _, seq_group = create_dummy_prompt("0", - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - budget = create_token_budget() - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 1 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 0 - - -def test_prefill_schedule_token_budget(): - """ - Test token budget respected. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - budget = create_token_budget(token_budget=0) - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - - # 0 token budget == nothing is scheduled. - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 2 - - # 60 token budget == 1 request scheduled. - budget = create_token_budget(token_budget=60) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 1 - assert budget.num_batched_tokens == 60 - assert budget.num_curr_seqs == 1 - assert len(remaining_waiting) == 1 - - # Test when current_batched_tokens respected. - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16) - budget = create_token_budget(token_budget=60) - add_token_budget(budget, 30, 0) - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - # Cannot schedule a prompt that doesn't fit the budget. - scheduler.add_seq_group(seq_group) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 30 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 1 - budget = create_token_budget(token_budget=90) - add_token_budget(budget, 30, 0) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.seq_groups) == 1 - assert budget.num_batched_tokens == 90 - assert budget.num_curr_seqs == 1 - assert len(remaining_waiting) == 0 - - -def test_prefill_schedule_max_seqs(): - """ - Test max seq respected. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - budget = create_token_budget(max_num_seqs=2) - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 2 - assert budget.num_batched_tokens == 120 - assert budget.num_curr_seqs == 2 - assert len(remaining_waiting) == 1 - - # Verify curr_num_seqs respected. 
-    scheduler.waiting = deque()
-    budget = create_token_budget(max_num_seqs=2)
-    add_token_budget(budget, 0, 2)
-    _, seq_group = create_dummy_prompt(str(i),
-                                       prompt_length=60,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    output = scheduler._schedule_prefills(budget, None)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 0
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 2
-    assert len(remaining_waiting) == 1
-
-
-def test_prefill_schedule_max_lora():
-    """
-    Test max lora is respected and prioritized.
-    """
-    block_size = 4
-    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    budget = create_token_budget(token_budget=120)
-    curr_loras: set[int] = set()
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size,
-                                           lora_request=LoRARequest(
-                                               lora_name=str(i),
-                                               lora_int_id=i + 1,
-                                               lora_path="abc"))
-        scheduler.add_seq_group(seq_group)
-    # Add two more requests to verify lora is prioritized.
-    # 0: LoRA, 1: LoRA, 2: regular, 3: regular
-    # In the first iteration, index 0, 2 is scheduled.
-    # If a request is not scheduled because it hits max lora, it is
-    # prioritized. Verify that.
-    for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    # Schedule 2 requests (0 and 2)
-    output = scheduler._schedule_prefills(budget, curr_loras)
-    remaining_waiting = scheduler.waiting
-    assert len(output.ignored_seq_groups) == 0
-    assert len(output.seq_groups) == 2
-    assert budget.num_batched_tokens == 120
-    assert budget.num_curr_seqs == 2
-    assert len(remaining_waiting) == 2
-    assert len(curr_loras) == 1
-    # The second lora request is scheduled next as FCFS policy.
-    # Reset curr_loras so that it can be scheduled.
-    curr_loras = set()
-    budget = create_token_budget(token_budget=60)
-    output = scheduler._schedule_prefills(budget, curr_loras)
-    remaining_waiting = scheduler.waiting
-    assert len(output.seq_groups) == 1
-    assert output.seq_groups[0].seq_group.request_id == "1"
-    assert len(remaining_waiting) == 1
-    assert len(curr_loras) == 1
-    assert budget.num_batched_tokens == 60
-
-
-def test_prefill_schedule_no_block_manager_capacity():
-    """
-    Test sequence cannot be scheduled when the block manager has no capacity.
- """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_gpu_blocks=128, - num_cpu_blocks=128) - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - scheduler.block_manager.can_allocate = MagicMock() - scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 3 - - scheduler = initialize_scheduler() - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - scheduler.block_manager.can_allocate = MagicMock() - scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 3 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 0 - - -def test_decode_schedule_preempted(): - """ - Test decodes cannot be scheduled and preempted. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - curr_loras = None - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._add_seq_group_to_running(seq_group) - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # 1 cannot be scheduled, and the lowest priority (request 2) - # should be preempted. 1 will also be preempted. - budget = create_token_budget() - output = scheduler._schedule_running(budget, curr_loras) - remaining_running = scheduler.running - assert len(remaining_running) == 0 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - assert output.decode_seq_groups[0].seq_group.request_id == "0" - assert len(output.preempted) == 2 - # Verify budgets are updated. - assert budget.num_batched_tokens == 1 - # NOTE: When enable_chunk is False, num_seqs budget is not updated. - # assert budget.num_curr_seqs == 1 - # Both should be preempted, not swapped. - assert output.blocks_to_swap_out == [] - # Nothing is copied. - assert output.blocks_to_copy == [] - - -def test_schedule_decode_blocks_to_copy_update(): - """ - Verify blocks_to_copy is updated. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=4, - num_cpu_blocks=16, - num_gpu_blocks=16) - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) - curr_loras = None - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._add_seq_group_to_running(seq_group) - - # The last request should be swapped out. 
-    scheduler.block_manager.append_slots = MagicMock()
-    scheduler.block_manager.append_slots.return_value = [(2, 3)]
-
-    budget = create_token_budget()
-    output = scheduler._schedule_running(budget, curr_loras)
-    remaining_running = scheduler.running
-    assert len(remaining_running) == 0
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert len(output.preempted) == 0
-    assert len(output.swapped_out) == 0
-    # Nothing is preempted.
-    assert output.blocks_to_swap_out == []
-    # Since append_slot returns the source -> dest mapping, it should
-    # be applied.
-    assert output.blocks_to_copy == [(2, 3)]
-
-
-def test_schedule_swapped_max_loras():
-    block_size = 4
-    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config,
-                                     block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras: set[int] = set()
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size,
-                                           lora_request=LoRARequest(
-                                               lora_name=str(i),
-                                               lora_int_id=i + 1,
-                                               lora_path="abc"))
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 1
-    assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 1
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert len(curr_loras) == 1
-
-
-def test_schedule_swapped_cannot_swap_in():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-
-    # The last request should be swapped out.
-    scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
-    # Since we cannot swap in, none of the requests are swapped in.
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 2
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
-
-
-def test_infeasible_swap():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-
-    # The last request should be swapped out.
-    scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
-    # Since we cannot swap in, none of the requests are swapped in.
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 0
-    assert len(output.infeasible_seq_groups) == 2
-    assert budget.num_batched_tokens == 0
-    assert budget.num_curr_seqs == 0
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
-
-
-def test_schedule_swapped_blocks_to_copy():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       block_size=block_size)
-    scheduler._allocate_and_set_running(seq_group)
-    append_new_token_seq_group(60, seq_group, 1)
-    blocks_to_swap_out: list[tuple[int, int]] = []
-    scheduler._swap_out(seq_group, blocks_to_swap_out)
-    scheduler._add_seq_group_to_swapped(seq_group)
-
-    # Mock append_slots to return a source -> dest copy mapping.
-    scheduler.block_manager.append_slots = MagicMock()
-    scheduler.block_manager.append_slots.return_value = [(2, 3)]
-
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 0
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    assert output.blocks_to_copy == [(2, 3)]
-
-
-def test_scheduling_budget():
-    TOKEN_BUDGET = 4
-    MAX_SEQS = 4
-    budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
-    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1)
-    assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4)
-    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5)
-    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1)
-    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5)
-    assert budget.remaining_token_budget() == TOKEN_BUDGET
-
-    # Verify add/subtract num batched tokens.
-    _, seq_group = create_dummy_prompt("1", 3)
-    budget.add_num_batched_tokens(seq_group.request_id, 2)
-    assert budget.remaining_token_budget() == 2
-    assert budget.num_batched_tokens == 2
-    assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
-    assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
-    # Verify adding another seq group is no-op.
-    budget.add_num_batched_tokens(seq_group.request_id, 2)
-    assert budget.remaining_token_budget() == 2
-    assert budget.num_batched_tokens == 2
-    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
-    assert budget.remaining_token_budget() == 4
-    assert budget.num_batched_tokens == 0
-    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
-    assert budget.remaining_token_budget() == 4
-    assert budget.num_batched_tokens == 0
-
-    # Verify add/subtract max seqs.
-    _, seq_group = create_dummy_prompt("1", 3)
-    budget.add_num_seqs(seq_group.request_id, 2)
-    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
-    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
-    assert budget.num_curr_seqs == 2
-    # Verify adding another seq group is no-op.
-    budget.add_num_seqs(seq_group.request_id, 2)
-    assert budget.num_curr_seqs == 2
-    budget.subtract_num_seqs(seq_group.request_id, 2)
-    assert budget.num_curr_seqs == 0
-    budget.subtract_num_seqs(seq_group.request_id, 2)
-    assert budget.num_curr_seqs == 0
-
-
-@pytest.mark.parametrize("enable_prefix_caching", [True, False])
-def test_prefix_caching_aware_prefills(enable_prefix_caching):
-    """
-    Test the below scenario:
-
-    Three sequences, seqA, seqB, and seqC, share the first block as prefix.
-
-    The test verifies the below scenarios:
-    1. SeqA is first scheduled.
-    2. SeqB and SeqC can be prefilled together in a single schedule round
-    even though there is not enough token budget to prefill both without
-    considering prefix caching.
-    """
-
-    block_size = 4
-    max_num_batched_tokens = 12
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_token_budget=max_num_batched_tokens,
-        max_num_seqs=max_seq_group,
-        max_model_len=max_num_batched_tokens,
-        enable_prefix_caching=enable_prefix_caching,
-    )
-
-    seqA_tokens = list(range(8))
-    num_shared_tokens = 4
-    seqB_tokens = seqA_tokens[:num_shared_tokens] + list(range(
-        12, 16))  # Shared prefix first 4.
-    seqC_tokens = seqA_tokens[:num_shared_tokens] + list(range(
-        16, 20))  # Shared prefix first 4.
-
-    seqA, seqA_group = create_dummy_prompt("0",
-                                           prompt_tokens=seqA_tokens,
-                                           block_size=block_size)
-    seqB, seqB_group = create_dummy_prompt("1",
-                                           prompt_tokens=seqB_tokens,
-                                           block_size=block_size)
-    seqC, seqC_group = create_dummy_prompt("2",
-                                           prompt_tokens=seqC_tokens,
-                                           block_size=block_size)
-
-    # Schedule seqA prefill.
-    scheduler.add_seq_group(seqA_group)
-    metas, out, _ = scheduler.schedule()
-    assert (len(out.scheduled_seq_groups) == 1
-            and out.scheduled_seq_groups[0].seq_group == seqA_group)
-    assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens)
-
-    # Schedule seqA decode.
-    append_new_token_seq_group(len(seqA_tokens), seqA_group, 999)
-    metas, out, _ = scheduler.schedule()
-
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 1
-
-    # Scheduling seqB and seqC prefills should work with prefix caching.
-    scheduler.add_seq_group(seqB_group)
-    scheduler.add_seq_group(seqC_group)
-    metas, out, _ = scheduler.schedule()
-
-    if enable_prefix_caching:
-        assert len(out.scheduled_seq_groups) == 2
-        assert set([
-            out.scheduled_seq_groups[0].seq_group,
-            out.scheduled_seq_groups[1].seq_group,
-        ]) == set([seqB_group, seqC_group])
-        assert len(metas) == 2
-        for meta in metas:
-            assert meta.token_chunk_size == 8
-            assert (len(meta.computed_block_nums) == num_shared_tokens //
-                    block_size)  # 1 block for the 4 shared tokens.
-    else:
-        assert len(out.scheduled_seq_groups) == 1
-        assert len(metas) == 1
-        assert metas[0].token_chunk_size == 8
-        assert len(metas[0].computed_block_nums) == 0  # No blocks computed.
-
-
-def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
-):
-    """
-    This test verifies that we don't schedule new prefills if there's already
-    a continuous prefill in progress even though the new prefills with shared
-    prefix can fit in the token budget:
-    - SeqA is being chunk-prefilled.
-    - SeqB with the same prompt shouldn't be scheduled for prefill even though
-      there's enough token budget to prefill the cached tokens.
-    - Neither should seqC be scheduled.
-    - When seqA is in decoding phase, seqB and seqC can be scheduled.
-    - Entire seqB should be prefilled since it's a full prefix cache hit.
-    - SeqC would be partially prefilled with the prefix shared, and the
-      remaining unique tokens would be prefilled (rounded down to be
-      block-size aligned).
-    """
-
-    block_size = 2
-    max_num_batched_tokens = 4
-    max_seq_group = 3
-    scheduler = initialize_scheduler(
-        block_size=block_size,
-        num_cpu_blocks=16,
-        num_gpu_blocks=16,
-        max_token_budget=max_num_batched_tokens,
-        max_num_seqs=max_seq_group,
-        max_model_len=100,
-        enable_prefix_caching=True,
-        enable_chunked_prefill=True,
-    )
-
-    seqA_tokens = list(range(8))
-    seqB_tokens = seqA_tokens
-    seqC_shared_prefix_len = 4
-    seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20))
-
-    seqA, seqA_group = create_dummy_prompt("0",
-                                           prompt_tokens=seqA_tokens,
-                                           block_size=block_size)
-    seqB, seqB_group = create_dummy_prompt("1",
-                                           prompt_tokens=seqB_tokens,
-                                           block_size=block_size)
-
-    # Chunked prefill seqA.
-    scheduler.add_seq_group(seqA_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 4
-
-    # seqB should not be scheduled with ongoing prefills.
-    scheduler.add_seq_group(seqB_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.scheduled_seq_groups[0].seq_group == seqA_group
-    assert out.scheduled_seq_groups[0].token_chunk_size == 4
-
-    # Both seqB and seqC can now be scheduled since seqA's prefill is over;
-    # seqA is in the decoding phase.
-    append_new_token_seq(seqA, 999)
-    seqC, seqC_group = create_dummy_prompt("2",
-                                           prompt_tokens=seqC_tokens,
-                                           block_size=block_size)
-    scheduler.add_seq_group(seqC_group)
-    metas, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 3
-
-    metas = {meta.request_id: meta for meta in metas}
-    assert metas[seqA_group.request_id].token_chunk_size == 1  # Decode
-    assert (metas[seqB_group.request_id].token_chunk_size == 8
-            )  # Fully cached prefill
-    assert metas[seqC_group.request_id].token_chunk_size == 6, (
-        "A partial prefix of C (4 tokens) should be prefilled, with the "
-        "remaining tokens fitting into the 3-token budget (4 - 1 used by "
-        "seqA's decode). It is then rounded down to 2 tokens on block size, "
-        "thus 6 tokens in total.")
-
-
-def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
-    """
-    Test that the scheduler does not schedule batches with prompt tokens and
-    prompt embeddings co-mingled.
- """ - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=100, - enable_prefix_caching=True, - ) - - # the odd indexed inputs should be passed in via embeddings, - # evens via token_ids - seq_length = 7 - embedding_size = 5 - num_seqs = 11 - seq_tokens: list[list[int]] = [] - seq_embeds: list[Optional[torch.Tensor]] = [] - for i in range(num_seqs): - if i % 2: - seq_tokens.append(list(range(seq_length))) - seq_embeds.append(None) - else: - seq_tokens.append([0] * seq_length) - seq_embeds.append(torch.rand(embedding_size)) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) - for i in range(len(seq_tokens)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - while not all(seq.is_finished() for seq, _ in seq_and_seq_groups): - unfinished_seq_groups = [ - seq_group for _, seq_group in seq_and_seq_groups - if not seq_group.is_finished() - ] - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) > 0 - batch_is_prompt_embeds = out.scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() - expected_scheduled_seq_groups = [ - seq_group for seq_group in unfinished_seq_groups - if seq_group.uses_prompt_embeds() == batch_is_prompt_embeds - ] - - # We should have as many scheduled groups as possible, without mixing - assert len(out.scheduled_seq_groups) == min( - max_seq_group, len(expected_scheduled_seq_groups)) - assert all(scheduled_seq_group.seq_group.uses_prompt_embeds() == - batch_is_prompt_embeds - for scheduled_seq_group in out.scheduled_seq_groups) - - # Finish the scheduled groups - for scheduled_seq_group in out.scheduled_seq_groups: - for seq in scheduled_seq_group.seq_group.seqs: - seq.status = SequenceStatus.FINISHED_STOPPED - scheduler.free_finished_seq_groups() - - -def test_remove_seq_from_computed_blocks_tracker(): - """ - Test that computed_blocks_tracker correctly removes stale sequences - during scheduling. - - The test covers 9 scheduling branches where stale seqs are removed: - - 1 in _schedule_swapped - - 1 in _schedule_priority_preemption - - 7 in _schedule_prefill - - Each branch is tested to ensure proper cleanup of - _seq_id_to_num_tokens_computed. - """ - # Budget can not schedule in swapped - block_size = 2 - max_seq_group = 3 - seq_tokens_with_swapped: list[list[int]] = [] - blocks_to_swap_out: list[tuple[int, int]] = [] - curr_loras: set[int] = set() - - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - enable_prefix_caching=True, - ) - budget = create_token_budget(token_budget=15) - - seq_length = 16 - num_seqs = 3 - for i in range(num_seqs): - seq_tokens_with_swapped.append([i] * seq_length) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_with_swapped[i], - block_size=block_size) - for i in range(len(seq_tokens_with_swapped)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler._allocate_and_set_running(seq_group) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - scheduler._schedule_swapped(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill schedule don't have a space for another LoRA, so - # we ignore this request for now. - block_size = 4 - lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64, - enable_prefix_caching=True) - budget = create_token_budget(token_budget=120) - num_seqs = 2 - for i in range(num_seqs): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=seq_length, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) - scheduler.add_seq_group(seq_group) - - scheduler._schedule_prefills(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Priority preemption schedule - scheduler._schedule_priority_preemption(budget) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill scheduler does not schedule batches with prompt tokens and - # prompt embeddings co-mingled. - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=100, - enable_prefix_caching=True, - ) - seq_length = 7 - embedding_size = 5 - seq_tokens_with_embedding: list[list[int]] = [] - seq_embeds: list[Optional[torch.Tensor]] = [] - - seq_tokens_with_embedding.append(list(range(seq_length))) - seq_embeds.append(None) - seq_tokens_with_embedding.append([0] * seq_length) - seq_embeds.append(torch.rand(embedding_size)) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_with_embedding[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) - for i in range(len(seq_tokens_with_embedding)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill scheduler budget num_batched_tokens - # >= scheduler_config max_num_batched_tokens - block_size = 2 - max_seq_group = 3 - seq_tokens_prefill_budget: list[list[int]] = [] - - scheduler = initialize_scheduler( - block_size=block_size, - max_token_budget=8, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=5, - enable_prefix_caching=True, - ) - seq_length = 4 - num_seqs = 3 - for i in range(num_seqs): - seq_tokens_prefill_budget.append([i] * seq_length) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget[i], - block_size=block_size) - for i in range(len(seq_tokens_prefill_budget)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(2)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not schedule in waiting - block_size = 2 - max_seq_group = 3 - - scheduler = initialize_scheduler( - block_size=block_size, - max_token_budget=30, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=30, - enable_prefix_caching=True, - ) - seq_length = 16 - num_seqs = 3 - seq_tokens_prefill_budget_waiting: list[list[int]] = [] - - for i in range(num_seqs): - seq_tokens_prefill_budget_waiting.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget_waiting[i], - block_size=block_size) - for i in range(len(seq_tokens_prefill_budget_waiting)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=30, - enable_prefix_caching=True, - ) - - seq_length = 31 - seq_tokens_prompt_limit: list[list[int]] = [] - seq_tokens_prompt_limit.append(list(range(seq_length))) - seq_and_seq_groups = [ - create_dummy_prompt("0", - prompt_tokens=seq_tokens_prompt_limit[0], - block_size=block_size) - ] - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(0)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=160, - num_gpu_blocks=160, - max_num_seqs=max_seq_group, - max_model_len=320, - enable_prefix_caching=True, - ) - - seq_length = 320 - num_seqs = 1 - seq_tokens_never: list[list[int]] = [] - for i in range(num_seqs): - seq_tokens_never.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_never[i], - block_size=block_size) - for i in range(len(seq_tokens_never)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(0)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not allocate, AllocStatus is LATER - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=160, - num_gpu_blocks=160, - max_num_seqs=max_seq_group, - max_model_len=320, - enable_prefix_caching=True, - ) - - seq_length = 160 - num_seqs = 2 - seq_tokens_later: list[list[int]] = [] - for i in range(num_seqs): - seq_tokens_later.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_later[i], - block_size=block_size) - for i in range(len(seq_tokens_later)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py deleted file mode 100644 index ee9ac2129f2db..0000000000000 --- a/tests/core/test_serialization.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import msgspec - -from vllm.executor.msgspec_utils import decode_hook, encode_hook -from vllm.sequence import ExecuteModelRequest - -from .utils import create_batch - - -def test_msgspec_serialization(): - num_lookahead_slots = 4 - seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=num_lookahead_slots, - running_queue_size=4) - - encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, - dec_hook=decode_hook) - req = decoder.decode(encoder.encode(execute_model_req)) - expected = execute_model_req.seq_group_metadata_list - actual = req.seq_group_metadata_list - assert (len(expected) == len(actual)) - expected = expected[0] - actual = actual[0] - - assert expected.block_tables == actual.block_tables - assert expected.is_prompt == actual.is_prompt - assert expected.request_id == actual.request_id - assert (expected.seq_data[0].prompt_token_ids == - actual.seq_data[0].prompt_token_ids) - assert (expected.seq_data[0].output_token_ids == - actual.seq_data[0].output_token_ids) diff --git a/tests/core/utils.py b/tests/core/utils.py deleted file mode 100644 index 033fffd2c4e24..0000000000000 --- a/tests/core/utils.py +++ /dev/null @@ -1,392 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from collections import defaultdict -from collections.abc import Sequence as GenericSequence -from itertools import count -from typing import Any, Optional, Union - -import torch - -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams -from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata) - - -def create_dummy_prompt( - request_id: str, - prompt_length: int = -1, - block_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - prompt_tokens: Optional[list[int]] = None, - prompt_embeds: Optional[torch.Tensor] = None, - min_tokens: int = 
0, - max_tokens: int = 16, -) -> tuple[Sequence, SequenceGroup]: - if not block_size: - block_size = prompt_length - - if prompt_tokens is None: - # Create dummy prompt sequence with tokens 0...block_size-1 - # and prompt "0 ... block_size". - prompt_tokens = list(range(prompt_length)) - - prompt_str = " ".join([str(t) for t in prompt_tokens]) - inputs = token_inputs( - prompt_token_ids=prompt_tokens, - prompt=prompt_str) if prompt_embeds is None else embeds_inputs( - prompt_embeds=prompt_embeds) - prompt = Sequence( - int(request_id), - inputs=inputs, - block_size=block_size, - ) - seq_group = SequenceGroup( - request_id=request_id, - seqs=[prompt], - arrival_time=time.time(), - sampling_params=SamplingParams(max_tokens=max_tokens, - min_tokens=min_tokens), - lora_request=lora_request, - ) - - return prompt, seq_group - - -def create_dummy_lora_sequence(request_id: int, token_ids: list[int], - block_size: int, lora_int_id: int) -> Sequence: - return Sequence(seq_id=request_id, - inputs=token_inputs(token_ids), - block_size=block_size, - lora_request=LoRARequest(lora_name="dummy", - lora_path="/dummy", - lora_int_id=lora_int_id)) - - -def create_dummy_sequence(request_id: int, token_ids: list[int], - block_size: int) -> Sequence: - return Sequence( - seq_id=request_id, - inputs=token_inputs(token_ids), - block_size=block_size, - ) - - -def create_dummy_prompt_encoder_decoder( - request_id: str, - decoder_prompt_length: int, - encoder_prompt_length: int, - block_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, -) -> tuple[Sequence, Sequence, SequenceGroup]: - if not block_size: - block_size = decoder_prompt_length - - # Create dummy prompt sequence with tokens 0...block_size-1 - # and prompt "0 ... block_size". Note that the prompt string - # doesn't actually match the tokens - decoder_prompt_tokens = list(range(decoder_prompt_length)) - decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) - encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - - inputs: EncoderDecoderInputs = { - "decoder": token_inputs(decoder_prompt_tokens, - prompt=decoder_prompt_str), - "encoder": token_inputs(encoder_prompt_tokens, - prompt=encoder_prompt_str), - } - - decoder_prompt = Sequence(int(request_id), - inputs=inputs["decoder"], - block_size=block_size) - - encoder_prompt = Sequence(int(request_id), - inputs=inputs["encoder"], - block_size=block_size) - - seq_group = SequenceGroup(request_id=request_id, - seqs=[decoder_prompt], - arrival_time=time.time(), - lora_request=lora_request, - encoder_seq=encoder_prompt) - - return decoder_prompt, encoder_prompt, seq_group - - -def create_seq_group( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - - assert len(seq_output_lens) > 0 - - if sampling_params is None: - sampling_params = SamplingParams() - - prompt_token_ids = [0] * seq_prompt_len - - seqs: list[Sequence] = [] - for seq_id_offset, output_len in enumerate(seq_output_lens): - seq = Sequence( - seq_id=seq_id_start + seq_id_offset, - inputs=token_inputs(prompt_token_ids), - block_size=16, - ) - - for i in range(output_len): - seq.append_token_id( - token_id=i, - logprobs={i: Logprob(0.0)}, - ) - seqs.append(seq) - - seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - 
arrival_time=time.time(), - ) - - return seq_group - - -def create_seq_group_encoder_decoder( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - - assert len(seq_output_lens) > 0 - - if sampling_params is None: - sampling_params = SamplingParams() - - prompt_token_ids = [0] * seq_prompt_len - - inputs: EncoderDecoderInputs = { - "decoder": token_inputs(prompt_token_ids), - "encoder": token_inputs(prompt_token_ids), - } - - seqs = [] - for seq_id_offset, output_len in enumerate(seq_output_lens): - # Construct decoder input sequences - seq = Sequence( - seq_id=seq_id_start + seq_id_offset, - inputs=inputs["decoder"], - block_size=16, - ) - - for i in range(output_len): - seq.append_token_id( - token_id=i, - logprobs={i: Logprob(0.0)}, - ) - seqs.append(seq) - - # Encoder input sequence - encoder_seq = Sequence( - seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs["encoder"], - block_size=16, - ) - - return SequenceGroup(request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq) - - -def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size - - -# Helper functions for scheduler tests - - -def get_sequence_groups(scheduler_output): - return [s.seq_group for s in scheduler_output.scheduled_seq_groups] - - -def append_new_token(out, token_id: int): - seq_groups = get_sequence_groups(out) - for seq_group in seq_groups: - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def schedule_and_update_computed_tokens(scheduler): - metas, out, _ = scheduler.schedule() - for s in out.scheduled_seq_groups: - s.seq_group.update_num_computed_tokens(s.token_chunk_size) - return metas, out - - -def append_new_token_seq(seq: Sequence, token_id: int): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int): - seq_group.update_num_computed_tokens(token_chunk_size) - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -class SchedulerProxy: - """ - A proxy class to forward calls to the scheduler. 
- """ - - def __init__(self, scheduler: Scheduler): - self.scheduler_ = scheduler - self.call_history: dict[str, list[Any]] = defaultdict(list) - - def __getattr__(self, name: str) -> Any: - - def wrapper(*args, **kwargs): - result = getattr(self.scheduler_, name)(*args, **kwargs) - self.call_history[name].append((args, kwargs, result)) - return result - - return wrapper - - def last_schedule_ret( - self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: - _, _, ret = self.call_history["schedule"][-1] - return ret - - -def create_seq_group_metadata_from_prompts( - prompts: list[list[int]], - num_gpu_blocks: int, - block_size: int, - final_prompt_lens: list[int], - continuations: Optional[list[list[int]]] = None, - seq_ids: Optional[list[int]] = None, -) -> list[SequenceGroupMetadata]: - - if continuations is None: - continuations = [[] for _ in prompts] - - if seq_ids is None: - seq_ids = list(i for i, _ in enumerate(prompts)) - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = { - i: [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(final_len, block_size)) - ] - for i, final_len in enumerate(final_prompt_lens) - } - - seq_grou_metadata_list = [] - for i, (prompt_token_ids, - cont_token_ids) in enumerate(zip(prompts, continuations)): - data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) - data.update_num_computed_tokens( - len(prompt_token_ids) + len(cont_token_ids) - 1) - seq_data = {i: data} - seq_grou_metadata_list.append( - SequenceGroupMetadata( - request_id=str(i), - is_prompt=len(cont_token_ids) == 0, - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations[i][:]}, - )) - return seq_grou_metadata_list - - -def create_chunked_seq_group_metadata_from_prompt( - prompt: list[int], - num_gpu_blocks: int, - chunk_size: int, - block_size: int, - seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: - - if seq_id is None: - seq_id = 0 - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(len(prompt), block_size)) - ] - - seq_group_metadata_list = [] - for i, idx in enumerate(range(0, len(prompt), chunk_size)): - chunk_ids = prompt[idx:idx + chunk_size] - data = SequenceData.from_seqs(prompt) - data.update_num_computed_tokens(idx) - seq_data = {i: data} - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - do_sample=idx + chunk_size >= len(prompt), # terminal chunk - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations}, - token_chunk_size=len(chunk_ids))) - return seq_group_metadata_list - - -def create_batch(batch_size, - k, - prompt_len: Union[int, list[int]] = 10, - prev_output_token_len: int = 10, - seq_ids: Optional[list[int]] = None, - num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None, - prefill_chunk_size: Optional[int] = None): - if block_size is None: - block_size = 8 - - if num_gpu_blocks is None: - num_gpu_blocks = 2048 // block_size - - iterator = count() - - if isinstance(prompt_len, int): - prompt_lens = [prompt_len for _ in range(batch_size)] - else: - prompt_lens = prompt_len - - prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - - if prefill_chunk_size: - # Create a batch of chunked prompts. 
- if not seq_ids: - seq_ids = list(range(len(prompts))) - seq_group_metadata_list = [] - for p, sid in zip(prompts, seq_ids): - seq_group_metadata_list += \ - create_chunked_seq_group_metadata_from_prompt( - p, num_gpu_blocks, prefill_chunk_size, block_size, sid) - seq_group_metadata_list = seq_group_metadata_list[:batch_size] - prev_output_tokens = [] - else: - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) - return seq_group_metadata_list, prompts, prev_output_tokens From 4aa8c7b0477de2cd0f6f1a46437f46e4cb00bae3 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Wed, 17 Sep 2025 09:46:29 -0700 Subject: [PATCH 369/584] cleanup: remove adapter commons (#25045) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- pyproject.toml | 1 - vllm/adapter_commons/__init__.py | 0 vllm/adapter_commons/layers.py | 16 ---- vllm/adapter_commons/models.py | 106 ------------------------- vllm/adapter_commons/request.py | 26 ------ vllm/adapter_commons/utils.py | 93 ---------------------- vllm/adapter_commons/worker_manager.py | 39 --------- vllm/lora/layers/utils.py | 11 ++- vllm/lora/models.py | 77 ++++++++++++------ vllm/lora/request.py | 6 +- vllm/lora/worker_manager.py | 44 ++++++---- 11 files changed, 89 insertions(+), 330 deletions(-) delete mode 100644 vllm/adapter_commons/__init__.py delete mode 100644 vllm/adapter_commons/layers.py delete mode 100644 vllm/adapter_commons/models.py delete mode 100644 vllm/adapter_commons/request.py delete mode 100644 vllm/adapter_commons/utils.py delete mode 100644 vllm/adapter_commons/worker_manager.py diff --git a/pyproject.toml b/pyproject.toml index f5a44f56f416e..fe55461db00be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,6 @@ follow_imports = "silent" # move the directory here and remove it from tools/mypy.sh files = [ "vllm/*.py", - "vllm/adapter_commons", "vllm/assets", "vllm/entrypoints", "vllm/core", diff --git a/vllm/adapter_commons/__init__.py b/vllm/adapter_commons/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py deleted file mode 100644 index 9753a08806565..0000000000000 --- a/vllm/adapter_commons/layers.py +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass - - -@dataclass -class AdapterMapping: - # Per every token in input_ids: - index_mapping: tuple[int, ...] - # Per sampled token: - prompt_mapping: tuple[int, ...] 
- - def __post_init__(self): - self.index_mapping = tuple(self.index_mapping) - self.prompt_mapping = tuple(self.prompt_mapping) \ No newline at end of file diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py deleted file mode 100644 index 7b685880a9e6c..0000000000000 --- a/vllm/adapter_commons/models.py +++ /dev/null @@ -1,106 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Any, Callable, Optional, TypeVar - -from torch import nn - -from vllm.logger import init_logger -from vllm.utils import LRUCache - -logger = init_logger(__name__) - - -class AdapterModel(ABC): - - def __init__(self, model_id=None): - self.id = model_id - - @abstractmethod - def from_local_checkpoint(cls, model_dir, model_id=None, **kwargs): - # Common initialization code - # Load weights or embeddings from local checkpoint - raise NotImplementedError("Subclasses must implement this method.") - - -T = TypeVar('T') - - -class AdapterLRUCache(LRUCache[int, T]): - - def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]): - super().__init__(capacity) - self.deactivate_fn = deactivate_fn - - def _on_remove(self, key: int, value: Optional[T]): - logger.debug("Removing adapter int id: %d", key) - self.deactivate_fn(key) - return super()._on_remove(key, value) - - -class AdapterModelManager(ABC): - - def __init__( - self, - model: nn.Module, - ): - """Create a AdapterModelManager and adapter for a given model. - Args: - model: the model to be adapted. - """ - self.model: nn.Module = model - self._registered_adapters: dict[int, Any] = {} - # Dict instead of a Set for compatibility with LRUCache. - self._active_adapters: dict[int, None] = {} - self.adapter_type = 'Adapter' - self._last_mapping = None - - def __len__(self) -> int: - return len(self._registered_adapters) - - @property - @abstractmethod - def adapter_slots(self) -> int: - raise NotImplementedError - - @property - @abstractmethod - def capacity(self) -> int: - raise NotImplementedError - - @abstractmethod - def activate_adapter(self, adapter_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def deactivate_adapter(self, adapter_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def add_adapter(self, adapter: Any) -> bool: - raise NotImplementedError - - @abstractmethod - def set_adapter_mapping(self, mapping: Any) -> None: - raise NotImplementedError - - @abstractmethod - def remove_adapter(self, adapter_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def remove_all_adapters(self) -> None: - raise NotImplementedError - - @abstractmethod - def get_adapter(self, adapter_id: int) -> Optional[Any]: - raise NotImplementedError - - @abstractmethod - def list_adapters(self) -> dict[int, Any]: - raise NotImplementedError - - @abstractmethod - def pin_adapter(self, adapter_id: int) -> bool: - raise NotImplementedError diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py deleted file mode 100644 index 8135b54ba19f6..0000000000000 --- a/vllm/adapter_commons/request.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod - - -class AdapterRequest(ABC): - """ - Base class for adapter requests. 
- """ - - @property - @abstractmethod - def adapter_id(self) -> int: - raise NotImplementedError - - def __post_init__(self) -> None: - if self.adapter_id < 1: - raise ValueError(f"id must be > 0, got {self.adapter_id}") - - def __eq__(self, value: object) -> bool: - return isinstance( - value, self.__class__) and self.adapter_id == value.adapter_id - - def __hash__(self) -> int: - return hash(self.adapter_id) diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py deleted file mode 100644 index a1a56b6bbd4ba..0000000000000 --- a/vllm/adapter_commons/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Callable, Optional - - -## model functions -def deactivate_adapter(adapter_id: int, active_adapters: dict[int, None], - deactivate_func: Callable) -> bool: - if adapter_id in active_adapters: - deactivate_func(adapter_id) - active_adapters.pop(adapter_id) - return True - return False - - -def add_adapter(adapter: Any, registered_adapters: dict[int, Any], - capacity: int, add_func: Callable) -> bool: - if adapter.id not in registered_adapters: - if len(registered_adapters) >= capacity: - raise RuntimeError('No free adapter slots.') - add_func(adapter) - registered_adapters[adapter.id] = adapter - return True - return False - - -def set_adapter_mapping(mapping: Any, last_mapping: Any, - set_mapping_func: Callable) -> Any: - if last_mapping != mapping: - set_mapping_func(mapping) - return mapping - return last_mapping - - -def remove_adapter(adapter_id: int, registered_adapters: dict[int, Any], - deactivate_func: Callable) -> bool: - deactivate_func(adapter_id) - return bool(registered_adapters.pop(adapter_id, None)) - - -def list_adapters(registered_adapters: dict[int, Any]) -> dict[int, Any]: - return dict(registered_adapters) - - -def get_adapter(adapter_id: int, - registered_adapters: dict[int, Any]) -> Optional[Any]: - return registered_adapters.get(adapter_id) - - -## worker functions -def set_active_adapters_worker(requests: set[Any], mapping: Optional[Any], - apply_adapters_func, - set_adapter_mapping_func) -> None: - apply_adapters_func(requests) - set_adapter_mapping_func(mapping) - - -def add_adapter_worker(adapter_request: Any, list_adapters_func, - load_adapter_func, add_adapter_func, - activate_adapter_func) -> bool: - if adapter_request.adapter_id in list_adapters_func(): - return False - loaded_adapter = load_adapter_func(adapter_request) - loaded = add_adapter_func(loaded_adapter) - activate_adapter_func(loaded_adapter.id) - return loaded - - -def apply_adapters_worker(adapter_requests: set[Any], list_adapters_func, - adapter_slots: int, remove_adapter_func, - add_adapter_func) -> None: - models_that_exist = list_adapters_func() - models_map = { - adapter_request.adapter_id: adapter_request - for adapter_request in adapter_requests if adapter_request - } - if len(models_map) > adapter_slots: - raise RuntimeError( - f"Number of requested models ({len(models_map)}) is greater " - f"than the number of GPU model slots " - f"({adapter_slots}).") - new_models = set(models_map) - models_to_add = new_models - models_that_exist - models_to_remove = models_that_exist - new_models - for adapter_id in models_to_remove: - remove_adapter_func(adapter_id) - for adapter_id in models_to_add: - add_adapter_func(models_map[adapter_id]) - - -def list_adapters_worker(adapter_manager_list_adapters_func) -> set[int]: - return 
set(adapter_manager_list_adapters_func()) diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py deleted file mode 100644 index 07e85d138ac50..0000000000000 --- a/vllm/adapter_commons/worker_manager.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Any, Optional - -import torch - - -class AbstractWorkerManager(ABC): - - def __init__(self, device: torch.device): - self.device = device - - @property - @abstractmethod - def is_enabled(self) -> bool: - raise NotImplementedError - - @abstractmethod - def set_active_adapters(self, requests: set[Any], - mapping: Optional[Any]) -> None: - raise NotImplementedError - - @abstractmethod - def add_adapter(self, adapter_request: Any) -> bool: - raise NotImplementedError - - @abstractmethod - def remove_adapter(self, adapter_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def remove_all_adapters(self) -> None: - raise NotImplementedError - - @abstractmethod - def list_adapters(self) -> set[int]: - raise NotImplementedError diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py index 27dcd720fbdea..772d32a44c22d 100644 --- a/vllm/lora/layers/utils.py +++ b/vllm/lora/layers/utils.py @@ -1,17 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + from dataclasses import dataclass import torch import torch.nn as nn -from vllm.adapter_commons.layers import AdapterMapping - @dataclass -class LoRAMapping(AdapterMapping): +class LoRAMapping: + index_mapping: tuple[int, ...] + prompt_mapping: tuple[int, ...] is_prefill: bool = False + def __post_init__(self): + self.index_mapping = tuple(self.index_mapping) + self.prompt_mapping = tuple(self.prompt_mapping) + def _get_lora_device(base_layer: nn.Module) -> torch.device: # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7712438054914..25f90f2fa932b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,18 +4,13 @@ import math import os from collections.abc import Sequence -from typing import Any, Callable, Optional, Union +from typing import Callable, Optional, TypeVar, Union import regex as re import safetensors.torch import torch from torch import nn -from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel, - AdapterModelManager) -from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, - get_adapter, list_adapters, - remove_adapter, set_adapter_mapping) from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping @@ -33,10 +28,25 @@ from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper from vllm.model_executor.utils import get_packed_modules_mapping -from vllm.utils import is_pin_memory_available +from vllm.utils import LRUCache, is_pin_memory_available logger = init_logger(__name__) +T = TypeVar("T") + + +class AdapterLRUCache(LRUCache[int, T]): + + def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]): + super().__init__(capacity) + self.deactivate_fn = deactivate_fn + + def _on_remove(self, key: int, value: 
Optional[T]): + logger.debug("Removing adapter int id: %d", key) + self.deactivate_fn(key) + return super()._on_remove(key, value) + + _GLOBAL_LORA_ID = 0 @@ -57,7 +67,7 @@ def is_moe_model(model: nn.Module) -> bool: return False -class LoRAModel(AdapterModel): +class LoRAModel: """A LoRA fine-tuned model.""" def __init__( @@ -313,7 +323,7 @@ class LoRAModel(AdapterModel): weights_mapper=weights_mapper) -class LoRAModelManager(AdapterModelManager): +class LoRAModelManager: """A manager that manages multiple LoRA-fine-tuned models.""" def __init__( @@ -336,6 +346,11 @@ class LoRAModelManager(AdapterModelManager): vocab_size: the vocab size of the model. lora_config: the LoRA configuration. """ + self.model: SupportsLoRA = model + self._registered_adapters: dict[int, LoRAModel] = {} + # Dict instead of a set for compatibility with LRUCache. + self._active_adapters: dict[int, None] = {} + self.adapter_type = "LoRA" self.lora_config = lora_config self.device = device self.max_num_seqs = max_num_seqs @@ -347,9 +362,8 @@ class LoRAModelManager(AdapterModelManager): max_num_batched_tokens, max_batches=self.max_num_seqs, device=self.device, - max_loras=self.lora_config.max_loras) - - super().__init__(model) + max_loras=self.lora_config.max_loras, + ) self.supported_lora_modules = get_supported_lora_modules(self.model) assert self.supported_lora_modules, "No supported LoRA modules found in" @@ -370,7 +384,9 @@ class LoRAModelManager(AdapterModelManager): self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() self.model.lora_manager = self - self.adapter_type = 'LoRA' + + def __len__(self) -> int: + return len(self._registered_adapters) @property def capacity(self) -> int: @@ -669,28 +685,39 @@ class LoRAModelManager(AdapterModelManager): return lora_model.get_lora(org_module_name) def deactivate_adapter(self, adapter_id: int) -> bool: - return deactivate_adapter(adapter_id, self._active_adapters, - self._deactivate_adapter) + if adapter_id not in self._active_adapters: + return False + self._deactivate_adapter(adapter_id) + self._active_adapters.pop(adapter_id, None) + return True def add_adapter(self, adapter: LoRAModel) -> bool: logger.debug("Adding lora. 
Model id: %d, " "int id: %d", adapter.id, adapter.id) - return add_adapter(adapter, self._registered_adapters, self.capacity, - self._add_adapter) + if adapter.id in self._registered_adapters: + return False + if len(self._registered_adapters) >= self.capacity: + raise RuntimeError("No free adapter slots.") + self._add_adapter(adapter) + return True def set_adapter_mapping(self, mapping: LoRAMapping) -> None: - self._last_mapping = set_adapter_mapping(mapping, self._last_mapping, - self._set_adapter_mapping) + if self._last_mapping != mapping: + self._set_adapter_mapping(mapping) + self._last_mapping = mapping def remove_adapter(self, adapter_id: int) -> bool: - return remove_adapter(adapter_id, self._registered_adapters, - self.deactivate_adapter) + self.deactivate_adapter(adapter_id) + if adapter_id not in self._registered_adapters: + return False + self._registered_adapters.pop(adapter_id, None) + return True - def list_adapters(self) -> dict[int, Any]: - return list_adapters(self._registered_adapters) + def list_adapters(self) -> dict[int, LoRAModel]: + return dict(self._registered_adapters) - def get_adapter(self, adapter_id: int) -> Optional[Any]: - return get_adapter(adapter_id, self._registered_adapters) + def get_adapter(self, adapter_id: int) -> Optional[LoRAModel]: + return self._registered_adapters.get(adapter_id) class LoRALRUCache(AdapterLRUCache[LoRAModel]): diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 5bbba7830c1b1..523525d46f0b3 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -6,8 +6,6 @@ from typing import Optional import msgspec -from vllm.adapter_commons.request import AdapterRequest - class LoRARequest( msgspec.Struct, @@ -24,8 +22,6 @@ class LoRARequest( lora_int_id must be globally unique for a given adapter. This is currently not enforced in vLLM. """ - __metaclass__ = AdapterRequest - lora_name: str lora_int_id: int lora_path: str = "" @@ -35,6 +31,8 @@ class LoRARequest( tensorizer_config_dict: Optional[dict] = None def __post_init__(self): + if self.lora_int_id < 1: + raise ValueError(f"id must be > 0, got {self.lora_int_id}") if self.lora_local_path: warnings.warn( "The 'lora_local_path' attribute is deprecated " diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 3a807b1e161d2..e27b7d5fcf223 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -6,11 +6,6 @@ from typing import Any, Literal, Optional, Union import torch -from vllm.adapter_commons.utils import (add_adapter_worker, - apply_adapters_worker, - list_adapters_worker, - set_active_adapters_worker) -from vllm.adapter_commons.worker_manager import AbstractWorkerManager from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, @@ -22,7 +17,7 @@ from vllm.lora.utils import get_adapter_absolute_path logger = init_logger(__name__) -class WorkerLoRAManager(AbstractWorkerManager): +class WorkerLoRAManager: """WorkerLoRAManager that manages LoRA models on the worker side. Every request, the requested LoRAs will be loaded (unless they are already @@ -51,7 +46,7 @@ class WorkerLoRAManager(AbstractWorkerManager): self.vocab_size = vocab_size self.lora_config = lora_config self.max_position_embeddings = max_position_embeddings - super().__init__(device) + self.device = device # Lazily initialized by create_lora_manager. 
self._adapter_manager: LoRAModelManager @@ -164,19 +159,34 @@ class WorkerLoRAManager(AbstractWorkerManager): def set_active_adapters(self, requests: set[Any], mapping: Optional[Any]) -> None: - set_active_adapters_worker(requests, mapping, self._apply_adapters, - self._adapter_manager.set_adapter_mapping) + self._apply_adapters(requests) + if mapping is not None: + self._adapter_manager.set_adapter_mapping(mapping) def _apply_adapters(self, adapter_requests: set[Any]) -> None: - apply_adapters_worker(adapter_requests, self.list_adapters, - self._adapter_manager.adapter_slots, - self.remove_adapter, self.add_adapter) + existing_adapters = self.list_adapters() + models_map = { + adapter_request.adapter_id: adapter_request + for adapter_request in adapter_requests if adapter_request + } + if len(models_map) > self._adapter_manager.adapter_slots: + raise RuntimeError( + f"Number of requested models ({len(models_map)}) is greater " + "than the number of GPU model slots " + f"({self._adapter_manager.adapter_slots}).") + requested_ids = set(models_map) + for adapter_id in existing_adapters - requested_ids: + self.remove_adapter(adapter_id) + for adapter_id in requested_ids - existing_adapters: + self.add_adapter(models_map[adapter_id]) def add_adapter(self, adapter_request: Any) -> bool: - return add_adapter_worker(adapter_request, self.list_adapters, - self._load_adapter, - self._adapter_manager.add_adapter, - self._adapter_manager.activate_adapter) + if adapter_request.adapter_id in self.list_adapters(): + return False + loaded_adapter = self._load_adapter(adapter_request) + loaded = self._adapter_manager.add_adapter(loaded_adapter) + self._adapter_manager.activate_adapter(loaded_adapter.id) + return loaded def remove_adapter(self, adapter_id: int) -> bool: return self._adapter_manager.remove_adapter(adapter_id) @@ -185,7 +195,7 @@ class WorkerLoRAManager(AbstractWorkerManager): self._adapter_manager.remove_all_adapters() def list_adapters(self) -> set[int]: - return list_adapters_worker(self._adapter_manager.list_adapters) + return set(self._adapter_manager.list_adapters()) class LRUCacheWorkerLoRAManager(WorkerLoRAManager): From d6a518fdde9780f5c9aabe8cf1f2fafd29af3cbc Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Wed, 17 Sep 2025 09:47:40 -0700 Subject: [PATCH 370/584] Remove unused find_cuda_init helper script (#25044) --- find_cuda_init.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 find_cuda_init.py diff --git a/find_cuda_init.py b/find_cuda_init.py deleted file mode 100644 index 308fc6fc2d61c..0000000000000 --- a/find_cuda_init.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import importlib -import traceback -from typing import Callable -from unittest.mock import patch - - -def find_cuda_init(fn: Callable[[], object]) -> None: - """ - Helper function to debug CUDA re-initialization errors. - - If `fn` initializes CUDA, prints the stack trace of how this happens. 
- """ - from torch.cuda import _lazy_init - - stack = None - - def wrapper(): - nonlocal stack - stack = traceback.extract_stack() - return _lazy_init() - - with patch("torch.cuda._lazy_init", wrapper): - fn() - - if stack is not None: - print("==== CUDA Initialized ====") - print("".join(traceback.format_list(stack)).strip()) - print("==========================") - - -if __name__ == "__main__": - find_cuda_init( - lambda: importlib.import_module("vllm.model_executor.models.llava")) From 99cc41ad50c08e745571abe568226f9fcae61ccd Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 09:50:07 -0700 Subject: [PATCH 371/584] [V0 Deprecation] Remove unused output processor util (#25023) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- vllm/engine/output_processor/util.py | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 vllm/engine/output_processor/util.py diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py deleted file mode 100644 index 1e127eb982425..0000000000000 --- a/vllm/engine/output_processor/util.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List -from typing import Sequence as GenericSequence -from typing import cast - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import CompletionSequenceGroupOutput, SequenceGroupOutput - - -def create_output_by_sequence_group( - outputs: GenericSequence[SamplerOutput], - num_seq_groups: int) -> List[List[SequenceGroupOutput]]: - """Helper method which transforms a 2d list organized by - [step][sequence group] into [sequence group][step]. - """ - output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [ - [] for _ in range(num_seq_groups) - ] - for step in outputs: - sequence_group_output: CompletionSequenceGroupOutput - for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) - - # Cast to the more generic type that CompletionSequenceGroupOutput - # inherits from. 
-    return cast(List[List[SequenceGroupOutput]], output_by_sequence_group)

From 8b32464ac13fcafe32bebb2fb78447a3e762bb16 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Wed, 17 Sep 2025 13:21:28 -0400
Subject: [PATCH 372/584] Change log level from info to debug for IOProcessor
 (#24999)

Signed-off-by: Michael Goin <mgoin64@gmail.com>
---
 vllm/plugins/io_processors/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py
index c5c4f6f8d97c3..3b17211b1b835 100644
--- a/vllm/plugins/io_processors/__init__.py
+++ b/vllm/plugins/io_processors/__init__.py
@@ -33,7 +33,7 @@ def get_io_processor(
         model_plugin = config_plugin
 
     if model_plugin is None:
-        logger.info("No IOProcessor plugins requested by the model")
+        logger.debug("No IOProcessor plugins requested by the model")
         return None
 
     logger.debug("IOProcessor plugin to be loaded %s", model_plugin)

From eb68c2dcd972fdeca7908268e4ba35c77a699f82 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 17 Sep 2025 11:03:16 -0700
Subject: [PATCH 373/584] [CI] Revert back prepare_prompts and check_answers
 (#25087)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/models/test_transformers.py             |  3 +-
 tests/utils.py                                | 47 +++++++++++++++++++
 .../v1/e2e/test_correctness_sliding_window.py |  3 +-
 3 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 66ff8f7a54d31..ba9c3bebc437e 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -8,8 +8,7 @@ import pytest
 from vllm.platforms import current_platform
 
 from ..conftest import HfRunner, VllmRunner
-from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
-from ..utils import multi_gpu_test
+from ..utils import multi_gpu_test, prep_prompts
 from .utils import check_logprobs_close
 
 
diff --git a/tests/utils.py b/tests/utils.py
index 16e1e60393290..9a27c3de4533d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -8,6 +8,7 @@ import functools
 import importlib
 import json
 import os
+import random
 import signal
 import subprocess
 import sys
@@ -1150,3 +1151,49 @@ def override_cutlass_fp8_supported(value: bool):
         "vllm.model_executor.layers.quantization.utils.w8a8_utils.cutlass_fp8_supported",
         return_value=value):
         yield
+
+
+def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
+    """
+    Generate prompts with a bunch of assignments,
+    then ask for the value of one of them.
+    The prompt is just under 10k tokens; the sliding window is 4k,
+    so the answer is outside the sliding window but should still be correct.
+ Args: + batch_size: number of prompts to generate + ln_range: an argument to control the length of the prompt + """ + prompts: list[str] = [] + answer: list[int] = [] + indices: list[int] = [] + random.seed(1) + for _ in range(batch_size): + idx = random.randint(30, 90) + indices.append(idx) + prompt = "```python\n# We set a number of variables, " + \ + f"x{idx} will be important later\n" + ln = random.randint(*ln_range) + for k in range(30, ln): + v = random.randint(10, 99) + if k == idx: + answer.append(v) + prompt += f"x{k} = {v}\n" + prompt += f"# Now, we check the value of x{idx}:\n" + prompt += f"assert x{idx} == " + prompts.append(prompt) + return prompts, answer, indices + + +def check_answers(indices: list[int], + answer: list[int], + outputs: list[str], + accept_rate: float = 0.7): + answer2 = [int(text[0:2].strip()) for text in outputs] + print(list(zip(indices, zip(answer, answer2)))) + numok = 0 + for a1, a2 in zip(answer, answer2): + if a1 == a2: + numok += 1 + frac_ok = numok / len(answer) + print(f"Num OK: {numok}/{len(answer)} {frac_ok}") + assert frac_ok >= accept_rate diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index 4dfe1d3bb33fa..5b0c154722510 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -6,8 +6,7 @@ import pytest from vllm import LLM, SamplingParams -from ...core.block.e2e.test_correctness_sliding_window import (check_answers, - prep_prompts) +from ...utils import check_answers, prep_prompts @dataclass From 9d442b7c48288d6a65cbaca1bba10392523fe94d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 11:08:45 -0700 Subject: [PATCH 374/584] [V0 Deprecation] Remove V0 tests in test_sequence.py (#25088) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/test_sequence.py | 97 +----------------------------------------- 1 file changed, 1 insertion(+), 96 deletions(-) diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 1b019be9e56dc..da9826ff05058 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,104 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest import torch -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - SequenceData, SequenceOutput) - -from .core.utils import create_dummy_prompt - - -@pytest.fixture -def sample_outputs(): - return [ - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) - ], - prompt_logprobs=None) for i in range(5) - ] - - -@pytest.fixture -def sampler_output(sample_outputs): - return SamplerOutput(outputs=sample_outputs) - - -def test_sampler_output_initialization(sampler_output, sample_outputs): - assert len(sampler_output) == len(sample_outputs) - assert sampler_output.sampled_token_probs is None - assert sampler_output.sampled_token_ids is None - - -def test_sampler_output_getitem(sampler_output, sample_outputs): - assert sampler_output[2] == sample_outputs[2] - - -def test_sampler_output_setitem(sampler_output): - new_output = CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) - ], - prompt_logprobs=None) - sampler_output[2] = new_output - assert sampler_output[2] == new_output - - -def test_sampler_output_len(sampler_output, sample_outputs): - 
assert len(sampler_output) == len(sample_outputs) - - -def test_sampler_output_eq(sample_outputs): - sampler_output1 = SamplerOutput(outputs=sample_outputs) - sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) - sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) - assert sampler_output1 == sampler_output2 - assert sampler_output1 != sampler_output3 - - -def test_sequence_data_prefill(): - seq_data = SequenceData.from_seqs([1, 2, 3, 4]) - assert seq_data.get_num_uncomputed_tokens() == 4 - assert seq_data.get_num_computed_tokens() == 0 - # advance by 2 - seq_data.update_num_computed_tokens(2) - assert seq_data.get_num_uncomputed_tokens() == 2 - assert seq_data.get_num_computed_tokens() == 2 - - # advance by 1 - seq_data.update_num_computed_tokens(1) - assert seq_data.get_num_uncomputed_tokens() == 1 - assert seq_data.get_num_computed_tokens() == 3 - - # append tokens and reset, simulating recompute - seq_data.append_token_id(1, logprob=0.0) - seq_data.reset_state_for_recompute() - assert seq_data.get_num_uncomputed_tokens() == 5 - assert seq_data.get_num_computed_tokens() == 0 - - -def test_sequence_group_stage(): - _, seq_group = create_dummy_prompt("1", 12) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(6) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(5) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(1) - assert seq_group.is_prefill() is False - seqs = seq_group.get_seqs() - assert len(seqs) == 1 - seqs[0].data.append_token_id(1, logprob=0.0) - for seq in seq_group.get_seqs(): - seq.reset_state_for_recompute() - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(5) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(7) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(1) - assert seq_group.is_prefill() is False +from vllm.sequence import IntermediateTensors def test_sequence_intermediate_tensors_equal(): From e3db5ebb66590031ecfd3338de41b6f1ee95bf2a Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 17 Sep 2025 14:15:05 -0400 Subject: [PATCH 375/584] [CI Bugfix] Fix failing test_model_load_with_params tests due to tokenizer refactor (#25086) Signed-off-by: mgoin <mgoin64@gmail.com> --- tests/model_executor/test_model_load_with_params.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0ade75b7e6228..c7b15c6ae1186 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -47,8 +47,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.tokenizer.model_max_length == 512 + assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) @@ -87,8 +87,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base" - assert model_tokenizer.tokenizer.model_max_length == 512 + assert model_config.tokenizer == 
"intfloat/multilingual-e5-base" + assert model_tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) @@ -116,8 +116,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_tokenizer = vllm_model.llm.llm_engine.tokenizer - assert model_tokenizer.tokenizer_id == model_name + assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name def check_model(model): assert isinstance(model, RobertaEmbeddingModel) From 7ae9887542bf0fc45cc44de583290c41234a09c4 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:53:12 -0400 Subject: [PATCH 376/584] [V1] Logits processor docs (#22919) Signed-off-by: Andrew Feldman <afeldman@redhat.com> Signed-off-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: Joseph Marinier <Joseph.Marinier@gmail.com> --- docs/design/logits_processors.md | 559 ++++++++++++++++++ docs/features/custom_arguments.md | 46 ++ docs/features/custom_logitsprocs.md | 445 ++++++++++++++ .../logits_processor/custom.py | 10 +- tests/v1/logits_processors/utils.py | 7 +- vllm/v1/sample/logits_processor/interface.py | 6 +- vllm/v1/sample/logits_processor/state.py | 8 +- 7 files changed, 1065 insertions(+), 16 deletions(-) create mode 100644 docs/design/logits_processors.md create mode 100644 docs/features/custom_arguments.md create mode 100644 docs/features/custom_logitsprocs.md diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md new file mode 100644 index 0000000000000..20d78ca3aae2c --- /dev/null +++ b/docs/design/logits_processors.md @@ -0,0 +1,559 @@ +# Logits Processors + +!!! important + Some logits processors design changes are still in progress and the API may + change in the near future. We hope to stabilize this part of the API soon + +This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors. + +## Logits Processors Background + +A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. + +In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. + +## Logits Processors in the vLLM engine + +The vLLM engine's persistent batch data structure maintains a list of loaded logits processors. + +In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful. + +In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits. + +### Updating Logits Processor Internal State + +At the beginning of each engine step, the persistent batch may add, discard and/or reorder requests in response to the scheduler output. 
After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that each logits processor's internal state is reorganized to match the new persistent batch state at the beginning of the engine step.
+
+The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state:
+
+??? code "Model Runner Updates Logits Processor States"
+
+    ``` python
+    # gpu_model_runner.py
+
+    class GPUModelRunner(...):
+
+        ...
+
+        def execute_model(self, scheduler_output, ...):
+            self._update_states(scheduler_output)
+
+            ...
+
+        def _update_states(...):
+
+            ...
+
+            # ...update persistent batch to reflect new/finished requests & reordering
+            # of requests within batch...
+
+            ...
+
+            self.input_batch.refresh_metadata()
+
+
+    # gpu_input_batch.py
+
+    class InputBatch:
+
+        ...
+
+        def refresh_metadata(self):
+
+            ...
+
+            # Update each logits processor's state to reflect persistent batch state
+            batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
+            for logit_proc in self.logitsprocs.all:
+                logit_proc.update_state(batch_update)
+
+            ...
+
+
+    # vllm/v1/sample/logits_processor/interface.py
+
+    @dataclass(frozen=True)
+    class BatchUpdate:
+        # Batch state-change data structure which is passed to logits processors'
+        # update_state() methods
+
+        batch_size: int
+
+        removed: Sequence[RemovedRequest]
+        added: Sequence[AddedRequest]
+        moved: Sequence[MovedRequest]
+
+    ```
+
+### Applying Logits Processors to the Model Output Logits
+
+After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logits, yielding transformed logits (the `apply()` methods may modify the logits in-place or out-of-place, although in-place is more memory-efficient). This process is shown in the pseudocode below.
+
+Note that the sampler accesses the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`.
+
+??? code "Apply logits processors to model output logits"
+
+    ``` python
+    # gpu_model_runner.py
+
+    class GPUModelRunner(...):
+
+        ...
+
+        def execute_model(self, scheduler_output, ...):
+            # (discussed in previous section)
+            self._update_states(scheduler_output)
+
+            ...
+
+            # ...run model inference to obtain logits...
+
+            ...
+
+            # Invoke sampler, which applies logits processors
+            sampler_output = self.sampler(logits=logits,
+                                          sampling_metadata=sampling_metadata)
+
+            ...
+
+
+    # sampler.py
+
+    class Sampler(nn.Module):
+
+        ...
+
+        def forward(self, logits, sampling_metadata):
+
+            ...
+
+            # Apply non-argmax-invariant logits processors to model output logits
+            for processor in (sampling_metadata.logitsprocs.non_argmax_invariant):
+                logits = processor.apply(logits)
+
+            sampled = self.sample(logits, sampling_metadata)
+
+            ...
+
+            # ...return sampler output data structure...
+
+
+        def sample(self, logits, sampling_metadata):
+
+            ...
+
+            # ...exit early if all requests are greedy-sampling...
+
+            ...
+
+            # Apply argmax-invariant logits processors
+            for processor in sampling_metadata.logitsprocs.argmax_invariant:
+                logits = processor.apply(logits)
+
+            ...
+ + # ...perform sampling and return sampling result... + ``` + +At sampling time, the sampler checks whether all requests in the persistent batch employ greedy sampling. If that is the case, the sampler saves compute by skipping "argmax-invariant" logits processors. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request). + +* An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests. + +* A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests. + +The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling. + +## Logits Processor Programming Model + +The previous sections alluded to the interfaces which vLLM logits processors must support. This section introduces in full the programming model for implementing logits processors that are compatible with the vLLM engine, including the `LogitsProcessor` base class and its interface methods as well as the `BatchUpdate` data structure for representing persistent batch state changes, both of which are shown in the code below: + +??? code "`LogitsProcessor` base class and `BatchUpdate` data structure" + + ``` python + from abc import ABC, abstractmethod + from collections.abc import Sequence + from dataclasses import dataclass + from enum import Enum, auto + from typing import TYPE_CHECKING, Optional + + import torch + + from vllm import SamplingParams + + if TYPE_CHECKING: + from vllm.config import VllmConfig + + + class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + + # (index, params, prompt_tok_ids, output_tok_ids) tuples for new + # requests added to the batch. + AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + + # (index 1, index 2, directionality) tuples representing + # one-way moves or two-way swaps of requests in batch + MovedRequest = tuple[int, int, MoveDirectionality] + + # Batch indices of any removed requests. + RemovedRequest = int + + + @dataclass(frozen=True) + class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. 
+        #
+        # Key assumption: the `output_tok_ids` list (which is an element of each
+        # tuple in `added`) is a reference to the request's running output tokens
+        # list; via this reference, the logits processors always see the latest
+        # list of generated output tokens
+        removed: Sequence[RemovedRequest]
+        moved: Sequence[MovedRequest]
+        added: Sequence[AddedRequest]
+
+
+    class LogitsProcessor(ABC):
+
+        @abstractmethod
+        def __init__(self, vllm_config: "VllmConfig", device: torch.device,
+                     is_pin_memory: bool) -> None:
+            raise NotImplementedError
+
+        @abstractmethod
+        def apply(self, logits: torch.Tensor) -> torch.Tensor:
+            raise NotImplementedError
+
+        @abstractmethod
+        def is_argmax_invariant(self) -> bool:
+            """True if logits processor has no impact on the
+            argmax computation in greedy sampling.
+            NOTE: may or may not have the same value for all
+            instances of a given LogitsProcessor subclass,
+            depending on subclass implementation.
+            """
+            raise NotImplementedError
+
+        @abstractmethod
+        def update_state(
+            self,
+            batch_update: Optional["BatchUpdate"],
+        ) -> None:
+            """Called when there are new output tokens, prior
+            to each forward pass.
+
+            Args:
+                batch_update is non-None iff there have been
+                changes to the batch makeup.
+            """
+            raise NotImplementedError
+
+    ```
+
+A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
+
+* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
+    * `vllm_config`: engine configuration data structure
+    * `device`: hardware accelerator device info
+    * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
+
+* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
+    * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
+    * Apply logits processor transformation at batch granularity
+    * Return a transformed `(num_requests) x (vocab_size)` logits tensor
+    * You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient
+
+* `is_argmax_invariant(self) -> bool`:
+    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
+    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
+
+* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
+    * Use the `BatchUpdate` members to update logits processor internal state
+    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
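+
+To make these interface requirements concrete, here is a minimal sketch of a conforming subclass. The class name `NoOpLogitsProcessor` is hypothetical and used only for illustration; it is not an existing vLLM class.
+
+??? code "Minimal sketch of a `LogitsProcessor` subclass"
+
+    ``` python
+    # Hypothetical example, not part of vLLM: a processor that leaves the
+    # logits unchanged but implements the full interface described above.
+    class NoOpLogitsProcessor(LogitsProcessor):
+
+        def __init__(self, vllm_config: "VllmConfig", device: torch.device,
+                     is_pin_memory: bool) -> None:
+            # A no-op processor needs no per-request state
+            pass
+
+        def is_argmax_invariant(self) -> bool:
+            # The identity transform can never change the argmax
+            return True
+
+        def update_state(self, batch_update: Optional["BatchUpdate"]) -> None:
+            # No internal state to reconcile with the persistent batch
+            pass
+
+        def apply(self, logits: torch.Tensor) -> torch.Tensor:
+            # Return the logits unmodified; an in-place transformation
+            # would also be permitted here
+            return logits
+    ```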
+
+### `BatchUpdate` data structure
+
+The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
+
+* **Remove:** remove (without replacement) request at index `i`
+
+    * A Remove is represented in `BatchUpdate.removed` by an `int` (representing `i`)
+
+    * Effect of remove-at-index on batch:
+
+        ``` text
+        Batch: [A,B,C]
+        Remove @ i: 1
+
+        =>
+
+        New Batch: [A,x,C] # Discard B and leave an empty slot
+        ```
+
+* **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded.
+
+    * An Add is represented in `BatchUpdate.added` as a tuple of
+
+        ``` text
+        (index, new request SamplingParams, prompt token ids, output token ids)
+        ```
+
+    * `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference. **This is important for LogitsProcessors that take into account the tokens generated so far**.
+
+    * The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields
+
+    * If index `i` currently holds a request, a replacement occurs:
+
+        ``` text
+        Batch: [A,B,C]
+        New request to be added @ i: D @ 1
+
+        =>
+
+        New Batch: [A,D,C] # Add D, discard B
+        ```
+
+    * If index `i` does not currently hold a request (because `i` is out of bounds of the current batch size):
+
+        ``` text
+        Batch: [A,B,C]
+        New request to be added @ i: D @ 3
+
+        =>
+
+        New Batch: [A,B,C,D] # Add D, extending batch
+        ```
+
+* **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d`
+
+    * A Move is represented in `BatchUpdate.moved` as a tuple of
+
+        ``` text
+        (s, d, UNIDIRECTIONAL or SWAP)
+        ```
+
+    * If the Move specifies `UNIDIRECTIONAL`:
+
+        * The request at index `s` is moved to index `d`; index `s` becomes an empty slot
+
+            ``` text
+            Batch: [A,x,C,D]
+            Unidirectionally Move s -> d: 3 -> 1
+
+            =>
+
+            New Batch: [A,D,C,x] # Move D to 1, leaving empty slot at 3
+            ```
+
+        * If another request already resided at index `d`, it is replaced and discarded
+
+            ``` text
+            Batch: [A,B,C,D]
+            Unidirectionally Move s -> d: 3 -> 1
+
+            =>
+
+            New Batch: [A,D,C,x] # Move D to 1, discarding B and leaving empty slot at 3
+            ```
+
+    * If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices
+
+        ``` text
+        Batch: [A,B,C,D]
+        Swap Move s <-> d: 3 <-> 1
+
+        =>
+
+        New Batch: [A,D,C,B] # Swap B and D
+        ```
+
+Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the beginning of the engine step.
+
+### How the vLLM engine builds the `BatchUpdate` data structure
+
+Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
+
+1. Identify indices of requests which finished in the current engine step
+
+2. Identify new requests introduced in the current step
+
+3. Use Add operations to replace as many finished requests with new requests as possible, in order of increasing index of the replaced request starting with the lowest index
+
+4. Based on the relative number of new and finished requests:
+
+    1. If the numbers of new and finished requests are the same, proceed to the next step
+
+    2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
+
+    3. *If there are fewer new requests than finished requests:*
+
+        * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
+
+        * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
+
+        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+
+5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
+
+Notes:
+
+* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
+
+* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
+    * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5, not 3
+    * In other words, Move operations can be assumed to be applied after Adds and Removes
+
+* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
+
+* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
+
+#### Example: Batch Update with Fewer New Requests Than Finished Requests
+
+The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated; additionally, the attention backend performs a swap to optimize the batch ordering.
+
+``` text
+Batch state (beginning of engine step): [A,B,C,D]
+Batch size: 4
+
+New requests: E
+
+Finished requests: A, C
+
+Processing steps (using BatchUpdate abstraction):
+
+1. Add E at index 0
+
+[E,B,C,D] # Discard A
+Batch size: 4
+
+2. Remove at index 2
+
+[E,B,x,D] # Discard C, empty slot at index 2
+Batch size: 4
+
+3. Condense batch with a Unidirectional Move 3 -> 2 operation and shrink batch
+
+[E,B,D] x # Empty slot is now outside batch
+Batch size: 3
+
+4. Attention backend optimization: reorder batch with Swap 0 <-> 1
+
+[B,E,D]
+Batch size: 3
+
+```
+
+The resulting `BatchUpdate` data structure will look like
+
+``` text
+BatchUpdate instance
+* added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)]
+* removed: [2] # request C was removed without replacement
+* moved: [(3,2,UNIDIRECTIONAL),(0,1,SWAP)]
+```
+
+#### Example: Batch Update with More New Requests Than Finished Requests
+
+The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated; additionally, the attention backend performs a swap to optimize the batch ordering.
+
+``` text
+Batch state (beginning of engine step): [A,B,C,D]
+Batch size: 4
+
+New requests: E,F
+
+Finished requests: C
+
+Processing steps (using BatchUpdate abstraction):
+
+1. Add E at index 2
+
+[A,B,E,D] # Discard C
+Batch size: 4
+
+2. Add F at index 4 (current max batch index + 1)
+
+[A,B,E,D,F] # Extend batch by 1
+Batch size: 5
+
+3. Attention backend optimization: reorder batch with Swap 0 <-> 1
+
+[B,A,E,D,F]
+Batch size: 5
+
+```
+
+Note that batch condensation is skipped because there are no empty slots left behind by Remove operations.
+
+The resulting `BatchUpdate` data structure will look like
+
+``` text
+BatchUpdate instance
+* added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)]
+* removed: [] # no requests were removed without replacement
+* moved: [(0,1,SWAP)]
+```
+
+## How to Introduce a New Logits Processor to vLLM
+
+### Best Practices for Writing Built-In Logits Processors
+
+* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
+    * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
+    * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
+
+* It is up to the logits processor author to determine:
+
+    1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API
+
+    2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the built-in logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor
+
+    3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the built-in logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the built-in logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
+
+    * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None`
+
+* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
+
+* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
+
+### Built-In Logits Processors
+
+Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently employs the following built-in logits processors based on the programming model described above:
+
+* Min-P
+
+* Logit bias
+
+* Min-tokens
+
+Review these logits processor implementations for guidance on writing built-in logits processors.
+
+Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
+
+* Allowed token IDs
+
+* Bad words
+
+* Repetition penalty
+
+* Frequency penalty
+
+* Presence penalty
+
+* Temperature
+
+* Top-K
+
+* Top-P
+
+### Custom Logits Processors
+
+vLLM can be augmented with [user-provided custom logits processors](../features/custom_logitsprocs.md).
diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md
new file mode 100644
index 0000000000000..74ed40835b4d4
--- /dev/null
+++ b/docs/features/custom_arguments.md
@@ -0,0 +1,46 @@
+# Custom Arguments
+
+You can use vLLM *custom arguments* to pass in arguments which are not part of the vLLM `SamplingParams` and REST API specifications. Adding or removing a vLLM custom argument does not require recompiling vLLM, since the custom arguments are passed in as a dictionary.
+
+Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
+
+## Offline Custom Arguments
+
+Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
+
+``` python
+SamplingParams(extra_args={"your_custom_arg_name": 67})
+```
+
+This allows arguments which are not already part of `SamplingParams` to be passed into `LLM` as part of a request.
+
+## Online Custom Arguments
+
+The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`.
The example below integrates custom arguments into a vLLM REST API request: + +``` bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + ... + "vllm_xargs": {"your_custom_arg": 67} + }' +``` + +Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argument: + +``` python +batch = await client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + ..., + extra_body={ + "vllm_xargs": { + "your_custom_arg": 67 + } + } +) +``` + +!!! note + `vllm_xargs` is assigned to `SamplingParams.extra_args` under the hood, so code which uses `SamplingParams.extra_args` is compatible with both offline and online scenarios. diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md new file mode 100644 index 0000000000000..201b340c5972c --- /dev/null +++ b/docs/features/custom_logitsprocs.md @@ -0,0 +1,445 @@ +# Custom Logits Processors + +!!! important + Some logits processors design changes are still in progress and the API may + change in the near future. We hope to stabilize this part of the API soon + +A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor. + +This document shows how to write, load and use a custom logits processor. + +## Logits Processors Background + +A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior. + +In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax. 
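+
+As a mental model, the sketch below mimics this row-wise, batch-granularity behavior outside of vLLM; the batch size, vocabulary size and the scaling transformation are illustrative assumptions only:
+
+``` python
+import torch
+
+# A toy batch of 4 requests over a vocabulary of 8 tokens
+logits = torch.randn(4, 8)  # (num_requests) x (vocab_size)
+
+# Suppose only requests 0 and 2 enable the (hypothetical) processor
+enabled_rows = torch.tensor([0, 2])
+
+# Transform only the enabled rows (here, an arbitrary scaling);
+# rows 1 and 3 flow through to softmax unmodified
+logits[enabled_rows] = logits[enabled_rows] * 2.0
+probs = torch.softmax(logits, dim=-1)
+```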
+
+## Creating a Custom Logits Processor
+
+Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
+
+* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
+    * `vllm_config`: engine configuration data structure
+    * `device`: hardware accelerator device info
+    * `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
+
+* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
+    * Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
+    * Apply logits processor transformation at batch granularity
+    * Return a transformed `(num_requests) x (vocab_size)` logits tensor
+    * You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient
+
+* `is_argmax_invariant(self) -> bool`:
+    * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
+    * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
+
+* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+    * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
+    * Use the `BatchUpdate` members to update logits processor internal state
+    * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
+
+### How the vLLM engine builds the `BatchUpdate` data structure
+
+!!! important
+    Some logits processors design changes are still in progress. We expect
+    that in the future you will not need to account for batch state changes
+    when implementing a logits processor, and the information in this section
+    will become irrelevant.
+
+Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
+
+1. Identify indices of requests which finished in the current engine step
+
+2. Identify new requests introduced in the current step
+
+3. Use Add operations to replace as many finished requests with new requests as possible, in order of increasing index of the replaced request starting with the lowest index
+
+4. Based on the relative number of new and finished requests:
+
+    1. If the numbers of new and finished requests are the same, proceed to the next step
+
+    2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
+
+    3. *If there are fewer new requests than finished requests:*
+
+        * Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
+
+        * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
+
+        * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
+
+5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
+
+Notes:
+
+* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
+
+* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
+    * Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5, not 3
+    * In other words, Move operations can be assumed to be applied after Adds and Removes
+
+* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
+
+* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
+
+### Passing Custom Arguments to a Custom Logits Processor
+
+Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`).
+
+### Example Custom Logits Processor Implementation
+
+The contrived example below implements a custom logits processor which consumes a `(num_requests) x (vocab_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request:
+
+???
code "Example custom logits processor definition" + + ``` python + from typing import Optional + import torch + from vllm.config import VllmConfig + from vllm.sampling_params import SamplingParams + from vllm.v1.sample.logits_processor import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) + + class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, int] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + else: + self.req_info.pop(index, None) + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + cols = torch.tensor( + list(self.req_info.values()), dtype=torch.long, device=logits.device + ) + rows = torch.tensor( + list(self.req_info.keys()), dtype=torch.long, device=logits.device + ) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits + ``` + +In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor. + +The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch. + +### Wrapping an Existing Request-Level Logits Processor + +Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. 
This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation: + +``` python +RequestLogitsProcessor = Union[ + + # (output token ids, logits tensor) -> logits tensor + Callable[[list[int], Tensor], Tensor], + + # (prompt token ids, output token ids, logits tensor) -> logits tensor + Callable[[list[int], list[int], Tensor], Tensor], +] +``` + +While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above. + +You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance: + +??? code "Example of Wrapping a Request-Level Logits Processor" + + ``` python + ... + + from vllm.v1.sample.logits_processor import ( + AdapterLogitsProcessor, # Wrapper base-class + RequestLogitsProcessor, # Request-level logitsproc type annotation + ) + + ... + + # Stand-in for your request-level logits processor: + class DummyPerReqLogitsProcessor: + """The request-level logits processor masks out all logits except the + token id identified by `target_token`""" + + def __init__(self, target_token: int) -> None: + """Specify `target_token`""" + self.target_token = target_token + + def __call__( + self, + output_ids: list[int], + logits: torch.Tensor, + ) -> torch.Tensor: + val_to_keep = logits[self.target_token].item() + logits[:] = float("-inf") + logits[self.target_token] = val_to_keep + return logits + + ... + + # Example of wrapping the request-level logits processor: + class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): + """Example of wrapping a fake request-level logit processor to create a + batch-level logits processor""" + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """This method returns a new request-level logits processor, customized + to the `target_token` value associated with a particular request. + + Returns None if the logits processor should not be applied to the + particular request. To use the logits processor the request must have + a "target_token" custom argument with an integer value. 
+
+            Args:
+                params: per-request sampling params
+
+            Returns:
+                `Callable` request logits processor, or None
+            """
+            target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+                "target_token"
+            )
+            if target_token is None:
+                return None
+            if not isinstance(target_token, int):
+                logger.warning(
+                    "target_token value %s is not int; not applying logits"
+                    " processor to request.",
+                    target_token,
+                )
+                return None
+            return DummyPerReqLogitsProcessor(target_token)
+    ```
+
+!!! note
+    Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question.
+
+Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request-level logits processor, you can pass the custom subclass to vLLM via any of the methods described in the following section.
+
+## Ways to Load Your Custom Logits Processor in vLLM
+
+Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
+
+This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
+
+### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time
+
+This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` and `AsyncLLM` Python constructors, or as a CLI argument to `vllm serve` with the following syntax
+
+``` bash
+vllm serve ... --logits_processors <logits processor 1> <logits processor 2> ...
+```
+
+The only requirements on the FQCN are
+
+1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module
+
+2. The class-name portion of the FQCN must be possible to import from the loaded module
+
+3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor`
+
+See examples below:
+
+??? code "Passing custom logits processor FQCN to `LLM` in Python"
+
+    ``` python
+    # Pass in FQCN
+    llm = LLM(
+        model="facebook/opt-125m",
+        logits_processors=["your.module.path:DummyLogitsProcessor"],
+    )
+    ```
+
+??? code "Passing custom logits processor FQCN to `AsyncLLM` in Python"
+
+    ``` python
+    # Pass in FQCN
+    engine_args = AsyncEngineArgs(
+        model="facebook/opt-125m",
+        logits_processors=["your.module.path:DummyLogitsProcessor"])
+    async_llm = AsyncLLM.from_engine_args(engine_args)
+    ```
+
+??? code "Passing custom logits processor FQCN to vLLM server via CLI"
+
+    ```bash
+    vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor
+    ```
+
+### Method 2: Automatically Detect Custom Logits Processors Installed in Your Python Environment As Entry Points
+
+[`setuptools`](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) can enable installed packages to make themselves available as plugins to other Python programs, via pieces of metadata known as "entry points".
+
+During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds.
+
+Suppose that you have developed a Python package that holds your custom logits processors.
You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. The example below shows how to add an entrypoint to your project's `pyproject.toml` file: + +??? code "Exposing a custom logits processor as a Python entrypoint" + + ``` toml + [project.entry-points."vllm.logits_processors"] + dummy_logits_processor = "your.module.path:DummyLogitsProcessor" + ``` + +Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` or `AsyncLLM` constructors or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point. + +!!! note + vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping. + +### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor + +You can pass one or more custom logits processor class objects to the `LLM` and `AsyncLLM` constructors. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` or `AsyncLLM` is instantiated, or (2) imported from a Python package. + +??? code "Passing custom logits processor class object to `LLM` or `AsyncLLM` in Python" + + ``` python + # Import custom logits processor + from some.module import DummyLogitsProcessor + + # ...or... + + # Define custom logits processor locally + from vllm.v1.sample.logits_processor import LogitsProcessor + + class DummyLogitsProcessor(LogitsProcessor): + # See DummyLogitsProcessor implementation above + ... + + # Pass class object to LLM constructor + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + + # Pass class object to AsyncLLM constructor + engine_args = AsyncEngineArgs(model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor]) + async_llm = AsyncLLM.from_engine_args(engine_args) + ``` + +## Invoking a Custom Logits Processor Against a Request + +The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor. + +The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior. + +??? code "vLLM REST API: configure custom logits processor for a request" + + ``` bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + ... + "vllm_xargs": {"target_token": 67} + }' + ``` + +??? code "OpenAI SDK: configure custom logits processor for a request" + + ``` python + batch = await client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + ..., + extra_body={ + "vllm_xargs": { + "target_token": 67 + } + } + ) + ``` + +??? code "Offline: configure custom logits processor for an `LLM` request" + + ``` python + outputs_logitproc = llm.generate("your prompt", + SamplingParams(..., + extra_args={"target_token": 67})) + ``` + +??? 
code "Offline: configure custom logits processor for an `AsyncLLM` request" + + ``` python + async for out in engine.generate(request_id="your request id", + prompt="your prompt", + sampling_params=SamplingParams(..., + extra_args={"target_token": 67})): + + # Process async request outputs + ... + ``` + +## Best Practices for Writing Custom Logits Processors + +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. + +* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity + * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` + * However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor + * **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity + +* It is up to the logits processor author to determine: + + 1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state + + * **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance. + + 2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor + + * **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request + + 3. 
**The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor + + * Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None` + + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default + +* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove) + + * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default + +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/offline_inference/logits_processor/custom.py index 3e122319169eb..4112a498f37ab 100644 --- a/examples/offline_inference/logits_processor/custom.py +++ b/examples/offline_inference/logits_processor/custom.py @@ -56,7 +56,6 @@ class DummyLogitsProcessor(LogitsProcessor): self.req_info: dict[int, int] = {} def is_argmax_invariant(self) -> bool: - """Never impacts greedy sampling""" return False def update_state(self, batch_update: Optional[BatchUpdate]): @@ -75,13 +74,12 @@ class DummyLogitsProcessor(LogitsProcessor): return logits # Save target values before modification - rows_list = list(self.req_info.keys()) cols = torch.tensor( - [self.req_info[i] for i in rows_list], - dtype=torch.long, - device=logits.device, + list(self.req_info.values()), dtype=torch.long, device=logits.device + ) + rows = torch.tensor( + list(self.req_info.keys()), dtype=torch.long, device=logits.device ) - rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) values_to_keep = logits[rows, cols].clone() # Mask all but target tokens diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index 7ec35bd3eb639..d3b7f314da099 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -69,11 +69,12 @@ class DummyLogitsProcessor(LogitsProcessor): return logits # Save target values before modification - rows_list = list(self.req_info.keys()) - cols = torch.tensor([self.req_info[i] for i in rows_list], + cols = torch.tensor(list(self.req_info.values()), + dtype=torch.long, + device=logits.device) + rows = torch.tensor(list(self.req_info.keys()), dtype=torch.long, device=logits.device) - 
rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) values_to_keep = logits[rows, cols].clone() # Mask all but target tokens diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 683fc7c00dfb2..04027359909a6 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -21,6 +21,9 @@ class MoveDirectionality(Enum): SWAP = auto() +# Batch indices of any removed requests. +RemovedRequest = int + # (index, params, prompt_tok_ids, output_tok_ids) tuples for new # requests added to the batch. AddedRequest = tuple[int, SamplingParams, list[int], list[int]] @@ -29,9 +32,6 @@ AddedRequest = tuple[int, SamplingParams, list[int], list[int]] # one-way moves or two-way swaps of requests in batch MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. -RemovedRequest = int - @dataclass(frozen=True) class BatchUpdate: diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py index 31cece58c7db5..0a1196559d3e3 100644 --- a/vllm/v1/sample/logits_processor/state.py +++ b/vllm/v1/sample/logits_processor/state.py @@ -36,18 +36,18 @@ class BatchUpdateBuilder: _removed: list[RemovedRequest] _is_removed_sorted: bool - moved: list[MovedRequest] added: list[AddedRequest] + moved: list[MovedRequest] def __init__( self, removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, added: Optional[list[AddedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, ) -> None: self._removed = removed or [] - self.moved = moved or [] self.added = added or [] + self.moved = moved or [] self._is_removed_sorted = False # Used to track changes in the pooling case @@ -107,8 +107,8 @@ class BatchUpdateBuilder: """Returns True if there were any changes to the batch.""" self._is_removed_sorted = False self._removed.clear() - self.moved.clear() self.added.clear() + self.moved.clear() batch_changed = self.batch_changed self.batch_changed = False return batch_changed From ee5fd491504913383e9e7b6782038f4ee7d36cfd Mon Sep 17 00:00:00 2001 From: Yihua Cheng <yihua98@uchicago.edu> Date: Wed, 17 Sep 2025 12:37:29 -0700 Subject: [PATCH 377/584] [Misc] Update owners for KV connector and V1 offloading (#25041) Signed-off-by: ApostaC <yihua98@uchicago.edu> --- .github/CODEOWNERS | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 73184d4e6b125..771dd2e172586 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -22,7 +22,7 @@ /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg -/vllm/distributed/kv_transfer @NickLucche +/vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, @@ -35,8 +35,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/triton_attn.py @tdoublep -/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC /vllm/v1/kv_cache_interface.py @heheda12345 +/vllm/v1/offloading @ApostaC # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -54,11 
+55,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep -/tests/v1/kv_connector/nixl_integration @NickLucche +/tests/v1/kv_connector/nixl_integration @NickLucche +/tests/v1/kv_connector @ApostaC +/tests/v1/offloading @ApostaC # Docs /docs @hmellor From 883131544faf78f31f85a0350f74ea913ee6ef9c Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad <mangkad.bsdsba2027@aim.edu> Date: Thu, 18 Sep 2025 04:33:11 +0800 Subject: [PATCH 378/584] [Bugfix] Update import path for bc_linter_include (#24766) Signed-off-by: Mohammad Miadh Angkad <mangkad.bsdsba2027@aim.edu> --- vllm/v1/core/sched/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 56ab396d6d937..3ec5b91bf2860 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -6,7 +6,7 @@ from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING, Optional -from vllm import bc_linter_include +from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: import numpy as np From f20c3b095109fd2a016e201d550bcaae6414e9fc Mon Sep 17 00:00:00 2001 From: ahao-anyscale <ahao@anyscale.com> Date: Wed, 17 Sep 2025 13:42:09 -0700 Subject: [PATCH 379/584] [BUG] Exclude .pth files when pulling remote files (#25092) Signed-off-by: ahao-anyscale <ahao@anyscale.com> --- vllm/config/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 5f30576099714..64be2f38c6a31 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -845,7 +845,8 @@ class ModelConfig: object_storage_model.pull_files(model, ignore_pattern=[ "*.pt", "*.safetensors", - "*.bin", "*.tensors" + "*.bin", "*.tensors", + "*.pth" ]) self.tokenizer = object_storage_model.dir return @@ -853,9 +854,12 @@ class ModelConfig: # Only download tokenizer if needed and not already handled if is_runai_obj_uri(tokenizer): object_storage_tokenizer = ObjectStorageModel() - object_storage_tokenizer.pull_files( - model, - ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) + object_storage_tokenizer.pull_files(model, + ignore_pattern=[ + "*.pt", "*.safetensors", + "*.bin", "*.tensors", + "*.pth" + ]) self.tokenizer = object_storage_tokenizer.dir def _get_encoder_config(self): From 3c068c637b9b1f945d5aa572da40553ab1691896 Mon Sep 17 00:00:00 2001 From: czhu-cohere <conway.zhu@cohere.com> Date: Wed, 17 Sep 2025 17:35:32 -0400 Subject: [PATCH 380/584] [Kernel] Faster pre-processing time for W4A8 (#23972) Signed-off-by: czhu-cohere <conway.zhu@cohere.com> --- .../cutlass_w4a8/w4a8_mm_entry.cu | 72 ++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index 57bcbaae45dda..2d1568b08651c 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -25,6 +25,8 
@@ #include "cutlass_extensions/common.hpp" #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include <cuda_runtime.h> + namespace vllm::cutlass_w4a8 { using namespace cute; @@ -393,6 +395,71 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) { return packed_scales; } +/* + GPU-accelerated implementation of cutlass::unified_encode_int4b. + Constructs a lookup table in constant memory to map 8 bits + (two 4-bit values) at a time. Assumes memory is contiguous + and pointers are 16-byte aligned. +*/ +__constant__ uint8_t kNibbleLUT[256]; + +__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, + size_t nbytes) { + constexpr size_t V = sizeof(uint4); // 16 bytes + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t nthreads = size_t(gridDim.x) * blockDim.x; + const size_t nvec = nbytes / V; + + // 1-D grid-stride loop over 16-byte chunks + for (size_t vec = tid; vec < nvec; vec += nthreads) { + uint4 v = reinterpret_cast<const uint4*>(in)[vec]; + uint8_t* b = reinterpret_cast<uint8_t*>(&v); +#pragma unroll + for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; + reinterpret_cast<uint4*>(out)[vec] = v; + } +} + +static bool upload_lut() { + std::array<uint8_t, 256> lut{}; + auto map_nib = [](uint8_t v) -> uint8_t { + // 1..7 -> (8 - v); keep 0 and 8..15 + return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v); + }; + for (int b = 0; b < 256; ++b) { + uint8_t lo = b & 0xF; + uint8_t hi = (b >> 4) & 0xF; + lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); + } + cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), + /*offset=*/0, cudaMemcpyHostToDevice); + + return (e == cudaSuccess); +} + +static bool unified_encode_int4b(cutlass::int4b_t const* in, + cutlass::int4b_t* out, size_t num_int4_elems) { + // Build/upload LUT + if (!upload_lut()) return false; + + static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, + "int4 storage must be 1 byte"); + const size_t nbytes = num_int4_elems >> 1; + + auto* in_bytes = reinterpret_cast<uint8_t const*>(in); + auto* out_bytes = reinterpret_cast<uint8_t*>(out); + + // kernel launch params + constexpr int block = 256; + const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors + int grid = int((nvec + block - 1) / block); + if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel + + unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes); + cudaError_t err = cudaGetLastError(); + return (err == cudaSuccess); +} + torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { TORCH_CHECK(B.dtype() == torch::kInt32); TORCH_CHECK(B.dim() == 2); @@ -401,6 +468,7 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { int k = B.size(0) * PackFactor; // logical k int n = B.size(1); + TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks"); auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr()); auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr()); @@ -409,7 +477,9 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { LayoutB_Reordered layout_B_reordered = cute::tile_to_shape(LayoutAtomQuant{}, shape_B); - cutlass::unified_encode_int4b(B_ptr, B_packed_ptr, n * k); + bool ok = + vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k); + TORCH_CHECK(ok, "unified_encode_int4b failed"); cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered); return B_packed; From bff2e5f1d6201c817d29f49581e6c15724a4e186 Mon Sep 17 00:00:00 2001 From: 
Andrew Xia <axia@meta.com> Date: Wed, 17 Sep 2025 15:04:28 -0700 Subject: [PATCH 381/584] [gpt-oss][2] fix types for streaming (#24556) Signed-off-by: Andrew Xia <axia@meta.com> --- vllm/entrypoints/openai/api_server.py | 9 +- vllm/entrypoints/openai/protocol.py | 37 ++++- vllm/entrypoints/openai/serving_responses.py | 154 ++++++++----------- 3 files changed, 104 insertions(+), 96 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 527193c913394..c07e95e9370a0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -27,7 +27,6 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse -from openai import BaseModel from prometheus_client import make_asgi_app from prometheus_fastapi_instrumentator import Instrumentator from starlette.concurrency import iterate_in_threadpool @@ -67,7 +66,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, RerankRequest, RerankResponse, ResponsesRequest, ResponsesResponse, ScoreRequest, - ScoreResponse, TokenizeRequest, + ScoreResponse, + StreamingResponsesResponse, + TokenizeRequest, TokenizeResponse, TranscriptionRequest, TranscriptionResponse, @@ -481,8 +482,8 @@ async def show_version(): async def _convert_stream_to_sse_events( - generator: AsyncGenerator[BaseModel, - None]) -> AsyncGenerator[str, None]: + generator: AsyncGenerator[StreamingResponsesResponse, None] +) -> AsyncGenerator[str, None]: """Convert the generator to a stream of events in SSE format""" async for event in generator: event_type = getattr(event, 'type', 'unknown') diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6b4c3f531dbce..2505e493625d8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -18,10 +18,19 @@ from openai.types.chat.chat_completion_audio import ( from openai.types.chat.chat_completion_message import ( Annotation as OpenAIAnnotation) # yapf: enable -from openai.types.responses import (ResponseFunctionToolCall, - ResponseInputItemParam, ResponseOutputItem, - ResponsePrompt, ResponseReasoningItem, - ResponseStatus) +from openai.types.responses import ( + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallCompletedEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent, + ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, + ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent, + ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, + ResponseStatus, ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent) # Backward compatibility for OpenAI client versions try: # For older openai versions (< 1.100.0) @@ -251,6 +260,26 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, ResponseReasoningItem, ResponseFunctionToolCall] +StreamingResponsesResponse: TypeAlias = Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseOutputItemAddedEvent, + 
ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, + ResponseWebSearchCallCompletedEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseCodeInterpreterCallCompletedEvent, +] + class ResponsesRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index b81b2c7223efc..469d74272b0e6 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -10,24 +10,28 @@ from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Callable, Final, Optional, TypeVar, Union +from typing import Callable, Final, Optional, Union import jinja2 -import openai.types.responses as openai_responses_types from fastapi import Request -from openai import BaseModel # yapf conflicts with isort for this block # yapf: disable -from openai.types.responses import (ResponseCreatedEvent, - ResponseFunctionToolCall, - ResponseInProgressEvent, - ResponseOutputItem, - ResponseOutputItemDoneEvent, - ResponseOutputMessage, ResponseOutputText, - ResponseReasoningItem, - ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent, - ResponseStatus, response_text_delta_event) +from openai.types.responses import ( + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallCompletedEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseCodeInterpreterToolCallParam, ResponseCompletedEvent, + ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, + ResponseCreatedEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch, + ResponseInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, ResponseOutputMessage, ResponseOutputText, + ResponseReasoningItem, ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, ResponseStatus, ResponseTextDeltaEvent, + ResponseTextDoneEvent, ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent, + response_function_web_search, response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -55,7 +59,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse, OutputTokensDetails, RequestResponseMetadata, ResponsesRequest, - ResponsesResponse, ResponseUsage) + ResponsesResponse, ResponseUsage, + StreamingResponsesResponse) # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -175,7 +180,7 @@ class OpenAIServingResponses(OpenAIServing): # HACK(wuhang): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove events from the store. 
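        # Maps request_id -> (events emitted so far, a signal used to
        # notify waiters that new events have arrived).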
- self.event_store: dict[str, tuple[deque[BaseModel], + self.event_store: dict[str, tuple[deque[StreamingResponsesResponse], asyncio.Event]] = {} self.background_tasks: dict[str, asyncio.Task] = {} @@ -186,8 +191,8 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, raw_request: Optional[Request] = None, - ) -> Union[AsyncGenerator[BaseModel, None], ResponsesResponse, - ErrorResponse]: + ) -> Union[AsyncGenerator[StreamingResponsesResponse, None], + ResponsesResponse, ErrorResponse]: error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) @@ -814,7 +819,7 @@ class OpenAIServingResponses(OpenAIServing): *args, **kwargs, ): - event_deque: deque[BaseModel] = deque() + event_deque: deque[StreamingResponsesResponse] = deque() new_event_signal = asyncio.Event() self.event_store[request.request_id] = (event_deque, new_event_signal) response = None @@ -867,7 +872,7 @@ class OpenAIServingResponses(OpenAIServing): self, response_id: str, starting_after: Optional[int] = None, - ) -> AsyncGenerator[BaseModel, None]: + ) -> AsyncGenerator[StreamingResponsesResponse, None]: if response_id not in self.event_store: raise ValueError(f"Unknown response_id: {response_id}") @@ -893,8 +898,8 @@ class OpenAIServingResponses(OpenAIServing): response_id: str, starting_after: Optional[int], stream: Optional[bool], - ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[BaseModel, - None]]: + ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[ + StreamingResponsesResponse, None]]: if not response_id.startswith("resp_"): return self._make_invalid_id_error(response_id) @@ -977,9 +982,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _increment_sequence_number_and_return: Callable[[BaseModel], - BaseModel], - ) -> AsyncGenerator[BaseModel, None]: + _increment_sequence_number_and_return: Callable[ + [StreamingResponsesResponse], StreamingResponsesResponse], + ) -> AsyncGenerator[StreamingResponsesResponse, None]: current_content_index = 0 current_output_index = 0 current_item_id = "" @@ -1017,13 +1022,11 @@ class OpenAIServingResponses(OpenAIServing): current_item_id = str(uuid.uuid4()) if delta_message.reasoning_content: yield _increment_sequence_number_and_return( - openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseReasoningItem( + item=ResponseReasoningItem( type="reasoning", id=current_item_id, summary=[], @@ -1032,13 +1035,11 @@ class OpenAIServingResponses(OpenAIServing): )) else: yield _increment_sequence_number_and_return( - openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. 
- ResponseOutputMessage( + item=ResponseOutputMessage( id=current_item_id, type="message", role="assistant", @@ -1047,13 +1048,13 @@ class OpenAIServingResponses(OpenAIServing): ), )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseContentPartAddedEvent( + ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=openai_responses_types.ResponseOutputText( + part=ResponseOutputText( type="output_text", text="", annotations=[], @@ -1104,11 +1105,11 @@ class OpenAIServingResponses(OpenAIServing): item=reasoning_item, )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseOutputItemAddedEvent( + ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types.ResponseOutputMessage( + item=ResponseOutputMessage( id=current_item_id, type="message", role="assistant", @@ -1119,13 +1120,13 @@ class OpenAIServingResponses(OpenAIServing): current_output_index += 1 current_item_id = str(uuid.uuid4()) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseContentPartAddedEvent( + ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=openai_responses_types.ResponseOutputText( + part=ResponseOutputText( type="output_text", text="", annotations=[], @@ -1148,7 +1149,7 @@ class OpenAIServingResponses(OpenAIServing): )) elif delta_message.content is not None: yield _increment_sequence_number_and_return( - openai_responses_types.ResponseTextDeltaEvent( + ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, content_index=current_content_index, @@ -1204,7 +1205,7 @@ class OpenAIServingResponses(OpenAIServing): for pm in previous_delta_messages if pm.content is not None) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseTextDoneEvent( + ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, output_index=current_output_index, @@ -1220,7 +1221,7 @@ class OpenAIServingResponses(OpenAIServing): annotations=[], ) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseContentPartDoneEvent( + ResponseContentPartDoneEvent( type="response.content_part.done", sequence_number=-1, item_id=current_item_id, @@ -1257,9 +1258,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _increment_sequence_number_and_return: Callable[[BaseModel], - BaseModel], - ) -> AsyncGenerator[BaseModel, None]: + _increment_sequence_number_and_return: Callable[ + [StreamingResponsesResponse], StreamingResponsesResponse], + ) -> AsyncGenerator[StreamingResponsesResponse, None]: current_content_index = -1 current_output_index = 0 current_item_id: str = "" @@ -1314,7 +1315,7 @@ class OpenAIServingResponses(OpenAIServing): annotations=[], ) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseTextDoneEvent( + ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, output_index=current_output_index, @@ -1324,7 +1325,6 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types. 
ResponseContentPartDoneEvent( type="response.content_part.done", sequence_number=-1, @@ -1334,7 +1334,7 @@ class OpenAIServingResponses(OpenAIServing): part=text_content, )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseOutputItemDoneEvent( + ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, output_index=current_output_index, @@ -1355,13 +1355,11 @@ class OpenAIServingResponses(OpenAIServing): sent_output_item_added = True current_item_id = f"msg_{random_uuid()}" yield _increment_sequence_number_and_return( - openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseOutputMessage( + item=ResponseOutputMessage( id=current_item_id, type="message", role="assistant", @@ -1371,14 +1369,13 @@ class OpenAIServingResponses(OpenAIServing): )) current_content_index += 1 yield _increment_sequence_number_and_return( - openai_responses_types. ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=openai_responses_types.ResponseOutputText( + part=ResponseOutputText( type="output_text", text="", annotations=[], @@ -1386,7 +1383,7 @@ class OpenAIServingResponses(OpenAIServing): ), )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseTextDeltaEvent( + ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, content_index=current_content_index, @@ -1402,13 +1399,11 @@ class OpenAIServingResponses(OpenAIServing): sent_output_item_added = True current_item_id = f"msg_{random_uuid()}" yield _increment_sequence_number_and_return( - openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseReasoningItem( + item=ResponseReasoningItem( type="reasoning", id=current_item_id, summary=[], @@ -1417,14 +1412,13 @@ class OpenAIServingResponses(OpenAIServing): )) current_content_index += 1 yield _increment_sequence_number_and_return( - openai_responses_types. ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=openai_responses_types.ResponseOutputText( + part=ResponseOutputText( type="output_text", text="", annotations=[], @@ -1450,13 +1444,11 @@ class OpenAIServingResponses(OpenAIServing): sent_output_item_added = True current_item_id = f"tool_{random_uuid()}" yield _increment_sequence_number_and_return( - openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseCodeInterpreterToolCallParam( + item=ResponseCodeInterpreterToolCallParam( type="code_interpreter_call", id=current_item_id, code=None, @@ -1466,7 +1458,6 @@ class OpenAIServingResponses(OpenAIServing): ), )) yield _increment_sequence_number_and_return( - openai_responses_types. ResponseCodeInterpreterCallInProgressEvent( type= "response.code_interpreter_call.in_progress", @@ -1475,7 +1466,6 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types. 
ResponseCodeInterpreterCallCodeDeltaEvent( type="response.code_interpreter_call_code.delta", sequence_number=-1, @@ -1495,14 +1485,12 @@ class OpenAIServingResponses(OpenAIServing): action = None parsed_args = json.loads(previous_item.content[0].text) if function_name == "search": - action = (openai_responses_types. - response_function_web_search.ActionSearch( - type="search", - query=parsed_args["query"], - )) + action = (response_function_web_search.ActionSearch( + type="search", + query=parsed_args["query"], + )) elif function_name == "open": action = ( - openai_responses_types. response_function_web_search.ActionOpenPage( type="open_page", # TODO: translate to url @@ -1510,7 +1498,6 @@ class OpenAIServingResponses(OpenAIServing): )) elif function_name == "find": action = ( - openai_responses_types. response_function_web_search.ActionFind( type="find", pattern=parsed_args["pattern"], @@ -1523,12 +1510,11 @@ class OpenAIServingResponses(OpenAIServing): current_item_id = f"tool_{random_uuid()}" yield _increment_sequence_number_and_return( - openai_responses_types.ResponseOutputItemAddedEvent( + ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - response_function_web_search. + item=response_function_web_search. ResponseFunctionWebSearch( # TODO: generate a unique id for web search call type="web_search_call", @@ -1538,7 +1524,6 @@ class OpenAIServingResponses(OpenAIServing): ), )) yield _increment_sequence_number_and_return( - openai_responses_types. ResponseWebSearchCallInProgressEvent( type="response.web_search_call.in_progress", sequence_number=-1, @@ -1546,7 +1531,6 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types. ResponseWebSearchCallSearchingEvent( type="response.web_search_call.searching", sequence_number=-1, @@ -1556,7 +1540,6 @@ class OpenAIServingResponses(OpenAIServing): # enqueue yield _increment_sequence_number_and_return( - openai_responses_types. ResponseWebSearchCallCompletedEvent( type="response.web_search_call.completed", sequence_number=-1, @@ -1564,12 +1547,11 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseOutputItemDoneEvent( + ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseFunctionWebSearch( + item=ResponseFunctionWebSearch( type="web_search_call", id=current_item_id, action=action, @@ -1582,7 +1564,6 @@ class OpenAIServingResponses(OpenAIServing): and previous_item.recipient is not None and previous_item.recipient.startswith("python")): yield _increment_sequence_number_and_return( - openai_responses_types. ResponseCodeInterpreterCallCodeDoneEvent( type="response.code_interpreter_call_code.done", sequence_number=-1, @@ -1591,7 +1572,6 @@ class OpenAIServingResponses(OpenAIServing): code=previous_item.content[0].text, )) yield _increment_sequence_number_and_return( - openai_responses_types. ResponseCodeInterpreterCallInterpretingEvent( type="response.code_interpreter_call.interpreting", sequence_number=-1, @@ -1599,7 +1579,6 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types. 
ResponseCodeInterpreterCallCompletedEvent( type="response.code_interpreter_call.completed", sequence_number=-1, @@ -1607,12 +1586,11 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, )) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseOutputItemDoneEvent( + ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, output_index=current_output_index, - item=openai_responses_types. - ResponseCodeInterpreterToolCallParam( + item=ResponseCodeInterpreterToolCallParam( type="code_interpreter_call", id=current_item_id, code=previous_item.content[0].text, @@ -1633,7 +1611,7 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: Optional[int] = None, - ) -> AsyncGenerator[BaseModel, None]: + ) -> AsyncGenerator[StreamingResponsesResponse, None]: # TODO: # 1. Handle disconnect @@ -1641,9 +1619,9 @@ class OpenAIServingResponses(OpenAIServing): sequence_number = 0 - T = TypeVar("T", bound=BaseModel) - - def _increment_sequence_number_and_return(event: T) -> T: + def _increment_sequence_number_and_return( + event: StreamingResponsesResponse + ) -> StreamingResponsesResponse: nonlocal sequence_number # Set sequence_number if the event has this attribute if hasattr(event, 'sequence_number'): @@ -1705,7 +1683,7 @@ class OpenAIServingResponses(OpenAIServing): created_time=created_time, ) yield _increment_sequence_number_and_return( - openai_responses_types.ResponseCompletedEvent( + ResponseCompletedEvent( type="response.completed", sequence_number=-1, response=final_response.model_dump(), From fedb75fa2790403b90ec6dc926fef9c6c5ccb7a6 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:06:38 -0400 Subject: [PATCH 382/584] [Bugfix][B200] Fix `cutlass_mla` hang (#24966) Signed-off-by: Alexander Matveev <amatveev@redhat.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp index 95e32559cd540..fbbc2e588c326 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -133,6 +133,14 @@ public: // printf(" sm_count = %d\n", sm_count); int max_splits = ceil_div(K, 128); max_splits = min(16, max_splits); + + // TODO: This avoids a hang when the batch size larger than 1 and + // there is more than 4 kv_splits. + // Discuss with NVIDIA how this can be fixed. 
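+    // Until then, cap the number of kv_splits at 2 whenever more than one
+    // batch entry is present: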
+ if (B > 1) { + max_splits = min(2, max_splits); + } + // printf(" max_splits = %d\n", max_splits); int sms_per_batch = max(1, sm_count / B); // printf(" sms_per_batch = %d\n", sms_per_batch); From 1a456c7c90afdb534d1203d7e4ea5747aada801c Mon Sep 17 00:00:00 2001 From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:29:14 -0500 Subject: [PATCH 383/584] Aiter mha fp8 fix (#24991) Signed-off-by: Doug Lehr <douglehr@amd.com> Co-authored-by: Doug Lehr <douglehr@amd.com> --- vllm/attention/ops/rocm_aiter_paged_attn.py | 4 ++-- vllm/v1/attention/backends/rocm_aiter_fa.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index ad97152e208b8..2a0336de8cf72 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -81,8 +81,8 @@ class AITERPagedAttention(PagedAttention): blocksparse_head_sliding_step=blocksparse_head_sliding_step) if "fp8" in kv_cache_dtype: - key_cache = key_cache.view(torch.float8_e4m3fnuz) - value_cache = value_cache.view(torch.float8_e4m3fnuz) + key_cache = key_cache.view(current_platform.fp8_dtype()) + value_cache = value_cache.view(current_platform.fp8_dtype()) if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: # use blocksparse paged attention diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index a4e2758bd311f..8eb3505cf274d 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -479,8 +479,8 @@ class AiterFlashAttentionImpl(AttentionImpl): ) if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(torch.float8_e4m3fnuz) - value_cache = value_cache.view(torch.float8_e4m3fnuz) + key_cache = key_cache.view(current_platform.fp8_dtype()) + value_cache = value_cache.view(current_platform.fp8_dtype()) if not attn_metadata.use_cascade: cu_seqlens_q = attn_metadata.query_start_loc From 9f882d879198200104fed7e166d40dd11039d217 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Wed, 17 Sep 2025 18:36:00 -0400 Subject: [PATCH 384/584] Disable failing GPT-OSS Eval (Blackwell) for now (#25107) Signed-off-by: mgoin <mgoin64@gmail.com> --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 133ba792680de..150dc40a91733 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -812,7 +812,7 @@ steps: timeout_in_minutes: 60 working_dir: "/vllm-workspace/" gpu: b200 - # optional: true + optional: true # disable while debugging source_file_dependencies: - tests/evals/gpt_oss - vllm/model_executor/models/gpt_oss.py From e67a79db03752e9ab7ed216bc99c30a16f45a33e Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Thu, 18 Sep 2025 06:36:29 +0800 Subject: [PATCH 385/584] [Bugfix] Refactor Flashinfer TRTLLM attention kernel selection logic (#24600) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- vllm/envs.py | 7 ++- vllm/utils/flashinfer.py | 70 ++++++++++++++++-------- vllm/v1/attention/backends/flashinfer.py | 17 ++++-- 3 files changed, 65 insertions(+), 29 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 385d2a7c51f26..eeed7771f0453 100755 --- a/vllm/envs.py +++ 
b/vllm/envs.py @@ -1223,9 +1223,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 1, use the TRTLLM attention backend in flashinfer. + # If set to 1/True, use the TRTLLM attention backend in flashinfer. + # If set to 0/False, use the default attention backend in flashinfer. + # If not set, auto-detect the attention backend in flashinfer. "VLLM_USE_TRTLLM_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + lambda: (None if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ else + os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true")), # If set to 1, when we use fp8 kv, we do not quantize Q to fp8 "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 83ec65c9b4594..2179bddae2435 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -154,28 +154,31 @@ def has_nvidia_artifactory() -> bool: @functools.cache -def supports_trtllm_attention() -> tuple[bool, Optional[str]]: - """Cache result which only depends on the environment""" - # This is a lambda, call it once - env_value = envs.VLLM_USE_TRTLLM_ATTENTION - +def supports_trtllm_attention() -> bool: + """ + TRTLLM attention is supported if the platform is SM100 and + NVIDIA artifactory is accessible + """ # Requires SM100 and NVIDIA artifactory to be accessible to download cubins - if not (current_platform.is_device_capability(100) - and has_nvidia_artifactory()): - return False, env_value + return current_platform.is_device_capability( + 100) and has_nvidia_artifactory() + +@functools.cache +def _force_use_trtllm_attention(env_value: Optional[bool]) -> Optional[bool]: + """Cache the env value for VLLM_USE_TRTLLM_ATTENTION""" if env_value is not None: logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - use_trtllm = (env_value == "1") - if use_trtllm: - logger.info_once("Using TRTLLM attention.") - return use_trtllm, env_value + return env_value - return True, None + +def force_use_trtllm_attention() -> Optional[bool]: + """ + Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set, + return ``True`` if TRTLLM attention is forced to be used, + return ``False`` if TRTLLM attention is forced to be not used. 
+ """ + return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) def use_trtllm_attention( @@ -185,18 +188,38 @@ def use_trtllm_attention( max_seq_len: int, kv_cache_dtype: str, q_dtype: torch.dtype, - is_prefill: bool, has_sinks: bool = False, ) -> bool: - use_trtllm, env_value = supports_trtllm_attention() - if not use_trtllm: + """Return ``True`` if TRTLLM attention is used.""" + force_use_trtllm = force_use_trtllm_attention() + + # Environment variable is set to 0 - respect it + if force_use_trtllm is not None and not force_use_trtllm: return False + # The platform is not supported + if not supports_trtllm_attention(): + if force_use_trtllm: + logger.warning_once( + "TRTLLM attention is not supported on this platform, " + "but VLLM_USE_TRTLLM_ATTENTION is set to 1") + return False + + # The combination of query and key heads is not supported if num_qo_heads % num_kv_heads != 0: + if force_use_trtllm: + logger.warning_once( + "TRTLLM attention is not supported for this combination of " + "query and key heads, but VLLM_USE_TRTLLM_ATTENTION is set to 1" + ) return False # Must use TRTLLM attention if query is FP8 quantized if q_dtype == current_platform.fp8_dtype(): + if has_sinks: + raise RuntimeError( + "TRTLLM FP8-qkv kernel is not supported for attention sinks. " + "Use kv_cache_dtype=auto for now.") logger.info_once("Using TRTLLM attention (query is quantized).") return True @@ -207,15 +230,17 @@ def use_trtllm_attention( "Using TRTLLM attention (required for attention sinks).") return True - if env_value is None: + if force_use_trtllm is None: # Environment variable not set - use auto-detection - use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 + use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto") if use_trtllm: logger.warning_once("Using TRTLLM attention (auto-detected).") return use_trtllm # Environment variable is set to 1 - respect it + logger.info_once( + "Using TRTLLM attention (VLLM_USE_TRTLLM_ATTENTION is set to 1)") return True @@ -367,6 +392,7 @@ __all__ = [ "has_nvidia_artifactory", "supports_trtllm_attention", "use_trtllm_attention", + "flashinfer_disable_q_quantization", "flashinfer_scaled_fp4_mm", "flashinfer_scaled_fp8_mm", ] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 98a4cf38bc195..dda6dd4fbea7a 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -282,7 +282,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): assert self.kv_cache_spec.dtype == self.model_config.dtype self.kv_cache_dtype = self.kv_cache_spec.dtype - if supports_trtllm_attention()[0] and \ + # Use model dtype as q dtype when TRTLLM attn is not supported, or + # VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION is set to 1. 
Otherwise, try to + # use fp8 q if kv cache is fp8, and will fall back to model dtype + # if TRTLLM attention kernel is not used when building attn metadata + if supports_trtllm_attention() and \ not flashinfer_disable_q_quantization(): self.q_data_type = self.kv_cache_dtype else: @@ -298,7 +302,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.window_left = self.global_hyperparameters.window_left self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap self.has_sinks = self.global_hyperparameters.has_sinks - if self.has_sinks and not supports_trtllm_attention()[0]: + if self.has_sinks and not supports_trtllm_attention(): raise NotImplementedError( "FlashInfer backend currently does not support attention " "sinks, please use trtllm on blackwell or flash attention on " @@ -477,14 +481,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): paged_kv_last_page_len_np, ) - # Check if any layer uses sinks (requires TRTLLM attention) prefill_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_prefill_tokens, max_seq_len, self.cache_dtype, self.q_data_type, - is_prefill=True, has_sinks=self.has_sinks) decode_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, @@ -492,13 +494,18 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): max_seq_len, self.cache_dtype, self.q_data_type, - is_prefill=False, has_sinks=self.has_sinks) if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm): raise NotImplementedError( "FlashInfer backend currently does not support attention " "sinks, please use trtllm on blackwell or flash attention on " "earlier GPUs.") + + # If TRTLLM attention is not used, the q quantization is not supported. + # Fall back to use model dtype. + if not (prefill_use_trtllm and decode_use_trtllm): + self.q_data_type = self.model_config.dtype + attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, q_data_type=self.q_data_type, From 2a4d6412e612d657d00daeafc0a569d86659021b Mon Sep 17 00:00:00 2001 From: Karan Goel <3261985+karan@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:41:18 -0700 Subject: [PATCH 386/584] Add a batched auto tune script (#25076) Signed-off-by: Karan Goel <karangoel@google.com> Signed-off-by: Karan Goel <3261985+karan@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- benchmarks/auto_tune/README.md | 67 +++++++++++++ benchmarks/auto_tune/batch_auto_tune.sh | 128 ++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100755 benchmarks/auto_tune/batch_auto_tune.sh diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index 3aa988aac2548..d1bdb4c43f10b 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters: 4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. 5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. + +## Batched `auto_tune` + +The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. 
It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
+
+### Prerequisites
+
+- **jq**: This script requires `jq` to parse the JSON configuration file.
+- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
+
+### How to Run
+
+1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
+
+2. **Execute the script**:
+
+    ```bash
+    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
+    ```
+
+    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
+    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
+
+### Configuration File
+
+The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
+
+Here is an example `runs_config.json` with two benchmark configurations (set `system` to `"TPU"` or `"GPU"` to match your hardware):
+
+```json
+[
+  {
+    "base": "/home/user",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "system": "TPU",
+    "tp": 8,
+    "input_len": 128,
+    "output_len": 2048,
+    "max_model_len": 2300,
+    "num_seqs_list": "128 256",
+    "num_batched_tokens_list": "8192 16384"
+  },
+  {
+    "base": "/home/user",
+    "model": "meta-llama/Llama-3.1-70B-Instruct",
+    "system": "TPU",
+    "tp": 8,
+    "input_len": 4000,
+    "output_len": 16,
+    "max_model_len": 4096,
+    "num_seqs_list": "64 128",
+    "num_batched_tokens_list": "4096 8192",
+    "max_latency_allowed_ms": 500
+  }
+]
+```
+
+### Output
+
+The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
+
+- `run_id`: A unique identifier for the run, derived from the timestamp.
+- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
+- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
+
+A summary of successful and failed runs is also printed to the console upon completion.
diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh
new file mode 100755
index 0000000000000..57ef20daf6b71
--- /dev/null
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+INPUT_JSON="$1"
+GCS_PATH="$2" # Optional GCS path for uploading results for each run
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
+
+if [[ -z "$INPUT_JSON" ]]; then
+    echo "Error: Input JSON file not provided."
+    echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
+    exit 1
+fi
+
+if [[ ! -f "$INPUT_JSON" ]]; then
+    echo "Error: File not found at '$INPUT_JSON'"
+    exit 1
+fi
+
+if ! command -v jq &> /dev/null; then
+    echo "Error: 'jq' command not found. Please install jq to process the JSON input."
+    exit 1
+fi
+
+if [[ -n "$GCS_PATH" ]] && !
command -v gcloud &> /dev/null; then + echo "Error: 'gcloud' command not found, but a GCS_PATH was provided." + exit 1 +fi + +SUCCESS_COUNT=0 +FAILURE_COUNT=0 +FAILED_RUNS=() +SCRIPT_START_TIME=$(date +%s) + +json_content=$(cat "$INPUT_JSON") +if ! num_runs=$(echo "$json_content" | jq 'length'); then + echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2 + exit 1 +fi + +echo "Found $num_runs benchmark configurations in $INPUT_JSON." +echo "Starting benchmark runs..." +echo "--------------------------------------------------" + +for i in $(seq 0 $(($num_runs - 1))); do + run_object=$(echo "$json_content" | jq ".[$i]") + + RUN_START_TIME=$(date +%s) + ENV_VARS_ARRAY=() + # Dynamically create env vars from the JSON object's keys + for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do + value=$(echo "$run_object" | jq -r ".$key") + var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_') + ENV_VARS_ARRAY+=("${var_name}=${value}") + done + + echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}" + + # Execute auto_tune.sh and capture output + RUN_OUTPUT_FILE=$(mktemp) + if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then + STATUS="SUCCESS" + ((SUCCESS_COUNT++)) + else + STATUS="FAILURE" + ((FAILURE_COUNT++)) + FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") + fi + + RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") + rm "$RUN_OUTPUT_FILE" + + # Parse results and optionally upload them to GCS + RUN_ID="" + RESULTS="" + GCS_RESULTS_URL="" + if [[ "$STATUS" == "SUCCESS" ]]; then + RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true) + + if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then + RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")") + RESULT_DIR=$(dirname "$RESULT_FILE_PATH") + RESULTS=$(cat "$RESULT_FILE_PATH") + + if [[ -n "$GCS_PATH" ]]; then + GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}" + echo "Uploading results to GCS..." + if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then + echo "GCS upload successful." + else + echo "Warning: GCS upload failed for RUN_ID $RUN_ID." + fi + fi + else + echo "Warning: Could not find result file for a successful run." + STATUS="WARNING_NO_RESULT_FILE" + fi + fi + + # Add the results back into the JSON object for this run + json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \ + '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}') + + RUN_END_TIME=$(date +%s) + echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS" + echo "--------------------------------------------------" + + # Save intermediate progress back to the file + echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON" + +done + +SCRIPT_END_TIME=$(date +%s) +echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds." +echo +echo "====================== SUMMARY ======================" +echo "Successful runs: $SUCCESS_COUNT" +echo "Failed runs: $FAILURE_COUNT" +echo "===================================================" + +if [[ $FAILURE_COUNT -gt 0 ]]; then + echo "Details of failed runs (see JSON file for full parameters):" + for failed in "${FAILED_RUNS[@]}"; do + echo " - $failed" + done +fi + +echo "Updated results have been saved to '$INPUT_JSON'." 
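Once a batch finishes, the per-run fields that `batch_auto_tune.sh` writes back into the configuration file make triage straightforward with `jq`. A minimal sketch, assuming the `runs_config.json` name from the README example above:

```bash
# One line per run: run_id, status, and where the artifacts live.
jq -r '.[] | [.run_id // "n/a", .status, .gcs_results // "local"] | @tsv' runs_config.json

# Collect runs whose status is anything other than SUCCESS, e.g. to retry them.
jq -c '[.[] | select(.status != "SUCCESS")]' runs_config.json
```

Both commands use only the fields the script itself adds (`run_id`, `status`, `gcs_results`), so they apply to any configuration file the script has updated.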
From e6585ddb451ba6056e044184f7fc88dcc13f8cfe Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Thu, 18 Sep 2025 07:37:23 +0800 Subject: [PATCH 387/584] [Bugfix] Fix accuracy issue for silu_mul + nvfp4 quant fusion kernel (#24833) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- .../activation_nvfp4_quant_fusion_kernels.cu | 120 ++++------------- tests/compile/test_silu_mul_quant_fusion.py | 13 +- .../quantization/test_silu_mul_nvfp4_quant.py | 75 +++++++++++ .../test_silu_nvfp4_quant_fusion.py | 126 ------------------ 5 files changed, 110 insertions(+), 226 deletions(-) create mode 100644 tests/kernels/quantization/test_silu_mul_nvfp4_quant.py delete mode 100644 tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 150dc40a91733..08c10180fc224 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -796,7 +796,7 @@ steps: # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index b4eb141cb4883..74fde23782ce5 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -30,109 +30,41 @@ namespace vllm { +// silu in float32 +__device__ __forceinline__ float silu(float x) { + return __fdividef(x, (1.f + __expf(-x))); +} + +__device__ __forceinline__ float2 silu2(float2 x) { + return make_float2(silu(x.x), silu(x.y)); +} + template <class Type> -__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec, - PackedVec<Type>& vec2) { +__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec, + PackedVec<Type>& vec2) { PackedVec<Type> result; + using packed_type = typename TypeConverter<Type>::Type; + #pragma unroll for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) { + // silu_mul in float32 if constexpr (std::is_same_v<Type, half>) { - half2 val(0.5f, 0.5f); - half2 t0 = __hmul2(vec.elts[i], val); - half2 t1 = __hfma2(h2tanh(t0), val, val); - half2 t2 = __hmul2(vec.elts[i], t1); - result.elts[i] = __hmul2(t2, vec2.elts[i]); + float2 silu_vec = silu2(__half22float2(vec.elts[i])); + result.elts[i] = + __float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i]))); } else { - __nv_bfloat162 val(0.5f, 0.5f); - __nv_bfloat162 t0 = __hmul2(vec.elts[i], val); - __nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val); - __nv_bfloat162 t2 = __hmul2(vec.elts[i], t1); - result.elts[i] = __hmul2(t2, vec2.elts[i]); + float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i])); + result.elts[i] = __float22bfloat162_rn( + __fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i]))); } } return result; } -// Quantizes the provided PackedVec into the uint32_t output -template <class Type, bool UE8M0_SF = false> 
-__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, - PackedVec<Type>& vec2, - float SFScaleVal, - uint8_t* SFout) { - PackedVec<Type> out_silu = compute_silu(vec, vec2); - // Get absolute maximum values among the local 8 values. - auto localMax = __habs2(out_silu.elts[0]); - -// Local maximum value. -#pragma unroll - for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - localMax = __hmax2(localMax, __habs2(out_silu.elts[i])); - } - - // Get the absolute maximum among all 16 values (two threads). - localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); - // Get the final absolute maximum values. - float vecMax = float(__hmax(localMax.x, localMax.y)); - - // Get the SF (max value of the vector / max value of e2m1). - // maximum value of e2m1 = 6.0. - // TODO: use half as compute data type. - float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); - // 8 bits representation of the SF. - uint8_t fp8SFVal; - // Write the SF to global memory (STG.8). - if constexpr (UE8M0_SF) { - // Extract the 8 exponent bits from float32. - // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. - uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23; - fp8SFVal = tmp & 0xff; - // Convert back to fp32. - reinterpret_cast<uint32_t&>(SFValue) = tmp << 23; - } else { - // Here SFValue is always positive, so E4M3 is the same as UE4M3. - __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); - reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; - // Convert back to fp32. - SFValue = float(tmp); - } - // Get the output scale. - // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * - // reciprocal(SFScaleVal)) - float outputScale = - SFValue != 0 ? reciprocal_approximate_ftz( - SFValue * reciprocal_approximate_ftz(SFScaleVal)) - : 0.0f; - - if (SFout) { - // Write the SF to global memory (STG.8). - *SFout = fp8SFVal; - } - - // Convert the input to float. - float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - -#pragma unroll - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v<Type, half>) { - fp2Vals[i] = __half22float2(out_silu.elts[i]); - } else { - fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]); - } - fp2Vals[i].x *= outputScale; - fp2Vals[i].y *= outputScale; - } - - // Convert to e2m1 values. - uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); - - // Write the e2m1 values to global memory. - return e2m1Vec; -} - // Use UE4M3 by default. template <class Type, bool UE8M0_SF = false> __global__ void __launch_bounds__(1024, 4) - silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec<Type>; @@ -160,16 +92,18 @@ __global__ void __launch_bounds__(1024, 4) // Get the output tensor offset. // Same as inOffset because 8 elements are packed into one uint32_t. 
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; - ; auto& out_pos = out[outOffset]; + // Compute silu and mul + PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2); + auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>( rowIdx, colIdx, numCols, SFout); - out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>( - in_vec, in_vec2, SFScaleVal, sf_out); + out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal, + sf_out); } } } @@ -204,7 +138,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] { using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type; auto input_ptr = static_cast<cuda_type const*>(input.data_ptr()); - vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>( + vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>( m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr), reinterpret_cast<uint32_t*>(sf_out)); diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 736db80a2f379..ae190d25cad62 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -98,8 +98,9 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): return [FUSED_OPS[kNvfp4Quant]] -@pytest.mark.parametrize("num_tokens", [64]) -@pytest.mark.parametrize("hidden_size", [128]) +@pytest.mark.parametrize("num_tokens", [32, 64]) +@pytest.mark.parametrize("hidden_size", [128, 256]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize( "model_class", cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] @@ -110,13 +111,13 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): [True, False] if cutlass_fp8_supported() else [True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") -def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, +def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class, cuda_force_torch): if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch: pytest.skip("Duplicate tests for NVFP4") torch.set_default_device("cuda") - torch.set_default_dtype(torch.float16) + torch.set_default_dtype(dtype) x = torch.rand(num_tokens, hidden_size * 2) @@ -145,8 +146,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, elif model_class == TestSiluMulNvfp4QuantModel: atol, rtol = 1e-1, 1e-1 - torch.testing.assert_close(result[0].to(dtype=torch.float16), - result2[0].to(dtype=torch.float16), + torch.testing.assert_close(result[0].to(dtype=dtype), + result2[0].to(dtype=dtype), atol=atol, rtol=rtol) diff --git a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py new file mode 100644 index 0000000000000..a40d0c4ef1224 --- /dev/null +++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype) +from vllm._custom_ops import scaled_fp4_quant +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms import current_platform + +if not 
current_platform.has_device_capability(100): + pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True) + +FP4_DTYPE = torch.uint8 +FP8_DTYPE = current_platform.fp8_dtype() + +DTYPES = [torch.float16, torch.bfloat16] +SHAPES = [(128, 256), (128, 128), (256, 256), (256, 128)] +BLOCK_SIZE = 16 + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@torch.inference_mode() +def test_silu_mul_nvfp4_quant( + dtype: torch.dtype, + shape: tuple[int, int], +) -> None: + current_platform.seed_everything(42) + device = 'cuda:0' + torch.set_default_device(device) + + x = torch.randn(shape, dtype=dtype) + + # ref op + ref_output = SiluAndMul().forward_native(x) + ref_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.abs(ref_output).max().to(torch.float32)) + ref_output_quant, ref_block_scale = scaled_fp4_quant( + ref_output, ref_global_scale) + + # fused op + fused_output_quant = torch.empty_like(ref_output_quant) + fused_block_scale = torch.empty_like(ref_block_scale) + torch.ops._C.silu_and_mul_nvfp4_quant(fused_output_quant, + fused_block_scale, x, + ref_global_scale) + + # check dtype + assert ref_output_quant.dtype == FP4_DTYPE + assert fused_output_quant.dtype == FP4_DTYPE + assert ref_output_quant.shape == fused_output_quant.shape + + assert ref_block_scale.dtype == FP8_DTYPE + assert fused_block_scale.dtype == FP8_DTYPE + assert ref_block_scale.shape == fused_block_scale.shape + + # check dequantized output + ref_output_dequant = dequantize_nvfp4_to_dtype(ref_output_quant, + ref_block_scale, + ref_global_scale, dtype, + device) + fused_output_dequant = dequantize_nvfp4_to_dtype(fused_output_quant, + fused_block_scale, + ref_global_scale, dtype, + device) + + atol, rtol = 3e-1, 3e-1 + torch.testing.assert_close(ref_output_dequant, + fused_output_dequant, + atol=atol, + rtol=rtol) diff --git a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py deleted file mode 100644 index 969f14cc3fe62..0000000000000 --- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.platforms import current_platform -from vllm.scalar_type import scalar_types - -if not current_platform.has_device_capability(100): - pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", - allow_module_level=True) - -DTYPES = [torch.float16, torch.bfloat16] -SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)] -SEEDS = [42] -CUDA_DEVICES = ['cuda:0'] - -FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() -FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max - -BLOCK_SIZE = 16 - - -def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, - global_scale: torch.Tensor, - ref_output_scale: torch.Tensor) -> torch.Tensor: - silu_and_mul_out = silu_and_mul.forward_native(x) - assert not current_platform.is_rocm() - assert silu_and_mul_out.ndim >= 1, ( - f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.') - other_dims = 1 if silu_and_mul_out.ndim == 1 else -1 - silu_and_mul_out = silu_and_mul_out.reshape(other_dims, - silu_and_mul_out.shape[-1]) - m, n = silu_and_mul_out.shape - device = silu_and_mul_out.device - - # Two fp4 values will be packed into an uint8. 
- out = torch.empty((m, n // 2), device=device, dtype=torch.uint8) - - output_scale = ref_output_scale - - torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale, - global_scale) - - return out, output_scale - - -def ops_impl(x: torch.Tensor, global_scale: torch.Tensor, - ref_output_scale: torch.Tensor) -> torch.Tensor: - out_shape = (x.shape[0], x.shape[1] // 4) - output_scale = ref_output_scale - out = torch.empty(out_shape, dtype=torch.uint8, device=x.device) - torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale) - return out, output_scale - - -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("shape", SHAPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_quantize_to_fp4( - dtype: torch.dtype, - shape: tuple[int, int], - seed: int, - device: str, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - - m, n = shape - - x = torch.randn((m, n), dtype=dtype) - tensor_amax = torch.abs(x).max().to(torch.float32) - global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax - - block_size = 16 - - assert n % block_size == 0, ( - f'last dim has to be multiple of 16, but got {n}.') - assert x.dtype in (torch.float16, torch.bfloat16), ( - f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.') - - round_up = lambda x, y: (x + y - 1) // y * y - rounded_m = round_up(x.shape[0], 128) - scale_n = x.shape[1] // (2 * block_size) - rounded_n = round_up(scale_n, 4) - output_scale = torch.empty((rounded_m, rounded_n // 4), - device=x.device, - dtype=torch.int32) - - layer = SiluAndMul() - - ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale) - - fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale) - - assert ref_out.dtype == torch.uint8 - assert fusion_out.dtype == torch.uint8 - assert ref_out.shape == fusion_out.shape - - assert ref_out_scale.dtype == torch.int32 - assert fusion_out_scale.dtype == torch.int32 - assert ref_out_scale.shape == fusion_out_scale.shape - - # Allow up to 2% of mismatched values since BF16 has accuracy issues. 
- mis_threshold = 0.02 - atol = 0.4 - rtol = 0.4 - ref_logits = ref_out[-1] - fusion_logits = fusion_out[-1] - - mis_count = torch.sum( - torch.abs(fusion_logits - ref_logits) > (atol + - rtol * torch.abs(ref_logits))) - mis_ratio = mis_count / fusion_logits.numel() - - assert mis_ratio < mis_threshold, \ - f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}" - - torch.testing.assert_close(ref_out_scale, fusion_out_scale) - - opcheck(torch.ops._C.silu_and_mul_nvfp4_quant, - (fusion_out, fusion_out_scale, x, global_scale)) From 5963b98b465007e3cfb0d39447e4459a8afa96dc Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 17 Sep 2025 19:43:31 -0400 Subject: [PATCH 388/584] [Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537) Signed-off-by: Bill Nell <bnell@redhat.com> --- .../kernels/benchmark_cutlass_fp4_moe.py | 58 +- .../kernels/benchmark_grouped_gemm_cutlass.py | 43 +- benchmarks/kernels/benchmark_moe.py | 73 +- .../moe/modular_kernel_tools/common.py | 110 +-- .../make_feature_matrix.py | 5 +- .../moe/modular_kernel_tools/mk_objects.py | 163 ++-- tests/kernels/moe/test_batched_deepgemm.py | 20 +- tests/kernels/moe/test_batched_moe.py | 4 +- tests/kernels/moe/test_block_fp8.py | 79 +- tests/kernels/moe/test_block_int8.py | 45 +- tests/kernels/moe/test_cutlass_moe.py | 53 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 87 +- tests/kernels/moe/test_deepep_moe.py | 70 +- tests/kernels/moe/test_deepgemm.py | 34 +- tests/kernels/moe/test_flashinfer.py | 32 +- tests/kernels/moe/test_flashinfer_moe.py | 68 +- .../moe/test_gpt_oss_triton_kernels.py | 30 +- .../moe/test_modular_kernel_combinations.py | 40 +- tests/kernels/moe/test_moe.py | 34 +- tests/kernels/moe/test_nvfp4_moe.py | 19 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 21 +- tests/kernels/moe/test_pplx_moe.py | 62 +- tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 15 +- tests/kernels/moe/utils.py | 149 +++- .../kernels/quantization/test_int8_kernel.py | 35 +- .../layers/fused_moe/__init__.py | 8 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 38 +- .../batched_triton_or_deep_gemm_moe.py | 58 +- .../model_executor/layers/fused_moe/config.py | 688 +++++++++++----- .../layers/fused_moe/cutlass_moe.py | 165 ++-- .../layers/fused_moe/deep_gemm_moe.py | 65 +- .../fused_moe/deepep_ht_prepare_finalize.py | 13 +- .../fused_moe/deepep_ll_prepare_finalize.py | 45 +- .../fused_moe/flashinfer_cutlass_moe.py | 50 +- .../flashinfer_cutlass_prepare_finalize.py | 7 +- .../layers/fused_moe/flashinfer_trtllm_moe.py | 185 +++++ .../layers/fused_moe/fused_batched_moe.py | 119 +-- .../layers/fused_moe/fused_moe.py | 744 +++++------------- .../fused_moe/gpt_oss_triton_kernels_moe.py | 95 +-- vllm/model_executor/layers/fused_moe/layer.py | 137 ++-- .../layers/fused_moe/modular_kernel.py | 149 ++-- .../layers/fused_moe/pplx_prepare_finalize.py | 10 +- .../layers/fused_moe/prepare_finalize.py | 4 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 52 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 48 +- .../layers/fused_moe/trtllm_moe.py | 25 +- vllm/model_executor/layers/fused_moe/utils.py | 4 + .../layers/quantization/awq_marlin.py | 8 +- .../layers/quantization/bitsandbytes.py | 8 +- .../compressed_tensors_moe.py | 431 +++++----- .../layers/quantization/experts_int8.py | 16 +- .../model_executor/layers/quantization/fp8.py | 171 ++-- .../layers/quantization/gguf.py | 7 +- .../layers/quantization/gptq_marlin.py | 8 +- .../layers/quantization/ipex_quant.py | 5 
+ .../layers/quantization/modelopt.py | 237 +++--- .../layers/quantization/moe_wna16.py | 33 +- .../layers/quantization/mxfp4.py | 37 +- .../layers/quantization/quark/quark_moe.py | 46 +- .../model_executor/layers/quantization/rtn.py | 34 +- .../quantization/utils/flashinfer_fp4_moe.py | 20 +- .../quantization/utils/flashinfer_utils.py | 38 +- .../layers/quantization/utils/fp8_utils.py | 1 + vllm/model_executor/models/bert_with_rope.py | 10 +- vllm/model_executor/models/deepseek.py | 22 +- vllm/model_executor/models/minicpm.py | 21 +- vllm/model_executor/models/qwen3_moe.py | 2 +- .../model_executor/warmup/deep_gemm_warmup.py | 11 +- 68 files changed, 2698 insertions(+), 2526 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 35c20ee41b9a9..726a2a371d109 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config, + nvfp4_moe_quant_config, +) from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types @@ -140,6 +144,12 @@ def bench_run( a_fp8_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + for _ in range(num_repeats): fused_experts( a, @@ -147,10 +157,7 @@ def bench_run( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def run_cutlass_moe_fp4( @@ -172,25 +179,27 @@ def bench_run( device: torch.device, num_repeats: int, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) for _ in range(num_repeats): with nvtx.annotate("cutlass_moe_fp4", color="green"): cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, - a2_gscale=a2_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_gs, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -211,26 +220,29 @@ def bench_run( e: int, device: torch.device, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): return cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_alphas, - a2_gscale=a2_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_triton_from_graph( @@ -246,16 +258,18 @@ def bench_run( with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): + quant_config = fp8_w8a8_moe_quant_config( + 
w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) return fused_experts( a, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def replay_graph(graph, num_repeats): diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index a6b42406b5cb0..14330ae6f03c5 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, @@ -96,6 +97,11 @@ def bench_run( a_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) for _ in range(num_repeats): fused_experts( a, @@ -103,10 +109,7 @@ def bench_run( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def run_cutlass_moe( @@ -125,6 +128,12 @@ def bench_run( per_act_token: bool, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + for _ in range(num_repeats): cutlass_moe_fp8( a, @@ -132,14 +141,11 @@ def bench_run( w2, topk_weights, topk_ids, - w1_scale, - w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, - per_act_token, - a1_scale=None, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -156,6 +162,12 @@ def bench_run( topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -165,14 +177,11 @@ def bench_run( w2_q, topk_weights, topk_ids, - w1_scale, - w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, - per_act_token, - a1_scale=None, + quant_config=quant_config, ) def run_triton_from_graph( @@ -185,6 +194,11 @@ def bench_run( w2_scale: torch.Tensor, a_scale: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -194,10 +208,7 @@ def bench_run( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def replay_graph(graph, num_repeats): diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 837b2b0c10447..d2beb28f70233 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -14,6 +14,10 @@ import ray import torch from ray.experimental.tqdm_ray import tqdm +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + _get_config_dtype_str, +) from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config @@ -134,43 +138,36 @@ 
def benchmark_config( def run(): from vllm.model_executor.layers.fused_moe import override_config + if use_fp8_w8a8: + quant_dtype = torch.float8_e4m3fn + elif use_int8_w8a16: + quant_dtype = torch.int8 + else: + quant_dtype = None + + quant_config = FusedMoEQuantConfig.make( + quant_dtype=quant_dtype, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) + with override_config(config): - if use_deep_gemm: - topk_weights, topk_ids, token_expert_indices = fused_topk( - x, input_gating, topk, False - ) - return fused_experts( - x, - w1, - w2, - topk_weights, - topk_ids, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - allow_deep_gemm=True, - ) - else: - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, renormalize=not use_deep_gemm + ) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + quant_config=quant_config, + allow_deep_gemm=use_deep_gemm, + ) # JIT compilation & warmup run() @@ -414,7 +411,7 @@ class BenchmarkWorker: use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which @@ -547,7 +544,7 @@ def save_configs( block_quant_shape: list[int], save_dir: str, ) -> None: - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index a10666b6ec9a7..b5fcc4cd70bf8 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx -from .mk_objects import (expert_info, make_fused_experts, +from .mk_objects import (TestMoEQuantConfig, expert_info, make_fused_experts, make_prepare_finalize, prepare_finalize_info) from .parallel_utils import ProcessGroupInfo @@ -40,7 +40,7 @@ class Config: E: int topks: Union[list[int], int] dtype: torch.dtype - quant_config: Optional[FusedMoEQuantConfig] + quant_config: Optional[TestMoEQuantConfig] prepare_finalize_type: mk.FusedMoEPrepareAndFinalize fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute @@ -52,7 +52,7 @@ class Config: def __post_init__(self): if self.quant_config is None: - self.quant_config = FusedMoEQuantConfig() + self.quant_config = TestMoEQuantConfig(None, False, False, None) def describe(self) -> str: s = "" @@ -275,21 +275,19 @@ class WeightTensors: or self.w1.dtype == torch.uint8 or self.w1.dtype == torch.int8) def to_current_device(self): - self.w1 = self.w1.to(device=torch.cuda.current_device()) - self.w2 = self.w2.to(device=torch.cuda.current_device()) + device = torch.cuda.current_device() + self.w1 = 
self.w1.to(device=device) + self.w2 = self.w2.to(device=device) - if self.is_quantized(): - assert self.w1_scale is not None - assert self.w2_scale is not None - self.w1_scale = self.w1_scale.to( - device=torch.cuda.current_device()) - self.w2_scale = self.w2_scale.to( - device=torch.cuda.current_device()) + if self.w1_scale is not None: + self.w1_scale = self.w1_scale.to(device=device) + if self.w2_scale is not None: + self.w2_scale = self.w2_scale.to(device=device) if self.w1_gs is not None: - assert self.w2_gs is not None - self.w1_gs = self.w1_gs.to(device=torch.cuda.current_device()) - self.w2_gs = self.w2_gs.to(device=torch.cuda.current_device()) + self.w1_gs = self.w1_gs.to(device=device) + if self.w2_gs is not None: + self.w2_gs = self.w2_gs.to(device=device) def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors": @@ -297,20 +295,12 @@ class WeightTensors: e = s + num_local_experts w1 = self.w1[s:e, :, :] w2 = self.w2[s:e, :, :] - - w1_scale, w2_scale = (None, None) - if self.is_quantized(): - assert self.w1_scale is not None - assert self.w2_scale is not None - w1_scale = self.w1_scale[s:e, :, :] - w2_scale = self.w2_scale[s:e, :, :] - - w1_gs = self.w1_gs - w2_gs = self.w2_gs - if w1_gs is not None: - assert w2_gs is not None - w1_gs = w1_gs[s:e] - w2_gs = w2_gs[s:e] + w1_scale = self.w1_scale[ + s:e, :, :] if self.w1_scale is not None else None + w2_scale = self.w2_scale[ + s:e, :, :] if self.w2_scale is not None else None + w1_gs = self.w1_gs[s:e] if self.w1_gs is not None else None + w2_gs = self.w2_gs[s:e] if self.w2_gs is not None else None return WeightTensors(w1, w2, w1_scale, w2_scale, w1_gs, w2_gs) @@ -323,7 +313,8 @@ class WeightTensors: in_dtype=config.dtype, quant_dtype=config.quant_dtype, block_shape=config.quant_block_shape, - per_act_token_quant=config.is_per_out_ch_quant, + per_out_ch_quant=config. 
+ is_per_act_token_quant, # or config.is_per_out_ch_quant ) return WeightTensors(w1=w1, w2=w2, @@ -342,8 +333,6 @@ class RankTensors: topk_ids: torch.Tensor expert_map: Optional[torch.Tensor] - quant_config: Optional[FusedMoEQuantConfig] - def describe(self): s = "" s += "== Rank Tensors: \n" @@ -426,7 +415,6 @@ class RankTensors: topk_weights=topk_weights, topk_ids=topk_ids, expert_map=expert_map, - quant_config=config.quant_config, ) @@ -522,10 +510,16 @@ def reference_moe_impl(config: Config, weights: WeightTensors, and config.supports_apply_weight_on_input()) +def _make_gscale(num_experts: int) -> torch.Tensor: + return torch.ones((num_experts, ), + device=torch.cuda.current_device(), + dtype=torch.float32) + + def make_modular_kernel( config: Config, vllm_config: VllmConfig, - weights: WeightTensors, + quant_config: FusedMoEQuantConfig, ) -> mk.FusedMoEModularKernel: def next_power_of_2(x): @@ -548,20 +542,20 @@ def make_modular_kernel( num_local_experts=config.num_local_experts, moe_parallel_config=moe_parallel_config, in_dtype=config.dtype, - quant_config=config.quant_config, max_num_tokens=next_power_of_2(config.M), ) # make modular kernel prepare_finalize = make_prepare_finalize(config.prepare_finalize_type, - config.all2all_backend(), moe) + config.all2all_backend(), moe, + quant_config) fused_experts = make_fused_experts( config.fused_experts_type, moe, + quant_config, prepare_finalize.num_dispatchers(), - weights.w1_gs, - weights.w2_gs, + config.N, ) modular_kernel = mk.FusedMoEModularKernel( @@ -583,12 +577,38 @@ def run_modular_kernel( # weights for rank rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts) - mk = make_modular_kernel(config, vllm_config, weights) + if config.quant_dtype == "nvfp4": + gscale = _make_gscale(config.num_local_experts) + else: + gscale = None + + quant_config = FusedMoEQuantConfig.make( + config.quant_dtype, + w1_scale=rank_weights.w1_scale, + w2_scale=rank_weights.w2_scale, + a1_scale=rank_tensors.hidden_states_scale, + g1_alphas=(1 / rank_weights.w1_gs) + if rank_weights.w1_gs is not None else None, + g2_alphas=(1 / rank_weights.w2_gs) + if rank_weights.w2_gs is not None else None, + a1_gscale=gscale, + a2_gscale=gscale, + block_shape=config.quant_block_shape, + per_act_token_quant=config.is_per_act_token_quant, + per_out_ch_quant=config.is_per_out_ch_quant, + ) + + mk = make_modular_kernel(config, vllm_config, quant_config) + + # impls might update the tensor in place + hidden_states = rank_tensors.hidden_states.clone() + + topk_ids = rank_tensors.topk_ids.to( + mk.prepare_finalize.topk_indices_dtype()) mk_kwargs = { "hidden_states": - rank_tensors.hidden_states.clone( - ), # impls might update the tensor in place + hidden_states, "w1": rank_weights.w1, "w2": @@ -596,15 +616,9 @@ def run_modular_kernel( "topk_weights": rank_tensors.topk_weights, "topk_ids": - rank_tensors.topk_ids.to(mk.prepare_finalize.topk_indices_dtype()), + topk_ids, "expert_map": rank_tensors.expert_map, - "w1_scale": - rank_weights.w1_scale, - "w2_scale": - rank_weights.w2_scale, - "a1_scale": - rank_tensors.hidden_states_scale, "global_num_experts": config.E, "apply_router_weight_on_input": diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 5dbfdfc153f9f..c1037b60bf383 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -10,7 +10,8 @@ import torch from tqdm import 
tqdm from vllm.config import VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG) from vllm.platforms import current_platform from .common import (Config, RankTensors, WeightTensors, reference_moe_impl, @@ -86,7 +87,7 @@ def make_feature_matrix(csv_file_path: str): quant_config_dict = config_dict['quant_config'] del config_dict['quant_config'] if quant_config_dict is None: - quant_config = FusedMoEQuantConfig(None) + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG quant_config_dict = asdict(quant_config) config_dict |= quant_config_dict diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index aecffae36ae5e..7947391d03483 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -32,6 +32,14 @@ from vllm.utils.deep_gemm import is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +@dataclass +class TestMoEQuantConfig: + quant_dtype: Union[torch.dtype, str, None] + per_out_ch_quant: bool + per_act_token_quant: bool + block_shape: Optional[list[int]] + + @dataclass class PrepareFinalizeInfo: activation_format: mk.FusedMoEActivationFormat @@ -66,7 +74,7 @@ common_float_types: list[Union[torch.dtype, str]] = [ torch.float8_e4m3fn, torch.bfloat16, torch.float16, torch.float32 ] common_float_and_int_types = common_float_types + [torch.int8] -nv_fp4_types = ["nvfp4"] +nvfp4_types = ["nvfp4"] fp8_types = [torch.float8_e4m3fn] @@ -219,7 +227,7 @@ if (has_flashinfer_cutlass_fused_moe() register_prepare_and_finalize( FlashInferCutlassMoEPrepareAndFinalize, standard_format, - nv_fp4_types, + nvfp4_types, blocked_quantization_support=True, backend=None, force_multigpu=True, @@ -229,7 +237,7 @@ if (has_flashinfer_cutlass_fused_moe() register_experts( FlashInferExperts, standard_format, - nv_fp4_types, + nvfp4_types, blocked_quantization_support=True, supports_chunking=True, # Note: this is a hack to get it to run for now @@ -306,39 +314,39 @@ if cutlass_fp4_supported(): register_experts( CutlassExpertsFp4, standard_format, - nv_fp4_types, + nvfp4_types, blocked_quantization_support=True, supports_chunking=True, supports_expert_map=False, ) -MK_QUANT_CONFIGS = [ +MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [ None, # per-channel / per-column weights and per-tensor activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=True, - per_act_token_quant=False, - block_shape=None), + TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=True, + per_act_token_quant=False, + block_shape=None), # per-channel / per-column weights and per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=True, - per_act_token_quant=True, - block_shape=None), + TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=True, + per_act_token_quant=True, + block_shape=None), # per-tensor weights and per-tensor activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - per_act_token_quant=False, - block_shape=None), + TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=None), # per-tensor weights and per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - 
per_act_token_quant=True, - block_shape=None), + TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=True, + block_shape=None), # block-quantized weights and 128 block per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - per_act_token_quant=False, - block_shape=[128, 128]), + TestMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=[128, 128]), # TODO (varun) : Should we test the following combinations ? # block-quantized weights and per-token activations # block-quantized weights and per-tensor activations @@ -346,33 +354,27 @@ MK_QUANT_CONFIGS = [ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): MK_QUANT_CONFIGS += [ - FusedMoEQuantConfig(quant_dtype="nvfp4", - per_out_ch_quant=False, - per_act_token_quant=False, - block_shape=None), + TestMoEQuantConfig(quant_dtype="nvfp4", + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=None), ] -def _make_gscale(num_experts: int) -> torch.Tensor: - return torch.ones((num_experts, ), - device=torch.cuda.current_device(), - dtype=torch.float32) - - def make_prepare_finalize( prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, backend: Optional[str], moe: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, ) -> mk.FusedMoEPrepareAndFinalize: if backend != "naive" and backend is not None: - prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize( + moe, quant_config) assert prepare_finalize is not None return prepare_finalize elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize: return FlashInferCutlassMoEPrepareAndFinalize( - use_dp=moe.moe_parallel_config.dp_size > 1, - a1_gscale=_make_gscale(moe.num_local_experts), - ) + use_dp=moe.moe_parallel_config.dp_size > 1) else: return MoEPrepareAndFinalizeNoEP() @@ -383,34 +385,39 @@ def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: return t[s:e] +def make_cutlass_strides( + e: int, + n: int, + k: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + return ab_strides1, ab_strides2, c_strides1, c_strides2 + + def make_fused_experts( fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, moe: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, num_dispatchers: int, - w1_gs: Optional[torch.Tensor], - w2_gs: Optional[torch.Tensor], + N: int, ) -> mk.FusedMoEPermuteExpertsUnpermute: - use_fp8 = moe.quant_dtype == torch.float8_e4m3fn batch_kwargs = { "max_num_tokens": moe.max_num_tokens, "num_dispatchers": num_dispatchers, } quant_kwargs = { - "use_fp8_w8a8": use_fp8, - "use_int8_w8a8": False, - "use_int8_w8a16": False, - "use_int4_w4a16": False, - "block_shape": moe.block_shape, - "per_act_token_quant": moe.per_act_token_quant, + "quant_config": quant_config, } deepgemm_kwargs = {"allow_deep_gemm": has_deep_gemm()} + torch.set_printoptions(threshold=0, edgeitems=0, linewidth=10000) + if fused_experts_type == BatchedDeepGemmExperts: - kwargs = batch_kwargs | { - "block_shape": moe.block_shape, - "per_act_token_quant": moe.per_act_token_quant, - } + kwargs = batch_kwargs | 
quant_kwargs print(f"Making BatchedDeepGemmExperts {kwargs} ...") experts = BatchedDeepGemmExperts(**kwargs) elif fused_experts_type == BatchedTritonExperts: @@ -422,8 +429,8 @@ def make_fused_experts( print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...") experts = BatchedTritonOrDeepGemmExperts(**kwargs) elif fused_experts_type == DeepGemmExperts: - print("Making DeepGemmExperts () ...") - experts = DeepGemmExperts() + print(f"Making DeepGemmExperts {quant_config} ...") + experts = DeepGemmExperts(quant_config) elif fused_experts_type == TritonExperts: kwargs = quant_kwargs print(f"Making TritonExperts {kwargs} ...") @@ -437,62 +444,50 @@ def make_fused_experts( print(f"Making NaiveBatchedExperts {kwargs} ...") experts = NaiveBatchedExperts(**kwargs) elif fused_experts_type == CutlassExpertsFp8: + strides = make_cutlass_strides(moe.num_experts, N, moe.hidden_dim) kwargs = { "out_dtype": moe.in_dtype, - "per_act_token_quant": moe.per_act_token_quant, - "per_out_ch_quant": moe.per_out_ch_quant, - "block_shape": moe.block_shape, - } + "ab_strides1": strides[0], + "ab_strides2": strides[1], + "c_strides1": strides[2], + "c_strides2": strides[3], + } | quant_kwargs print(f"Making CutlassExpertsFp8 {kwargs} ...") experts = CutlassExpertsFp8(**kwargs) elif fused_experts_type == CutlassBatchedExpertsFp8: + strides = make_cutlass_strides(moe.num_experts, N, moe.hidden_dim) kwargs = { "max_experts_per_worker": moe.num_local_experts, "num_dispatchers": num_dispatchers, "out_dtype": moe.in_dtype, - "per_act_token_quant": moe.per_act_token_quant, - "per_out_ch_quant": moe.per_out_ch_quant, - "block_shape": moe.block_shape, - } + "ab_strides1": strides[0], + "ab_strides2": strides[1], + "c_strides1": strides[2], + "c_strides2": strides[3], + } | quant_kwargs print(f"Making CutlassBatchedExpertsFp8 {kwargs} ...") experts = CutlassBatchedExpertsFp8(**kwargs) elif fused_experts_type == CutlassExpertsFp4: - assert w1_gs is not None and w2_gs is not None - num_experts = moe.num_local_experts - rank = moe.moe_parallel_config.dp_rank kwargs = { - "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), - "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), - "a1_gscale": _make_gscale(num_experts), - "a2_gscale": _make_gscale(num_experts), - "max_experts_per_worker": num_experts, - "out_dtype": moe.in_dtype, - "per_act_token_quant": moe.per_act_token_quant, - "per_out_ch_quant": moe.per_out_ch_quant, - "block_shape": moe.block_shape, + "max_experts_per_worker": moe.num_local_experts, "num_dispatchers": num_dispatchers, - } + "out_dtype": moe.in_dtype, + } | quant_kwargs print(f"Making CutlassExpertsFp4 {kwargs} ...") experts = CutlassExpertsFp4(**kwargs) elif fused_experts_type == FlashInferExperts: - assert w1_gs is not None and w2_gs is not None - num_experts = moe.num_local_experts - rank = moe.moe_parallel_config.dp_rank kwargs = { - "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), - "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), - "a1_gscale": _make_gscale(num_experts), - "a2_gscale": _make_gscale(num_experts), "out_dtype": moe.in_dtype, - "quant_dtype": "nvfp4", "ep_rank": moe.ep_rank, "ep_size": moe.ep_size, "tp_rank": moe.tp_rank, "tp_size": moe.tp_size, - } + } | quant_kwargs print(f"Making FlashInferExperts {kwargs} ...") experts = FlashInferExperts(**kwargs) else: raise RuntimeError(f"Unknown fused experts type: {fused_experts_type}") + torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80) + return experts diff --git a/tests/kernels/moe/test_batched_deepgemm.py
b/tests/kernels/moe/test_batched_deepgemm.py index 018d4c224f75e..afec97e8cffd0 100644 --- a/tests/kernels/moe/test_batched_deepgemm.py +++ b/tests/kernels/moe/test_batched_deepgemm.py @@ -6,6 +6,8 @@ import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts) +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.modular_kernel import ( @@ -56,13 +58,18 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, rank=0, ) + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_s, + w2_scale=w2_s, + per_act_token_quant=False, + block_shape=BLOCK_SIZE, + ) + # triton (reference) triton_experts = BatchedTritonExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, - use_fp8_w8a8=True, - per_act_token_quant=False, - block_shape=BLOCK_SIZE, + quant_config=quant_config, ) mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts) @@ -73,8 +80,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - w1_scale=w1_s, - w2_scale=w2_s, global_num_experts=E, ) @@ -82,8 +87,7 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, deepgemm_experts = BatchedDeepGemmExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, - block_shape=BLOCK_SIZE, - per_act_token_quant=False, + quant_config=quant_config, ) mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts) @@ -94,8 +98,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - w1_scale=w1_s, - w2_scale=w2_s, global_num_experts=E, ) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 00b2d780e66f5..7e79828937c77 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -140,7 +140,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, in_dtype=act_dtype, quant_dtype=quant_dtype, block_shape=block_shape, - per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_act_token_quant, ) out_shape = (num_experts, max_tokens_per_expert, N) @@ -250,7 +250,7 @@ def test_fused_moe_batched_experts( block_shape=block_shape, in_dtype=act_dtype, quant_dtype=quant_dtype, - per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_act_token_quant, ) if input_scales and quant_dtype is not None: diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index ecc57acc67963..da383e18c3721 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -4,7 +4,7 @@ import pytest import torch -from tests.kernels.moe.utils import make_test_weights +from tests.kernels.moe.utils import make_test_quant_config, make_test_weights from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, native_w8a8_block_matmul) from vllm.config import VllmConfig, set_current_vllm_config @@ -161,22 +161,17 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - (_, w1, w1_s, _), (_, w2, w2_s, - _) = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - 
block_shape=block_size) + w1, w2, quant_config = make_test_quant_config( + E, + N, + K, + dtype, + quant_dtype=torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size, + ) - m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=True, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - use_mxfp4_w4a4=False, - per_act_token_quant=False, - block_shape=block_size) + m_fused_moe = modular_triton_fused_moe(quant_config) topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False) @@ -186,37 +181,24 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, a, w1, w2, - w1_s, - w2_s, + quant_config.w1_scale, + quant_config.w2_scale, topk_weights, topk_ids, block_size, ) - out = fused_experts( - a, - w1, - w2, - topk_weights, - topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=block_size, - ) + out = fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config) - m_out = m_fused_moe( - a, - w1, - w2, - topk_weights, - topk_ids, - w1_scale=w1_s, - w2_scale=w2_s, - ) + m_out = m_fused_moe(a, w1, w2, topk_weights, topk_ids) - # 0.039 only needed for [40000-4608-7168-2-1-block_size852-dtype852-0] - tol = 0.035 if M < 40000 else 0.039 + # 0.039 only needed for M >= 8192 + tol = 0.035 if M < 8192 else 0.039 torch.testing.assert_close(out, ref_out, atol=tol, rtol=tol) torch.testing.assert_close(m_out, ref_out, atol=tol, rtol=tol) @@ -248,14 +230,15 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - (_, w1, w1_s, _), (_, w2, w2_s, - _) = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( + E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_out_ch_quant=False, + block_shape=block_size, + ) # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. 
I'm leaving the flag and diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 5e4a93963f8e8..041a13ca5585a 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -4,12 +4,12 @@ import pytest import torch -from tests.kernels.moe.utils import make_test_weights +from tests.kernels.moe.utils import make_test_quant_config from tests.kernels.quant_utils import (native_per_token_group_quant_int8, native_w8a8_block_matmul) from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform if current_platform.get_device_capability() < (7, 0): @@ -50,7 +50,7 @@ MNK_FACTORS = [ (2048, 128, 128), (2048, 1024, 7168), (2048, 4096, 512), - (2048, 4096, 7168), + (2048, 4096, 4096), ] E = [8, 24] @@ -117,31 +117,28 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False) - (_, w1, w1_s, _), (_, w2, w2_s, - _) = make_test_weights(E, - N, - K, - dtype, - torch.int8, - per_act_token_quant=False, - block_shape=block_size) + w1, w2, quant_config = make_test_quant_config( + E, + N, + K, + dtype, + quant_dtype=torch.int8, + per_act_token_quant=False, + block_shape=block_size, + ) # Set the context to avoid lots of warning spam. with set_current_vllm_config(vllm_config): - out = fused_moe( - a, - w1, - w2, - score, - topk, - renormalize=False, - use_int8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=block_size, - ) - ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, + out = fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config) + ref_out = torch_w8a8_block_int8_moe(a, w1, w2, quant_config.w1_scale, + quant_config.w2_scale, score, topk, block_size) # Check results diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index c84f66383b902..ca6be767dab39 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import dataclasses from math import prod from typing import Optional @@ -9,6 +10,8 @@ import torch from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG, fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp8, run_cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, @@ -154,7 +157,7 @@ def run_with_expert_maps(num_experts: int, num_local_experts: int, def slice_experts(): slice_params = [ "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1", - "c_strides2", "w1_scale", "w2_scale" + "c_strides2" ] full_tensors = { k: v @@ -162,6 +165,8 @@ def run_with_expert_maps(num_experts: int, num_local_experts: int, if k in slice_params and k in cutlass_moe_kwargs } + quant_config = cutlass_moe_kwargs["quant_config"] + for i in range(0, num_experts, num_local_experts): s, e = i, i + num_local_experts @@ -178,6 +183,12 @@ def 
run_with_expert_maps(num_experts: int, num_local_experts: int, for k, t in full_tensors.items(): cutlass_moe_kwargs[k] = t[s:e] + new_quant_config = copy.deepcopy(quant_config) + new_quant_config._w1.scale = quant_config.w1_scale[s:e] + new_quant_config._w2.scale = quant_config.w2_scale[s:e] + + cutlass_moe_kwargs["quant_config"] = new_quant_config + yield cutlass_moe_kwargs out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"]) @@ -191,6 +202,7 @@ def run_8_bit(moe_tensors: MOETensors8Bit, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, + per_out_ch: bool, num_local_experts: Optional[int] = None) -> torch.Tensor: assert not any([ t is None for t in [ @@ -199,20 +211,27 @@ def run_8_bit(moe_tensors: MOETensors8Bit, ] ]) + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=moe_tensors.w1_scale, + w2_scale=moe_tensors.w2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + # Set to moe_tensors.a_scale iff static scales + per tensor. + # This is not currently being tested. + a1_scale=None, + ) + kwargs = { 'a': moe_tensors.a, 'w1_q': moe_tensors.w1_q, # type: ignore[union-attr] 'w2_q': moe_tensors.w2_q, # type: ignore[union-attr] 'topk_weights': topk_weights, 'topk_ids': topk_ids, - 'w1_scale': moe_tensors.w1_scale, - 'w2_scale': moe_tensors.w2_scale, 'ab_strides1': moe_tensors.ab_strides1, 'ab_strides2': moe_tensors.ab_strides2, 'c_strides1': moe_tensors.c_strides1, 'c_strides2': moe_tensors.c_strides2, - 'per_act_token': per_act_token, - 'a1_scale': None #moe_tensors.a_scale + 'quant_config': quant_config, } num_experts = moe_tensors.w1.size(0) @@ -261,16 +280,23 @@ def test_cutlass_moe_8_bit_no_graph( # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. - triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, - topk_ids) + + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + triton_output = fused_experts(mt.a_d, + mt.w1_d, + mt.w2_d, + topk_weights, + topk_ids, + quant_config=quant_config) if ep_size is not None: assert e % ep_size == 0, "Cannot distribute experts evenly" number_local_experts = e // ep_size else: number_local_experts = None + cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token, - number_local_experts) + per_out_ch, number_local_experts) # Note 5.5 only needed for larger problem sizes, 5 works ok for # the rest. @@ -315,14 +341,19 @@ def test_cutlass_moe_8_bit_cuda_graph( # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. 
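# The convention this patch introduces, sketched once in one place: all
# quantization parameters travel inside a quant config object instead of
# loose use_fp8_w8a8/w1_scale/w2_scale/... keyword arguments. A minimal
# sketch (tensor shapes are illustrative only):
#
#     import torch
#     from vllm.model_executor.layers.fused_moe.config import (
#         FUSED_MOE_UNQUANTIZED_CONFIG, fp8_w8a8_moe_quant_config)
#
#     e = 8  # number of experts (illustrative)
#     # Unquantized reference path: a ready-made "no quantization" config.
#     ref_config = FUSED_MOE_UNQUANTIZED_CONFIG
#     # FP8 path: per-expert scales ride along inside the config.
#     fp8_config = fp8_w8a8_moe_quant_config(
#         w1_scale=torch.ones(e, dtype=torch.float32),
#         w2_scale=torch.ones(e, dtype=torch.float32),
#         a1_scale=None,  # no static activation scale in this sketch
#     )
#     # Either config is then passed as fused_experts(..., quant_config=...).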
- triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, - topk_ids) + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + triton_output = fused_experts(mt.a_d, + mt.w1_d, + mt.w2_d, + topk_weights, + topk_ids, + quant_config=quant_config) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): cutlass_output = run_8_bit(mt, topk_weights, topk_ids, - per_act_token) + per_act_token, per_out_ch) torch.cuda.synchronize() graph.replay() diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 6558cab6a9eff..ced5457d4f53b 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -15,6 +15,8 @@ from torch.distributed import ProcessGroup from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) @@ -71,9 +73,12 @@ def make_block_quant_fp8_weights( Return weights w1q, w2q, w1_scale, w2_scale """ (_, w1q, w1_scale, _), (_, w2q, w2_scale, - _) = make_test_weights(e, n, k, torch.bfloat16, + _) = make_test_weights(e, + n, + k, + torch.bfloat16, torch.float8_e4m3fn, - block_size) + block_shape=block_size) return w1q, w2q, w1_scale, w2_scale @@ -130,10 +135,11 @@ class TestTensors: config=config) -def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, - max_tokens_per_rank: int, dp_size: int, - hidden_size: int, q_dtype: Optional[torch.dtype], - test_config: TestConfig) -> FusedMoEModularKernel: +def make_ll_modular_kernel( + pg: ProcessGroup, pgi: ProcessGroupInfo, max_tokens_per_rank: int, + dp_size: int, hidden_size: int, q_dtype: Optional[torch.dtype], + test_config: TestConfig, + quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel: assert test_config.low_latency assert test_config.use_fp8_dispatch is not None @@ -154,17 +160,18 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, fused_experts = BatchedDeepGemmExperts( max_num_tokens=max_tokens_per_rank, num_dispatchers=pgi.world_size // dp_size, - block_shape=test_config.block_size, - per_act_token_quant=test_config.per_act_token_quant) + quant_config=quant_config, + ) mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) return mk -def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, - dp_size: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], - test_config: TestConfig) -> FusedMoEModularKernel: +def make_ht_modular_kernel( + pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + num_local_experts: int, q_dtype: Optional[torch.dtype], + test_config: TestConfig, + quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel: assert not test_config.low_latency assert test_config.use_fp8_dispatch is None @@ -178,15 +185,16 @@ def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, q_dtype=q_dtype, block_shape=test_config.block_size) - fused_experts = DeepGemmExperts() + fused_experts = DeepGemmExperts(quant_config) mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) return mk -def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, - num_local_experts: int, - test_tensors: TestTensors) -> FusedMoEModularKernel: +def 
make_modular_kernel( + pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + num_local_experts: int, test_tensors: TestTensors, + quant_config: FusedMoEQuantConfig) -> FusedMoEModularKernel: q_dtype = torch.float8_e4m3fn test_config = test_tensors.config @@ -204,10 +212,16 @@ def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, dp_size=dp_size, hidden_size=hidden_size, q_dtype=q_dtype, - test_config=test_config) + test_config=test_config, + quant_config=quant_config) else: - mk = make_ht_modular_kernel(pg, pgi, dp_size, num_local_experts, - q_dtype, test_config) + mk = make_ht_modular_kernel(pg, + pgi, + dp_size, + num_local_experts, + q_dtype, + test_config, + quant_config=quant_config) return mk @@ -233,17 +247,23 @@ def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + # Low-Latency kernels can't dispatch scales. + a1_scale=(None if test_config.low_latency else + test_tensors.rank_token_scales), + block_shape=test_config.block_size, + ) + # Make modular kernel mk: FusedMoEModularKernel = make_modular_kernel( pg=pg, pgi=pgi, dp_size=dp_size, num_local_experts=num_local_experts, - test_tensors=test_tensors) - - # Low-Latency kernels can't dispatch scales. - a1_scale = (None - if test_config.low_latency else test_tensors.rank_token_scales) + test_tensors=test_tensors, + quant_config=quant_config) out = mk.forward(hidden_states=test_tensors.rank_tokens, w1=w1, @@ -254,12 +274,6 @@ def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, activation="silu", global_num_experts=num_experts, expert_map=build_expert_map(), - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=None, - w2_zp=None, - a1_scale=a1_scale, - a2_scale=None, apply_router_weight_on_input=False) return out @@ -269,6 +283,13 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, a1_scale: torch.Tensor, block_shape: list[int]): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + block_shape=block_shape, + ) + return fused_experts( hidden_states=a, w1=w1, @@ -276,11 +297,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - block_shape=block_shape, + quant_config=quant_config, # Make sure this is set to False so we # don't end up comparing the same implementation. 
allow_deep_gemm=False) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 6a53af68cd53a..54d3a62b03fcc 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -15,6 +15,7 @@ from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import TritonExperts +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.modular_kernel import ( @@ -129,11 +130,9 @@ def make_modular_kernel( num_local_experts: int, q_dtype: Optional[torch.dtype], use_fp8_dispatch: bool, - per_act_token_quant: bool, + quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: - is_quantized = q_dtype is not None - ht_args: Optional[DeepEPHTArgs] = None ll_args: Optional[DeepEPLLArgs] = None @@ -159,24 +158,14 @@ def make_modular_kernel( num_dispatchers = pgi.world_size // dp_size if low_latency_mode: - assert not per_act_token_quant, "not supported in ll mode" + assert not quant_config.per_act_token_quant, "not supported in ll mode" fused_experts = BatchedTritonExperts( max_num_tokens=MAX_TOKENS_PER_RANK, num_dispatchers=num_dispatchers, - use_fp8_w8a8=is_quantized, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - per_act_token_quant=False, + quant_config=quant_config, ) else: - fused_experts = TritonExperts( - use_fp8_w8a8=is_quantized, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - per_act_token_quant=per_act_token_quant, - ) + fused_experts = TritonExperts(quant_config=quant_config) mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) @@ -217,11 +206,6 @@ def deep_ep_moe_impl( if is_quantized: q_dtype = torch.float8_e4m3fn - # Make modular kernel - mk: FusedMoEModularKernel = make_modular_kernel( - pg, pgi, low_latency_mode, hidden_size, dp_size, num_experts, - num_local_experts, q_dtype, use_fp8_dispatch, per_act_token_quant) - out_hidden_states = torch.empty_like(test_tensors.rank_tokens) total_num_tokens = test_tensors.rank_tokens.size(0) @@ -236,6 +220,19 @@ def deep_ep_moe_impl( rank_token_scales_chunk = rank_token_scales_chunk[ chunk_start:chunk_end] + quant_config = FusedMoEQuantConfig.make( + q_dtype, + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token_quant, + a1_scale=rank_token_scales_chunk, + ) + + # Make modular kernel + mk: FusedMoEModularKernel = make_modular_kernel( + pg, pgi, low_latency_mode, hidden_size, dp_size, num_experts, + num_local_experts, q_dtype, use_fp8_dispatch, quant_config) + out = mk.forward(hidden_states=rank_tokens_chunk, w1=w1, w2=w2, @@ -245,12 +242,6 @@ def deep_ep_moe_impl( activation="silu", global_num_experts=num_experts, expert_map=build_expert_map(), - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=None, - w2_zp=None, - a1_scale=rank_token_scales_chunk, - a2_scale=None, apply_router_weight_on_input=False) if not skip_result_store: @@ -407,7 +398,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn] @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("m,n,k", MNKs) @pytest.mark.parametrize("num_experts", [32]) @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @@ -416,7 +407,9 @@ DTYPES = [torch.bfloat16, 
torch.float8_e4m3fn] @requires_deep_ep def test_deep_ep_moe( dtype: torch.dtype, - mnk: tuple[int, int, int], + m: int, + n: int, + k: int, num_experts: int, topk: int, world_dp_size: tuple[int, int], @@ -424,7 +417,6 @@ def test_deep_ep_moe( ): low_latency_mode = False use_fp8_dispatch = False - m, n, k = mnk current_platform.seed_everything(7) world_size, dp_size = world_dp_size @@ -456,20 +448,24 @@ USE_FP8_DISPATCH = [True, False] @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("m,n,k", MNKs) @pytest.mark.parametrize("num_experts", [32]) @pytest.mark.parametrize("topk", [6]) @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) @multi_gpu_test(num_gpus=2) @requires_deep_ep -def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], - num_experts: int, topk: int, - world_dp_size: tuple[int, int], - use_fp8_dispatch: bool): - +def test_low_latency_deep_ep_moe( + dtype: torch.dtype, + m: int, + n: int, + k: int, + num_experts: int, + topk: int, + world_dp_size: tuple[int, int], + use_fp8_dispatch: bool, +): low_latency_mode = True - m, n, k = mnk if (low_latency_mode and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES): diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index 4472f34a6291a..d575b6d4ca62c 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -11,6 +11,8 @@ import math import pytest import torch +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config) # vLLM fused-expert reference (Triton fallback + DeepGEMM option) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( @@ -94,6 +96,13 @@ def run_single_case(m, n, k, topk, num_experts, block_size): topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_s, + w2_scale=w2_s, + a1_scale=a1_scale, + block_shape=block_size, + ) + # triton reference out_triton = fused_experts( hidden_states=tokens_bf16, @@ -102,11 +111,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size): topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - use_fp8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - a1_scale=a1_scale, - block_shape=block_size, + quant_config=quant_config, allow_deep_gemm=False, ) @@ -118,19 +123,14 @@ def run_single_case(m, n, k, topk, num_experts, block_size): topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - use_fp8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - a1_scale=a1_scale, - block_shape=block_size, + quant_config=quant_config, allow_deep_gemm=True, ) diff = calc_diff(out_deepgemm, out_triton) assert diff < 0.001, f"Diff exceeded 1%: {diff}" -# Note: W1 has shape (E, 2N, K), so N = 512 -# can trigger the deepgemm path. +# Note: N <= 512 will disable the deepgemm path due to performance issues. 
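# (The shapes listed here accordingly keep n above 512 so that these cases
# still take the DeepGEMM path.)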
MNKs = [ (1024, 768, 128), (1024, 768, 512), @@ -144,15 +144,15 @@ TOPKS = [2, 6] NUM_EXPERTS = [32] -@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize(("m", "n", "k"), MNKs) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS) @pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels") -def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): +def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_DEEP_GEMM", "1") + with monkeypatch.context() as mp: + mp.setenv("VLLM_USE_DEEP_GEMM", "1") _fused_moe_mod = importlib.import_module( "vllm.model_executor.layers.fused_moe.fused_moe") @@ -168,8 +168,6 @@ def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): monkeypatch.setattr(_fused_moe_mod, "deep_gemm_moe_fp8", _spy_deep_gemm_moe_fp8) - m, n, k = mnk - if topk > num_experts: pytest.skip(f"topk={topk} > num_experts={num_experts}") diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 52a3d2ca3b422..5564db3cda0e3 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -6,6 +6,8 @@ import pytest import torch from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( @@ -145,6 +147,14 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( custom_routing_function=Llama4MoE.custom_routing_function, scoring_func="softmax") + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=td.w13_weight_scale, + w2_scale=td.w2_weight_scale, + a1_scale=td.a1_scale, + a2_scale=td.a2_scale, + per_act_token_quant=False, + ) + output = fused_experts( td.hidden_states, td.w13_quantized, @@ -153,15 +163,10 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( topk_ids=topk_ids, inplace=False, activation="silu", - use_fp8_w8a8=True, - per_channel_quant=False, global_num_experts=e, expert_map=None, - w1_scale=td.w13_weight_scale, - w2_scale=td.w2_weight_scale, - a1_scale=td.a1_scale, - a2_scale=td.a2_scale, apply_router_weight_on_input=True, + quant_config=quant_config, ) flashinfer_output = apply_flashinfer_per_tensor_scale_fp8( @@ -210,6 +215,14 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( custom_routing_function=Llama4MoE.custom_routing_function, scoring_func="softmax") + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=td.w13_weight_scale, + w2_scale=td.w2_weight_scale, + a1_scale=td.a1_scale, + a2_scale=td.a2_scale, + per_act_token_quant=False, + ) + output = fused_experts( td.hidden_states, td.w13_quantized, @@ -218,15 +231,10 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( topk_ids=topk_ids, inplace=False, activation="silu", - use_fp8_w8a8=True, - per_channel_quant=False, global_num_experts=e, expert_map=None, - w1_scale=td.w13_weight_scale, - w2_scale=td.w2_weight_scale, - a1_scale=td.a1_scale, - a2_scale=td.a2_scale, apply_router_weight_on_input=True, + quant_config=quant_config, ) td.layer.dp_size = 1 diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 1c14df2b914aa..8bf096b798cb8 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -3,7 +3,7 @@ 
import pytest import torch -from tests.kernels.moe.utils import make_test_weights +from tests.kernels.moe.utils import make_test_quant_config from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dtype) @@ -41,7 +41,6 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [40, 64, 256]) -#@pytest.mark.parametrize("e", [128, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) @torch.inference_mode() @@ -56,16 +55,15 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, quant_blocksize = 16 - (_, w1_q, w1_blockscale, - w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights( - e, - n, - k, - in_dtype=dtype, - quant_dtype="nvfp4", - block_shape=None, # use quant_blocksize? - per_act_token_quant=False, - ) + w1_q, w2_q, quant_config = make_test_quant_config( + e, + n, + k, + in_dtype=dtype, + quant_dtype="nvfp4", + block_shape=None, + per_act_token_quant=False, + ) score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids, _ = fused_topk(a, @@ -73,35 +71,17 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, topk, renormalize=False) - a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) - a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) - assert is_valid_flashinfer_cutlass_fused_moe(a, w1_q, w2_q) - assert w1_gs is not None - assert w2_gs is not None - assert w1_blockscale is not None - assert w2_blockscale is not None - flashinfer_experts = FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), - FlashInferExperts( - a1_gscale=a1_gs, - g1_alphas=(1 / w1_gs), - a2_gscale=a2_gs, - g2_alphas=(1 / w2_gs), - out_dtype=dtype, - quant_dtype="nvfp4", - )) + FlashInferExperts(out_dtype=dtype, quant_config=quant_config), + ) flashinfer_output = flashinfer_experts( hidden_states=a, w1=w1_q, - w1_scale=w1_blockscale, w2=w2_q, - w2_scale=w2_blockscale, - a1_scale=a1_gs, - a2_scale=a2_gs, topk_weights=topk_weights, topk_ids=topk_ids, ) @@ -122,18 +102,18 @@ def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype) for idx in range(0, e): - w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], - w1_blockscale[idx], - w1_gs[idx], - dtype=dtype, - device=w1_q.device, - block_size=quant_blocksize) - w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], - w2_blockscale[idx], - w2_gs[idx], - dtype=dtype, - device=w2_q.device, - block_size=quant_blocksize) + w1_d[idx] = dequantize_nvfp4_to_dtype( + w1_q[idx], + quant_config.w1_scale[idx], (1 / quant_config.g1_alphas[idx]), + dtype=dtype, + device=w1_q.device, + block_size=quant_blocksize) + w2_d[idx] = dequantize_nvfp4_to_dtype( + w2_q[idx], + quant_config.w2_scale[idx], (1 / quant_config.g2_alphas[idx]), + dtype=dtype, + device=w2_q.device, + block_size=quant_blocksize) torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 54f2351bf6d9b..024993c7677dd 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -23,6 +23,7 @@ from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor from triton_kernels.tensor_details import layout from triton_kernels.testing import assert_close +from vllm.model_executor.layers.fused_moe.config import 
FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk @@ -293,6 +294,13 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): pc2, ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8) + quant_config = FusedMoEQuantConfig.make( + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + w1_precision=pc1, + w2_precision=pc2, + ) + out_triton_monolithic = triton_kernel_moe_forward( hidden_states=x_tri, w1=w1_tri, @@ -300,10 +308,7 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): gating_output=exp_data_tri, topk=topk, renormalize=True, - w1_bias=w1_bias_tri, - w2_bias=w2_bias_tri, - w1_precision=pc1, - w2_precision=pc2, + quant_config=quant_config, ) out_triton_monolithic = out_triton_monolithic[..., :K] @@ -336,6 +341,13 @@ def batched_moe( ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) + quant_config = FusedMoEQuantConfig.make( + w1_precision=w1_precision, + w2_precision=w2_precision, + w1_bias=w1_bias, + w2_bias=w2_bias, + ) + fused_experts = FusedMoEModularKernel( BatchedPrepareAndFinalize( max_num_tokens, @@ -344,19 +356,12 @@ def batched_moe( rank=0, ), BatchedOAITritonExperts( - None, max_num_tokens=max_num_tokens, num_dispatchers=1, - w1_precision=w1_precision, - w2_precision=w2_precision, + quant_config=quant_config, ), ) - extra_expert_args = { - "w1_bias": w1_bias, - "w2_bias": w2_bias, - } - topk_weight, topk_ids, _ = fused_topk(a, gating_output, topk, renormalize) return fused_experts( @@ -365,7 +370,6 @@ def batched_moe( w2, topk_weight, topk_ids, - extra_expert_args=extra_expert_args, ) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6112183be5475..19c4301bd23d5 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -12,7 +12,6 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import VllmConfig, current_platform, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @@ -22,7 +21,8 @@ from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, run_modular_kernel) from .modular_kernel_tools.mk_objects import ( MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, - MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, expert_info) + MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, TestMoEQuantConfig, + expert_info) from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo, parallel_launch_with_config) @@ -55,7 +55,7 @@ def rank_worker( pgi: ProcessGroupInfo, vllm_config: VllmConfig, cpu_group, - config: Config, + base_config: Config, weights: WeightTensors, verbose: bool, ): @@ -63,42 +63,44 @@ def rank_worker( # sanity check from vllm import envs - if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + if base_config.fused_moe_chunk_size is not None: + assert ( + base_config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) # get weights to this device weights.to_current_device() - Ms = config.Ms + Ms = base_config.Ms assert isinstance(Ms, list) - TOPKs = config.topks + TOPKs = base_config.topks assert isinstance(TOPKs, list) exceptions = [] count = 0 for 
m, topk in product(Ms, TOPKs): + # override m and topk + config = copy.deepcopy(base_config) + config.Ms = m + config.topks = topk + try: print(f"Running[{pgi.rank}]: m={m}, topk={topk} ...") count = count + 1 - # override m and topk - cfgx = copy.deepcopy(config) - cfgx.Ms = m - cfgx.topks = topk # inputs for rank - rank_tensors = RankTensors.make(cfgx, pgi) + rank_tensors = RankTensors.make(config, pgi) # modular kernel out - mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, + mk_out = run_modular_kernel(pgi, vllm_config, config, weights, rank_tensors) with set_current_vllm_config(vllm_config): - ref_out = reference_moe_impl(cfgx, weights, rank_tensors) + ref_out = reference_moe_impl(config, weights, rank_tensors) if config.quant_dtype == "nvfp4": - atol = 1e-1 - rtol = 1e-1 + atol = 1e-1 if config.K < 4096 else 2e-1 + rtol = 1e-1 if config.K < 4096 else 2e-1 else: atol = 3e-2 rtol = 3e-2 @@ -132,7 +134,7 @@ Ms = [32, 64] # hidden sizes, making this too large will cause fp4 tests to fail. # Also needs to be a multiple of 1024 for deep_gemm. Ks = [2048] -Ns = [2048] +Ns = [1024] TOPKs = [4, 1] Es = [32] DTYPEs = [torch.bfloat16] @@ -167,7 +169,7 @@ def is_nyi_config(config: Config) -> bool: @meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: Optional[FusedMoEQuantConfig], + quant_config: Optional[TestMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): @@ -208,7 +210,7 @@ def test_modular_kernel_combinations_multigpu( @pytest.mark.parametrize("world_size", [1]) def test_modular_kernel_combinations_singlegpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: Optional[FusedMoEQuantConfig], + quant_config: Optional[TestMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 850c486b95240..00835bec9a15c 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -15,11 +15,14 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock import vllm.model_executor.layers.fused_moe # noqa +from tests.kernels.moe.utils import fused_moe from tests.kernels.utils import opcheck, stack_and_dev, torch_moe from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed.parallel_state import init_distributed_environment from vllm.forward_context import set_forward_context -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG, int4_w4a16_moe_quant_config, + int8_w8a16_moe_quant_config) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe) from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( @@ -187,14 +190,9 @@ def test_fused_moe( # # Setup test functions # + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG - m_fused_moe_fn = modular_triton_fused_moe(use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - use_mxfp4_w4a4=False, - per_act_token_quant=False, - block_shape=None) + m_fused_moe_fn = modular_triton_fused_moe(quant_config) def m_fused_moe( a: torch.Tensor, @@ -340,6 +338,18 @@ def 
test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, else: e_map = None + if weight_bits == 4: + quant_config_builder = int4_w4a16_moe_quant_config + else: + assert weight_bits == 8 + quant_config_builder = int8_w8a16_moe_quant_config + + quant_config = quant_config_builder(w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size]) + with set_current_vllm_config(vllm_config): triton_output = fused_moe(a, w1_qweight, @@ -347,15 +357,9 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, score, topk, renormalize=False, - use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, global_num_experts=e, expert_map=e_map, - w1_scale=w1_scales, - w2_scale=w2_scales, - w1_zp=w1_qzeros if has_zp else None, - w2_zp=w2_qzeros if has_zp else None, - block_shape=[0, group_size]) + quant_config=quant_config) torch_output = torch_moe(a, w1_ref, w2_ref, diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index 30388ef9375d4..a48bfeb10b2e6 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -10,6 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, from tests.kernels.utils import torch_moe from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform @@ -56,7 +57,7 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, in_dtype=dtype, quant_dtype="nvfp4", block_shape=None, # use quant_blocksize? 
- per_act_token_quant=False, + per_out_ch_quant=False, ) score = torch.randn((m, e), device="cuda", dtype=dtype) @@ -73,18 +74,22 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, assert w1_blockscale is not None assert w2_blockscale is not None + quant_config = nvfp4_moe_quant_config( + g1_alphas=(1 / w1_gs), + g2_alphas=(1 / w2_gs), + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + ) + cutlass_output = cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, w1_fp4=w1_q, - w1_blockscale=w1_blockscale, - g1_alphas=(1 / w1_gs), - a2_gscale=a2_gs, w2_fp4=w2_q, - w2_blockscale=w2_blockscale, - g2_alphas=(1 / w2_gs), topk_weights=topk_weights, topk_ids=topk_ids, + quant_config=quant_config, m=m, n=n, k=k, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 9e78f4d6e4da0..59126cef6adbb 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -9,6 +9,8 @@ import torch from tests.kernels.utils import torch_experts from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.cutlass_moe import ( CutlassBatchedExpertsFp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk @@ -143,10 +145,16 @@ def pplx_cutlass_moe( device="cuda", dtype=torch.int64) - experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, - out_dtype, per_act_token, per_out_ch, - ab_strides1, ab_strides2, c_strides1, - c_strides2) + experts = CutlassBatchedExpertsFp8( + num_local_experts, num_dispatchers, out_dtype, ab_strides1, + ab_strides2, c_strides1, c_strides2, + fp8_w8a8_moe_quant_config( + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + w1_scale=chunk_by_rank(w1_scale, rank, world_size), + w2_scale=chunk_by_rank(w2_scale, rank, world_size), + a1_scale=chunk_by_rank(a1_scale, rank, world_size) + if per_act_token else a1_scale[rank])) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, @@ -167,10 +175,7 @@ def pplx_cutlass_moe( chunk_topk_ids, global_num_experts=num_experts, expert_map=None, #TODO - w1_scale=chunk_by_rank(w1_scale, rank, world_size), - w2_scale=chunk_by_rank(w2_scale, rank, world_size), - a1_scale=chunk_by_rank(a1_scale, rank, world_size) - if per_act_token else a1_scale[rank]) + ) torch.cuda.synchronize() diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 394f521140859..4ca4a1e79c57c 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -58,7 +58,7 @@ BATCHED_MOE_MNK_FACTORS = [ ] PPLX_COMBOS = [ - # TODO: figure out why this fails, seems to be test problem + # TODO(bnell): figure out why this fails, seems to be test problem #(1, 128, 128), (2, 128, 512), (3, 1024, 2048), @@ -360,18 +360,18 @@ def pplx_prepare_finalize( b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare( a_chunk, - a1_scale, - a2_scale, chunk_topk_weight, chunk_topk_ids, num_experts, None, False, - FusedMoEQuantConfig( + FusedMoEQuantConfig.make( quant_dtype, - per_act_token_quant, - False, - block_shape, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=False, + block_shape=block_shape, + a1_scale=a1_scale, + a2_scale=a2_scale, ), ) @@ -540,20 +540,6 @@ def pplx_moe( topk_ids = topk_ids.to(dtype=torch.uint32) - experts = BatchedTritonExperts( - 
max_num_tokens=max_num_tokens, - num_dispatchers=prepare_finalize.num_dispatchers(), - use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, - block_shape=block_shape, - per_act_token_quant=per_act_token_quant, - ) - - fused_experts = FusedMoEModularKernel( - prepare_finalize, - experts, - shared_experts, - ) - # Note: workers with the same dp_rank must use the exact same inputs. a_chunk = chunk_by_rank(a, rank, world_size) chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size) @@ -567,6 +553,28 @@ def pplx_moe( a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size) a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size) + quant_config = FusedMoEQuantConfig.make( + quant_dtype, + block_shape=block_shape, + per_act_token_quant=per_act_token_quant, + w1_scale=w1_scale_chunk, + w2_scale=w2_scale_chunk, + a1_scale=a1_scale_chunk, + a2_scale=a2_scale_chunk, + ) + + experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=quant_config, + ) + + fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + shared_experts, + ) + # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. I'm leaving the flag and # setup code in case we are able to revisit this later. @@ -585,10 +593,6 @@ def pplx_moe( w2_chunk, chunk_topk_weight, chunk_topk_ids, - w1_scale=w1_scale_chunk, - w2_scale=w2_scale_chunk, - a1_scale=a1_scale_chunk, - a2_scale=a2_scale_chunk, global_num_experts=num_experts) if use_cudagraphs: @@ -605,10 +609,6 @@ def pplx_moe( w2_chunk, chunk_topk_weight, chunk_topk_ids, - w1_scale=w1_scale_chunk, - w2_scale=w2_scale_chunk, - a1_scale=a1_scale_chunk, - a2_scale=a2_scale_chunk, global_num_experts=num_experts) torch.cuda.synchronize() @@ -820,7 +820,7 @@ def test_pplx_moe_slow( k, quant_dtype=quant_dtype, block_shape=block_shape, - per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_act_token_quant, ) parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e, @@ -897,7 +897,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, k, quant_dtype=quant_dtype, block_shape=block_shape, - per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_act_token_quant, ) args["w1"] = w1 args["w2"] = w2 diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index dfd0f35c8da3d..1c31464b30e7f 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -7,10 +7,12 @@ import itertools import pytest import torch +from tests.kernels.moe.utils import fused_moe from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config) from vllm.platforms import current_platform if current_platform.get_device_capability() < (9, 0): @@ -152,11 +154,12 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): score, topk, renormalize=False, - use_fp8_w8a8=True, # using fp8 - per_channel_quant=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=None, # Not using block quantization + quant_config=fp8_w8a8_moe_quant_config( + per_act_token_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ), ) # Check results diff --git 
a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 4b58a28eed125..7a0feb6a20795 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -9,7 +9,8 @@ from tests.kernels.quant_utils import per_block_cast_to_int8 from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_experts +from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) from vllm.model_executor.layers.fused_moe.modular_kernel import ( @@ -34,18 +35,22 @@ def triton_moe( per_act_token_quant=False, block_shape: Optional[list[int]] = None, ) -> torch.Tensor: + quant_config = FusedMoEQuantConfig.make( + quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) + return fused_experts(a, w1, w2, topk_weight, topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - per_channel_quant=per_act_token_quant, - use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, - block_shape=block_shape) + quant_config=quant_config) def batched_moe( @@ -64,6 +69,16 @@ def batched_moe( ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) + quant_config = FusedMoEQuantConfig.make( + quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) + fused_experts = FusedMoEModularKernel( BatchedPrepareAndFinalize(max_num_tokens, num_dispatchers=1, @@ -72,21 +87,11 @@ def batched_moe( BatchedTritonExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, - use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, + quant_config=quant_config, ), ) - return fused_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) + return fused_experts(a, w1, w2, topk_weight, topk_ids) def naive_batched_moe( @@ -105,6 +110,16 @@ def naive_batched_moe( ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) + quant_config = FusedMoEQuantConfig.make( + quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) + fused_experts = FusedMoEModularKernel( BatchedPrepareAndFinalize(max_num_tokens, num_dispatchers=1, @@ -113,21 +128,11 @@ def naive_batched_moe( NaiveBatchedExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, - use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, + quant_config=quant_config, ), ) - return fused_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) + return fused_experts(a, w1, w2, topk_weight, topk_ids) def chunk_scales(scales: Optional[torch.Tensor], start: int, @@ -216,7 +221,7 @@ def make_test_weight( in_dtype: torch.dtype = torch.bfloat16, quant_dtype: Union[torch.dtype, str, None] = None, block_shape: Optional[list[int]] = None, - per_act_token_quant: bool = False, + 
per_out_ch_quant: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 @@ -228,7 +233,7 @@ def make_test_weight( w_gs_l = [None] * e for idx in range(e): w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights( - w_16[idx], None, quant_dtype, per_act_token_quant, block_shape) + w_16[idx], None, quant_dtype, per_out_ch_quant, block_shape) w = torch.stack(w_l) w_s = torch.stack(w_s_l) @@ -258,16 +263,16 @@ def make_test_weights( in_dtype: torch.dtype = torch.bfloat16, quant_dtype: Union[torch.dtype, str, None] = None, block_shape: Optional[list[int]] = None, - per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, ) -> tuple[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]]: return ( make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, - per_act_token_quant), + per_out_ch_quant), make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, - per_act_token_quant), + per_out_ch_quant), ) @@ -285,6 +290,76 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) +def make_test_quant_config( + e: int, + n: int, + k: int, + in_dtype: torch.dtype, + quant_dtype: Union[torch.dtype, str, None] = None, + per_act_token_quant: bool = False, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]: + (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights( + e, + n, + k, + in_dtype, + quant_dtype, + per_out_ch_quant=per_act_token_quant, + block_shape=block_shape, + ) + + # Hacky/trivial scales for nvfp4. + a1_gscale: Optional[torch.Tensor] = None + a2_gscale: Optional[torch.Tensor] = None + if quant_dtype == "nvfp4": + a1_gscale = torch.ones((e, ), device="cuda", dtype=torch.float32) + a2_gscale = torch.ones((e, ), device="cuda", dtype=torch.float32) + a1_scale = a1_gscale + a2_scale = a2_gscale + else: + a1_scale = None + a2_scale = None + + return w1, w2, FusedMoEQuantConfig.make( + quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + w1_scale=w1_s, + w2_scale=w2_s, + a1_gscale=a1_gscale, + a2_gscale=a2_gscale, + a1_scale=a1_scale, + a2_scale=a2_scale, + # TODO: make sure this is handled properly + g1_alphas=(1 / w1_gs) if w1_gs is not None else None, + g2_alphas=(1 / w2_gs) if w2_gs is not None else None, + ) + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + score: torch.Tensor, + topk: int, + renormalize: bool = False, + quant_config: Optional[FusedMoEQuantConfig] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, +) -> torch.Tensor: + topk_weights, topk_ids, _ = fused_topk(hidden_states, score.float(), topk, + renormalize) + return fused_experts(hidden_states, + w1, + w2, + topk_weights, + topk_ids, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=quant_config) + + # CustomOp? 
class BaselineMM(torch.nn.Module): diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py index dc5fecbf4ccc8..f2271e6be5420 100644 --- a/tests/kernels/quantization/test_int8_kernel.py +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -8,7 +8,8 @@ import pytest import torch from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_experts +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.quantization.utils.int8_utils import ( per_token_quant_int8) from vllm.platforms import current_platform @@ -42,7 +43,8 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): return C.reshape(origin_C_shape).to(output_dtype) -def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): +def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk, topk_weight, + topk_ids): """This function performs fused moe with per-column int8 quantization using native torch.""" @@ -57,8 +59,6 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) # Calculate routing - score = torch.softmax(score, dim=-1, dtype=torch.float32) - topk_weight, topk_ids = torch.topk(score, topk) topk_weight = topk_weight.view(-1) topk_ids = topk_ids.view(-1) # Process each expert @@ -127,20 +127,27 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale score = torch.randn((M, E), dtype=dtype) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weights, topk_ids = torch.topk(score, topk) - ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) - out = fused_moe( + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, topk, + topk_weights, topk_ids) + + quant_config = FusedMoEQuantConfig.make( + torch.int8, + per_act_token_quant=True, + block_shape=None, + w1_scale=w1_s, + w2_scale=w2_s, + ) + + out = fused_experts( a, w1, w2, - score, - topk, - renormalize=False, - use_int8_w8a8=True, # Using int8-w8a8 - per_channel_quant=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=None, # Not using block quantization + topk_weights, + topk_ids, + quant_config=quant_config, ) # Check results diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 3007643d7a288..6730f051e3d71 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEActivationFormat, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize) +from vllm.model_executor.layers.fused_moe.utils import activation_without_mul from vllm.triton_utils import HAS_TRITON _config: Optional[dict[str, Any]] = None @@ -36,6 +37,7 @@ __all__ = [ "FusedMoEPermuteExpertsUnpermute", "FusedMoEActivationFormat", "FusedMoEPrepareAndFinalize", + "activation_without_mul", "override_config", "get_config", ] @@ -43,7 +45,6 @@ __all__ = [ if HAS_TRITON: # import to register the custom ops import vllm.model_executor.layers.fused_moe.fused_marlin_moe # noqa - import vllm.model_executor.layers.fused_moe.fused_moe # noqa 
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts) from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 @@ -56,13 +57,12 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.fused_moe import ( - TritonExperts, fused_experts, fused_moe, fused_topk, - get_config_file_name, grouped_topk) + TritonExperts, fused_experts, fused_topk, get_config_file_name, + grouped_topk) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) __all__ += [ - "fused_moe", "fused_topk", "fused_experts", "get_config_file_name", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 0ab6355f41565..e9dfb22bea27b 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -8,6 +8,8 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( + deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import _resize_cache @@ -212,27 +214,20 @@ def silu_mul_fp8_quant_deep_gemm_cuda( class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): - # The Deep Gemm kernels only support block size of 128 - DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128] - def __init__(self, - max_num_tokens: int, - num_dispatchers: int, - block_shape: list[int], - per_act_token_quant=False): + def __init__( + self, + max_num_tokens: int, + num_dispatchers: int, + quant_config: FusedMoEQuantConfig, + ): """ max_num_tokens: Maximum number of tokens from a DP Rank num_dispatchers: The number of DP dispatchers. - block_shape: Block quantization block shape. - per_act_token_quant: Per activation token quantization flag. + quant_config: Quantization configuration """ - super().__init__( - FusedMoEQuantConfig( - quant_dtype=torch.float8_e4m3fn, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - )) - assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE + super().__init__(quant_config) + assert self.block_shape == deep_gemm_block_shape() self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers @@ -290,12 +285,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -321,11 +311,11 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # for the M expectation of each batch, correctly setting this value # may lead to better performance. 
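+        # With this refactor the weight scales are read from the attached
+        # quant_config (via the self.w1_scale / self.w2_scale properties)
+        # rather than being passed to apply() as per-call arguments.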
expected_m = max_num_tokens - fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, w1_scale), + fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, self.w1_scale), workspace1, expert_num_tokens, expected_m) a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm_cuda( workspace1, expert_num_tokens) - fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, w2_scale), output, - expert_num_tokens, expected_m) + fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, self.w2_scale), + output, expert_num_tokens, expected_m) diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index 89d7412ee2236..8b9070f098898 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -8,55 +8,37 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( + deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts) class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): - def __init__(self, - max_num_tokens: int, - num_dispatchers: int, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - block_shape: Optional[list[int]] = None, - per_act_token_quant: bool = False, - allow_deep_gemm: bool = False): - assert not use_int8_w8a8, "NYI" - assert not use_int8_w8a16, "NYI" - assert not use_int4_w4a16, "NYI" - - super().__init__( - FusedMoEQuantConfig.make( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - block_shape=block_shape, - per_act_token_quant=per_act_token_quant, - )) + def __init__( + self, + max_num_tokens: int, + num_dispatchers: int, + quant_config: FusedMoEQuantConfig, + allow_deep_gemm: bool = False, + ): + super().__init__(quant_config) self.batched_triton_experts = BatchedTritonExperts( max_num_tokens=max_num_tokens, num_dispatchers=num_dispatchers, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - per_act_token_quant=self.per_act_token_quant, - block_shape=self.block_shape, + quant_config=self.quant_config, ) - self.allow_deep_gemm = (allow_deep_gemm and use_fp8_w8a8 - and self.block_shape - == BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE) + self.allow_deep_gemm = (allow_deep_gemm + and self.quant_config.use_fp8_w8a8 and + self.block_shape == deep_gemm_block_shape()) self.batched_deep_gemm_experts = BatchedDeepGemmExperts( max_num_tokens=max_num_tokens, num_dispatchers=num_dispatchers, - block_shape=self.block_shape, # type: ignore[arg-type] + quant_config=self.quant_config, ) if self.allow_deep_gemm else None assert (self.batched_deep_gemm_experts is not None @@ -143,12 +125,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, 
workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -158,7 +135,6 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): if self.allow_deep_gemm else self.batched_triton_experts) assert experts is not None experts.apply(output, hidden_states, w1, w2, topk_weights, topk_ids, - activation, global_num_experts, expert_map, w1_scale, - w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale, workspace13, - workspace2, expert_tokens_meta, + activation, global_num_experts, expert_map, a1q_scale, + workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 0b501cd87fb5d..742df3dbdc6af 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -1,103 +1,322 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import torch -from compressed_tensors.quantization import (QuantizationArgs, - QuantizationStrategy, - QuantizationType) import vllm.envs as envs from vllm.config import ParallelConfig from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.utils import cdiv +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape) +from vllm.utils import cdiv, has_triton_kernels from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +if TYPE_CHECKING and has_triton_kernels: + from triton_kernels.matmul_ogs import PrecisionConfig + logger = init_logger(__name__) -def _get_quant_config_quantization_args( - quant_config: Optional[QuantizationConfig], - prop_name: str, -) -> Optional[QuantizationArgs]: - if (quant_config is not None and hasattr(quant_config, 'target_scheme_map') - and "Linear" in quant_config.target_scheme_map and - "input_activations" in quant_config.target_scheme_map["Linear"]): - return quant_config.target_scheme_map["Linear"].get(prop_name) - else: - return None - - -def get_quant_config_input_quant( - quant_config: Optional[QuantizationConfig] -) -> Optional[QuantizationArgs]: - return _get_quant_config_quantization_args(quant_config, - "input_activations") - - -def get_quant_config_weight_quant( - quant_config: Optional[QuantizationConfig] -) -> Optional[QuantizationArgs]: - return _get_quant_config_quantization_args(quant_config, "weights") - - -def get_config_quant_dtype( - use_fp8_w8a8: bool, - use_int8_w8a8: bool, - use_int8_w8a16: bool, - use_int4_w4a16: bool, - use_mxfp4_w4a4: bool, -) -> Union[None, torch.dtype, str]: +def _get_config_dtype_str( + dtype: torch.dtype, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, +) -> Optional[str]: + """ + Return a string used to construct the filename that contains the + tuning info for a particular quantization scheme. See + try_get_optimal_moe_config in fused_moe.py. 
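+
+    For example (illustrative):
+    _get_config_dtype_str(torch.bfloat16, use_fp8_w8a8=True) returns
+    "fp8_w8a8", while an unquantized bfloat16 MoE returns None (no
+    dtype-specific tuning file).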
+ """ if use_fp8_w8a8: - return torch.float8_e4m3fn - elif use_int8_w8a8: - return torch.int8 + return "fp8_w8a8" + elif use_int8_w8a16: + return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w4a16" elif use_mxfp4_w4a4: - return "mxfp4" + return "mxfp4_w4a4" + elif dtype == torch.float: + # avoiding cases where kernel fails when float32 MoE + # use fp16/bfloat16 configs + return "float32" return None +def _quant_flags_to_group_shape( + quant_dtype: Union[torch.dtype, str, None], + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: Optional[list[int]], +) -> tuple[Optional[GroupShape], Optional[GroupShape]]: + """ + Convert MoE quantization flags into more generic GroupShapes. + """ + a_shape: Optional[GroupShape] + w_shape: Optional[GroupShape] + if block_shape is not None: + assert not per_act_token_quant + assert not per_out_ch_quant + # TODO(bnell): this is not quite right for activations since first + # dim should be 1. + a_shape = GroupShape(row=block_shape[0], col=block_shape[1]) + w_shape = GroupShape(row=block_shape[0], col=block_shape[1]) + else: + w_shape = None + a_shape = None if quant_dtype is None else GroupShape.PER_TENSOR + + if per_act_token_quant: + a_shape = GroupShape.PER_TOKEN + + if per_out_ch_quant: + w_shape = GroupShape.PER_TOKEN + + return a_shape, w_shape + + +@dataclass +class FusedMoEQuantDesc: + """ + A quantization descriptor for fused MoE ops. This class can describe + either activations or weights. + """ + + # The quantized type of this parameters. None means unquantized or + # already quantized. + # TODO (bnell): use scalar_type instead of Union. + dtype: Union[torch.dtype, str, None] = None + + # A field that describes the quantization group shape, from quant_utils.py. + # * (-1, -1) for per-tensor quantization + # * (1, -1) for per-row quantization + # * (-1, 1) for per-column quantization + # * (128, 128) for 128x128 deepseek style block quantization + # * (1, 128) for deepseek style activation quantization + # (i.e. per-token-per-group) + shape: Optional[GroupShape] = None + + # Quantization scales. + # TODO(bnell): maybe put PrecisionConfigs in subclass of QuantDesc? + scale: Union[torch.Tensor, "PrecisionConfig", None] = None + + # Quantization alphas or gscales, used for nvfp4 types. + # TODO(bnell): put some of these in subclasses + alpha_or_gscale: Optional[torch.Tensor] = None + + # Zero points for int4/int8 types + zp: Optional[torch.Tensor] = None + + # Biases for GPT triton MoE + bias: Optional[torch.Tensor] = None + + +# TODO(bnell): have subclasses for specific moe methods? +# e.g. for specific arguments bias, precision, etc. @dataclass class FusedMoEQuantConfig: - # The post quantization activation type. - # TODO (bnell): use scalar_type instead of Union. - quant_dtype: Union[torch.dtype, str, None] = None - per_act_token_quant: bool = False - per_out_ch_quant: bool = False - block_shape: Optional[list[int]] = None + """ + The FusedMoEQuantConfig contains all the quantization parameters for + a single FusedMoEMethodBase operation. It consists of four + FusedMoEQuantDescs, one for each activation and set of weights. - # TODO: add col major flag? - # add detailed quant info for input, intermediates, weights, etc? + Each FusedMoEMethodBase must implement a get_fused_moe_quant_config + method to construct a FusedMoEQuantConfig for use with that class. 
+
+    FusedMoEQuantConfigs are only used for modular kernels, fused_experts
+    (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and
+    triton_kernel_moe_forward. Other MoE methods can ignore the
+    FusedMoEQuantConfig (for now) and hardcode it to None.
+
+    There are currently some restrictions on what can be expressed:
+    - Most MoE ops only support similar quantization strategies for
+      each parameter, e.g. both weights must have the same GroupShape
+      and both activations must share the same GroupShape. One exception to
+      this is the cutlass moe which allows per channel quantization on the
+      outputs. Note: these restrictions are not always rigorously checked.
+    - Not all fused MoE functions support all the parameters, e.g. zero points,
+      global scales, alphas and biases are not universally supported.
+    - Fully general GroupShapes are not allowed. Activations only support
+      per token, per tensor or K-blocked.
+    - Weights are not required to have a GroupShape since they have already
+      been quantized.
+
+    Other notes:
+    - PrecisionConfigs are specific to GPT OSS Triton.
+    - As a follow up it would probably make sense to subclass FusedMoEQuantDesc
+      or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses
+      so that only the required quantization parameters are used/stored.
+    """
+
+    # TODO(bnell) make sure a1_scales/a2_scales don't interfere with chunking
+    _a1: FusedMoEQuantDesc
+    _a2: FusedMoEQuantDesc
+    _w1: FusedMoEQuantDesc
+    _w2: FusedMoEQuantDesc

     def __post_init__(self):
         assert (not self.per_act_token_quant
                 or self.block_shape is None), "illegal quantization"

+    #
+    # Convenience accessors for various properties.
+    #
+
+    @property
+    def quant_dtype(self) -> Union[torch.dtype, str, None]:
+        return self._a1.dtype
+
     @property
     def is_quantized(self) -> bool:
         return self.quant_dtype is not None

     @property
     def is_per_act_token(self) -> bool:
-        return self.per_act_token_quant
+        return self._a1.shape == GroupShape.PER_TOKEN
+
+    @property
+    def per_act_token_quant(self) -> bool:
+        return self._a1.shape == GroupShape.PER_TOKEN
+
+    @property
+    def per_out_ch_quant(self) -> bool:
+        return self._w1.shape == GroupShape.PER_TOKEN
+
+    @property
+    def is_per_tensor(self) -> bool:
+        return self._a1.shape == GroupShape.PER_TENSOR
+
+    @property
+    def block_shape(self) -> Optional[list[int]]:
+        if (self._a1.shape is not None
+                and self._a1.shape != GroupShape.PER_TENSOR
+                and self._a1.shape != GroupShape.PER_TOKEN):
+            return [self._a1.shape.row, self._a1.shape.col]
+        else:
+            return None

     @property
     def is_block_quantized(self) -> bool:
         return self.block_shape is not None

     @property
-    def is_per_tensor(self) -> bool:
-        return not self.per_act_token_quant and self.block_shape is None
+    def a1_scale(self) -> Optional[torch.Tensor]:
+        assert self._a1.scale is None or isinstance(self._a1.scale,
+                                                    torch.Tensor)
+        return self._a1.scale
+
+    @property
+    def a1_gscale(self) -> Optional[torch.Tensor]:
+        return self._a1.alpha_or_gscale
+
+    @property
+    def a2_scale(self) -> Optional[torch.Tensor]:
+        assert self._a2.scale is None or isinstance(self._a2.scale,
+                                                    torch.Tensor)
+        return self._a2.scale
+
+    @property
+    def a2_gscale(self) -> Optional[torch.Tensor]:
+        return self._a2.alpha_or_gscale
+
+    @property
+    def w1_scale(self) -> Optional[torch.Tensor]:
+        assert self._w1.scale is None or isinstance(self._w1.scale,
+                                                    torch.Tensor)
+        return self._w1.scale
+
+    @property
+    def w1_zp(self) -> Optional[torch.Tensor]:
+        return self._w1.zp
+
+    @property
+    def w1_bias(self) -> Optional[torch.Tensor]:
+        return
self._w1.bias + + @property + def w1_precision(self) -> Optional["PrecisionConfig"]: + assert self._w1.scale is None or isinstance(self._w1.scale, + PrecisionConfig) + return self._w1.scale + + @property + def g1_alphas(self) -> Optional[torch.Tensor]: + return self._w1.alpha_or_gscale + + @property + def w2_scale(self) -> Optional[torch.Tensor]: + assert self._w2.scale is None or isinstance(self._w2.scale, + torch.Tensor) + return self._w2.scale + + @property + def w2_zp(self) -> Optional[torch.Tensor]: + return self._w2.zp + + @property + def w2_bias(self) -> Optional[torch.Tensor]: + return self._w2.bias + + @property + def w2_precision(self) -> Optional["PrecisionConfig"]: + assert self._w2.scale is None or isinstance(self._w2.scale, + PrecisionConfig) + return self._w2.scale + + @property + def g2_alphas(self) -> Optional[torch.Tensor]: + return self._w2.alpha_or_gscale + + @property + def use_fp8_w8a8(self) -> bool: + return self.quant_dtype == torch.float8_e4m3fn + + @property + def use_int8_w8a8(self) -> bool: + return self.quant_dtype == torch.int8 + + @property + def use_int8_w8a16(self) -> bool: + return (self._a1.dtype is None and self._w1.dtype == torch.int8) + + @property + def use_int4_w4a16(self) -> bool: + return (self._a1.dtype is None and self._w1.dtype == "int4") + + @property + def use_mxfp4_w4a4(self) -> bool: + return self.quant_dtype == "mxfp4" + + @property + def use_nvfp4_w4a4(self) -> bool: + return self.quant_dtype == "nvfp4" + + def config_name(self, dtype: torch.dtype) -> Optional[str]: + """ + Return a string used to construct the filename that contains the + tuning info for a particular quantization scheme. See + try_get_optimal_moe_config in fused_moe.py. + """ + return _get_config_dtype_str( + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + use_mxfp4_w4a4=self.use_mxfp4_w4a4, + dtype=dtype, + ) def scale_shape( self, max_tokens: int, hidden_dim: int, ) -> Optional[tuple[int, int]]: + """ + Construct the proper activation scale shape for this + config. + """ if self.is_quantized: if self.is_block_quantized: assert self.block_shape is not None @@ -117,6 +336,10 @@ class FusedMoEQuantConfig: max_tokens: int, hidden_dim: int, ) -> Optional[tuple[int, int, int]]: + """ + Construct the proper activation batched scale shape for this + config, e.g. (num experts, *scale_shape). 
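+
+    For example (illustrative): with deepseek-style [128, 128] block
+    quantization the activation scales typically carry one value per
+    token per cdiv(hidden_dim, 128) group, so the batched shape is
+    roughly (num experts, max tokens, cdiv(hidden_dim, 128)).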
+ """ if self.is_quantized: scale_shape = self.scale_shape(max_tokens, hidden_dim) assert scale_shape is not None @@ -126,38 +349,218 @@ class FusedMoEQuantConfig: @staticmethod def make( - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, + quant_dtype: Union[torch.dtype, str, None] = None, per_act_token_quant: bool = False, per_out_ch_quant: bool = False, block_shape: Optional[list[int]] = None, + w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None, + w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + g1_alphas: Optional[torch.Tensor] = None, + g2_alphas: Optional[torch.Tensor] = None, + a1_gscale: Optional[torch.Tensor] = None, + a2_gscale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, ) -> "FusedMoEQuantConfig": - assert sum([ - int(flag) for flag in [ - use_fp8_w8a8, - use_int8_w8a8, - use_int8_w8a16, - use_int4_w4a16, - use_mxfp4_w4a4, - ] - ]) <= 1, "Quantization flags are mutually exclusive." + """ + General builder function for a FusedMoEQuantConfig. + - quant_dtype: Optional quantization type. None if activations are + unquantized or quantized prior to calling. Note: "nvfp4" and + "mxfp4" are the only valid string values for quant_dtype. + - per_act_token_quant: Activations have per token quantization. + - per_out_ch_quant: Outputs have per channel quantization. (only + for cutlass). + - block_shape: Optional block size for block-wise quantization. + Incompatible with per_act_token and per_out_ch quant. + - w1_scale: Optional scale to be used for w1. + - w2_scale: Optional scale to be used for w2. + - a1_scale: Optional scale to be used for a1. + - a2_scale: Optional scale to be used for a2. + - g1_alphas: Optional global quantization scales for w1 (for nvfp4). + - g2_alphas: Optional global quantization scales for w2 (for nvfp4). + - a1_gscale: Optional global quantization scales for a1 (for nvfp4). + - a2_gscale: Optional global quantization scales for a2 (for nvfp4). + - w1_bias: Optional biases for w1 (GPT OSS Triton). + - w2_bias: Optional biases for w1 (GPT OSS Triton). + - w1_zp: Optional w1 zero points for int4/int8 quantization. + - w2_zp: Optional w2 zero points for int4/int8 quantization. 
+ """ + assert (not isinstance(quant_dtype, str) or quant_dtype == "nvfp4" + or quant_dtype == "mxfp4") + a_shape, w_shape = _quant_flags_to_group_shape(quant_dtype, + per_act_token_quant, + per_out_ch_quant, + block_shape) + quant_config = FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale), + _a2=FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale), + _w1=FusedMoEQuantDesc(quant_dtype, w_shape, w1_scale, g1_alphas, + w1_zp, w1_bias), + _w2=FusedMoEQuantDesc(quant_dtype, w_shape, w2_scale, g2_alphas, + w2_zp, w2_bias), + ) + assert quant_config.per_act_token_quant == per_act_token_quant + assert quant_config.per_out_ch_quant == per_out_ch_quant + assert quant_config.block_shape == block_shape + return quant_config - quant_dtype = get_config_quant_dtype( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - ) - return FusedMoEQuantConfig( - quant_dtype, - per_act_token_quant, - per_out_ch_quant, - block_shape, - ) + +def fp8_w8a8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, + block_shape: Optional[list[int]] = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for fp8 activations and fp8 weights. + """ + return FusedMoEQuantConfig.make(torch.float8_e4m3fn, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, + block_shape=block_shape) + + +def int8_w8a8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + per_act_token_quant: bool = False, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for int8 activations and int8 weights. + """ + return FusedMoEQuantConfig.make( + torch.int8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=False, + block_shape=None, + ) + + +def mxfp4_w4a4_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for mxfp4 activations and mxfp4 weights. + """ + return FusedMoEQuantConfig.make( + "mxfp4", + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + per_act_token_quant=False, + per_out_ch_quant=False, + block_shape=block_shape, + ) + + +def nvfp4_moe_quant_config( + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for mxfp4 activations and nvp4 weights. 
+ """ + return FusedMoEQuantConfig.make( + "nvfp4", + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_gscale=a1_gscale, + a2_gscale=a2_gscale, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + per_act_token_quant=False, + per_out_ch_quant=False, + block_shape=None, + ) + + +def int4_w4a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + block_shape: Optional[list[int]] = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and int4 weights. + Note: Activations are pre-quantized. + """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(shape=group_shape), + _a2=FusedMoEQuantDesc(shape=group_shape), + _w1=FusedMoEQuantDesc("int4", group_shape, w1_scale, None, w1_zp), + _w2=FusedMoEQuantDesc("int4", group_shape, w2_scale, None, w2_zp), + ) + + +def int8_w8a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + block_shape: Optional[list[int]] = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and int8 weights. + Note: Activations are pre-quantized. + """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(shape=group_shape), + _a2=FusedMoEQuantDesc(shape=group_shape), + _w1=FusedMoEQuantDesc(torch.int8, group_shape, w1_scale, None, w1_zp), + _w2=FusedMoEQuantDesc(torch.int8, group_shape, w2_scale, None, w2_zp), + ) + + +def biased_moe_quant_config( + w1_bias: Optional[torch.Tensor], + w2_bias: Optional[torch.Tensor], +) -> FusedMoEQuantConfig: + """ + Construct a quant config for unquantized activations with biases. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc(bias=w1_bias), + _w2=FusedMoEQuantDesc(bias=w2_bias), + ) + + +# A FusedMoEQuantConfig constant for an unquantized MoE op. +FUSED_MOE_UNQUANTIZED_CONFIG: FusedMoEQuantConfig = FusedMoEQuantConfig.make() @dataclass @@ -315,8 +718,6 @@ class FusedMoEConfig: # The activation type. in_dtype: torch.dtype - quant_config: Optional[FusedMoEQuantConfig] = None - max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE has_bias: bool = False @@ -328,34 +729,6 @@ class FusedMoEConfig: assert self.max_num_tokens > 0 - @property - def quant_dtype(self) -> Union[torch.dtype, str, None]: - if self.quant_config is not None: - return self.quant_config.quant_dtype - else: - return None - - @property - def block_shape(self) -> Optional[list[int]]: - if self.quant_config is not None: - return self.quant_config.block_shape - else: - return None - - @property - def per_act_token_quant(self) -> bool: - if self.quant_config is not None: - return self.quant_config.per_act_token_quant - else: - return False - - @property - def per_out_ch_quant(self) -> bool: - if self.quant_config is not None: - return self.quant_config.per_out_ch_quant - else: - return False - @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -401,97 +774,6 @@ class FusedMoEConfig: """ Whether to use FlashInfer cutlass kernels for NVFP4 MoE. 
""" - return (self.quant_config is not None - and self.quant_config.quant_dtype == "nvfp4" - and envs.VLLM_USE_FLASHINFER_MOE_FP4 + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and has_flashinfer_cutlass_fused_moe() and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput") - - @staticmethod - def make( - num_experts: int, - experts_per_token: int, - hidden_dim: int, - num_local_experts: int, - moe_parallel_config: FusedMoEParallelConfig, - in_dtype: torch.dtype, - max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE, - quant_config: Optional[Union[FusedMoEQuantConfig, - QuantizationConfig]] = None, - has_bias: bool = False, - ) -> "FusedMoEConfig": - - _quant_config: Optional[FusedMoEQuantConfig] = None - - if quant_config is not None and isinstance(quant_config, - QuantizationConfig): - if hasattr(quant_config, 'weight_block_size'): - block_shape = quant_config.weight_block_size - else: - block_shape = None - per_act_token_quant = False - per_out_ch_quant = False - quant_dtype: Union[torch.dtype, str, None] = None - - input_quant = get_quant_config_input_quant(quant_config) - weight_quant = get_quant_config_weight_quant(quant_config) - - if input_quant is not None: - per_act_token_quant = (input_quant.strategy - == QuantizationStrategy.TOKEN - if input_quant is not None else False) - - if input_quant.num_bits == 8: - if input_quant.type == QuantizationType.FLOAT: - quant_dtype = torch.float8_e4m3fn - elif input_quant.type == QuantizationType.INT: - quant_dtype = torch.int8 - - from vllm.model_executor.layers.quantization.fp8 import Fp8Config - if quant_dtype is None and isinstance(quant_config, Fp8Config): - quant_dtype = torch.float8_e4m3fn - - from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Config) - if (quant_dtype is None and isinstance(quant_config, Mxfp4Config) - and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8): - quant_dtype = "mxfp8" - - from vllm.model_executor.layers.quantization.modelopt import ( - ModelOptNvFp4Config) - if quant_dtype is None and isinstance(quant_config, - ModelOptNvFp4Config): - quant_dtype = "nvfp4" - - if weight_quant is not None: - per_out_ch_quant = ( - weight_quant.strategy == QuantizationStrategy.CHANNEL) - - if quant_dtype is not None: - _quant_config = FusedMoEQuantConfig( - quant_dtype=quant_dtype, - per_act_token_quant=per_act_token_quant, - per_out_ch_quant=per_out_ch_quant, - block_shape=block_shape, - ) - else: - _quant_config = FusedMoEQuantConfig() - if moe_parallel_config.dp_size > 1: - logger.warning_once("MoE DP setup unable to determine " - "quantization scheme or unsupported " - "quantization type. 
This model will " - "not run with DP enabled.") - else: - _quant_config = quant_config - - return FusedMoEConfig( - num_experts=num_experts, - experts_per_token=experts_per_token, - hidden_dim=hidden_dim, - num_local_experts=num_local_experts, - moe_parallel_config=moe_parallel_config, - in_dtype=in_dtype, - quant_config=_quant_config, - max_num_tokens=max_num_tokens, - has_bias=has_bias, - ) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 95d23ec0346c1..957ffca0d1246 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -211,21 +211,14 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, out_dtype: Optional[torch.dtype], - per_act_token_quant: bool, - per_out_ch_quant: bool, ab_strides1: torch.Tensor, ab_strides2: torch.Tensor, c_strides1: torch.Tensor, c_strides2: torch.Tensor, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, ): - super().__init__( - FusedMoEQuantConfig( - quant_dtype=torch.float8_e4m3fn, - per_act_token_quant=per_act_token_quant, - per_out_ch_quant=per_out_ch_quant, - block_shape=block_shape, - )) + assert quant_config.use_fp8_w8a8 + super().__init__(quant_config) self.out_dtype = out_dtype self.ab_strides1 = ab_strides1 self.ab_strides2 = ab_strides2 @@ -247,19 +240,14 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): - assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" - assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert self.w2_zp is None, "w2_zp is not supported in CUTLASS MoE" expert_num_tokens = None if expert_tokens_meta is not None: @@ -273,9 +261,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): in_dtype = hidden_states.dtype run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, - global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, - self.c_strides2, workspace13, workspace2, expert_num_tokens, + global_num_experts, expert_map, self.w1_scale, self.w2_scale, + a1q_scale, self.a2_scale, self.ab_strides1, self.ab_strides2, + self.c_strides1, self.c_strides2, workspace13, workspace2, + expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, use_batched_format, topk_weights) @@ -286,23 +275,19 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): def __init__( self, out_dtype: Optional[torch.dtype], - per_act_token_quant: bool, - per_out_ch_quant: bool, ab_strides1: torch.Tensor, ab_strides2: torch.Tensor, c_strides1: torch.Tensor, c_strides2: torch.Tensor, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, ): super().__init__( out_dtype, - per_act_token_quant, - per_out_ch_quant, ab_strides1, ab_strides2, c_strides1, c_strides2, - block_shape, + quant_config, ) @property @@ -348,23 +333,19 @@ 
class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):         max_experts_per_worker: int,         num_dispatchers: int,         out_dtype: Optional[torch.dtype], -        per_act_token_quant: bool, -        per_out_ch_quant: bool,         ab_strides1: torch.Tensor,         ab_strides2: torch.Tensor,         c_strides1: torch.Tensor,         c_strides2: torch.Tensor, -        block_shape: Optional[list[int]] = None, +        quant_config: FusedMoEQuantConfig,     ):         super().__init__(             out_dtype, -            per_act_token_quant, -            per_out_ch_quant,             ab_strides1,             ab_strides2,             c_strides1,             c_strides2, -            block_shape, +            quant_config,         )         assert max_experts_per_worker > 0         self.max_experts_per_worker = max_experts_per_worker @@ -414,16 +395,12 @@ def cutlass_moe_fp8(     w2_q: torch.Tensor,     topk_weights: torch.Tensor,     topk_ids: torch.Tensor, -    w1_scale: torch.Tensor, -    w2_scale: torch.Tensor,     ab_strides1: torch.Tensor,     ab_strides2: torch.Tensor,     c_strides1: torch.Tensor,     c_strides2: torch.Tensor, -    per_act_token: Optional[bool] = None, +    quant_config: FusedMoEQuantConfig,     activation: str = "silu", -    a1_scale: Optional[torch.Tensor] = None, -    a2_scale: Optional[torch.Tensor] = None,     expert_map: Optional[torch.Tensor] = None,     apply_router_weight_on_input: bool = False,     global_num_experts: int = -1, @@ -475,10 +452,18 @@ def cutlass_moe_fp8(     Returns:     - torch.Tensor: The fp16 output tensor after applying the MoE layer.     """ -    if per_act_token is None: -        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( -            a2_scale.numel() != 1 if a2_scale is not None else False) -    per_out_ch = w1_scale.numel() != w1_q.size(0) +    assert quant_config is not None + +    if quant_config.a1_scale is not None: +        assert (quant_config.per_act_token_quant == +                (quant_config.a1_scale.numel() != 1)) +    if quant_config.a2_scale is not None: +        assert (quant_config.per_act_token_quant == +                (quant_config.a2_scale.numel() != 1)) + +    assert (quant_config.w1_scale is None +            or (quant_config.per_out_ch_quant == (quant_config.w1_scale.size(1) +                                                  == w1_q.size(1))))     num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(         0) @@ -487,12 +472,11 @@ def cutlass_moe_fp8(         MoEPrepareAndFinalizeNoEP(),         CutlassExpertsFp8(             out_dtype=a.dtype, -            per_act_token_quant=per_act_token, -            per_out_ch_quant=per_out_ch,             ab_strides1=ab_strides1,             ab_strides2=ab_strides2,             c_strides1=c_strides1,             c_strides2=c_strides2, +            quant_config=quant_config,         ),     ) @@ -502,14 +486,9 @@ def cutlass_moe_fp8(         w2_q,         topk_weights,         topk_ids, -        False, -        activation, -        num_experts, -        expert_map, -        w1_scale, -        w2_scale, -        a1_scale=a1_scale, -        a2_scale=a2_scale, +        activation=activation, +        global_num_experts=num_experts, +        expert_map=expert_map,         apply_router_weight_on_input=apply_router_weight_on_input,     ) @@ -542,7 +521,7 @@ def run_cutlass_moe_fp4( ) -> None:     """     MoE implementation for FP4 Inputs -     +     # Gemm 1     a: Input tensor: [m, k] (half/bfloat16)     a1_gscale: Activation scale per expert: [e] (float32)     w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]     w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)     (Note: `n` is the up projection output dim, `k` is the input dim in     full precision)     w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)     (Block size = 16 for NVFP4) -     +     # Gemm 2     a2_gscale: Activation scale per expert: [e]     w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]     w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)     w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3 -     +     topk_weights: [m, topk] dtype: float8     topk_ids: [m, topk] dtype: float8 -     +     m, n, k: Unquantized weight shapes, dtype: int     e: number of experts, dtype: int @@ -652,42 +631,21 @@ def run_cutlass_moe_fp4(     return +# Split into batched and non-batched class
CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - g1_alphas: torch.Tensor, - g2_alphas: torch.Tensor, - a1_gscale: torch.Tensor, - a2_gscale: torch.Tensor, max_experts_per_worker: int, out_dtype: torch.dtype, - per_act_token_quant: bool, - per_out_ch_quant: bool, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, use_batched_format: bool = False, ): - super().__init__( - # NVFP4 requires two levels of quantization, which involves - # computing some scaling factors dynamically. This makes it - # incompatible with the typical prepare -> MoE -> finalize - # pipeline. Move the quantization logic into the MoE body. - FusedMoEQuantConfig( - quant_dtype=None, # skip quantization in prepare/finalize - per_act_token_quant=per_act_token_quant, - per_out_ch_quant=per_out_ch_quant, - block_shape=block_shape, - )) + super().__init__(quant_config) self.max_experts_per_worker = max_experts_per_worker self.out_dtype = out_dtype self.use_batched_format = use_batched_format - # TODO(bnell): put this stuff into quant config? - self.g1_alphas = g1_alphas - self.g2_alphas = g2_alphas - self.a1_gscale = a1_gscale - self.a2_gscale = a2_gscale - @property def activation_formats( self @@ -746,12 +704,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], - a1q_scale: Optional[torch.Tensor], - a2_scale: torch.Tensor, + a1q_scale: Optional[torch.Tensor], # unused workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -765,11 +718,11 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): a=hidden_states, a1_gscale=self.a1_gscale, w1_fp4=w1, - w1_blockscale=w1_scale, + w1_blockscale=self.w1_scale, w1_alphas=self.g1_alphas, a2_gscale=self.a2_gscale, w2_fp4=w2, - w2_blockscale=w2_scale, + w2_blockscale=self.w2_scale, w2_alphas=self.g2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, @@ -788,14 +741,9 @@ def cutlass_moe_fp4( a: torch.Tensor, w1_fp4: torch.Tensor, w2_fp4: torch.Tensor, - w1_blockscale: torch.Tensor, - w2_blockscale: torch.Tensor, - g1_alphas: torch.Tensor, - g2_alphas: torch.Tensor, - a1_gscale: torch.Tensor, - a2_gscale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, m: int, n: int, k: int, @@ -805,17 +753,31 @@ def cutlass_moe_fp4( assert expert_map is None, ("Expert Parallelism / expert_map " "is currently not supported for " "ModelOptNvFp4FusedMoE's cutlass_moe_fp4.") + + # TODO(bnell): this feels a bit hacky + # NVFP4 requires two levels of quantization, which involves + # computing some scaling factors dynamically. This makes it + # incompatible with the typical prepare -> MoE -> finalize + # pipeline. Move the quantization logic into the MoE body. 
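+    # The rebuilt config below keeps all of the caller's scales but sets
+    # quant_dtype=None, so MoEPrepareAndFinalizeNoEP passes activations
+    # through unquantized and CutlassExpertsFp4 performs both levels of
+    # quantization itself.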
+    quant_config = FusedMoEQuantConfig.make( +        quant_dtype=None,  # skip quantization in prepare/finalize +        per_act_token_quant=quant_config.per_act_token_quant, +        per_out_ch_quant=quant_config.per_out_ch_quant, +        block_shape=quant_config.block_shape, +        g1_alphas=quant_config.g1_alphas, +        g2_alphas=quant_config.g2_alphas, +        a1_gscale=quant_config.a1_gscale, +        a2_gscale=quant_config.a2_gscale, +        w1_scale=quant_config.w1_scale, +        w2_scale=quant_config.w2_scale, +    ) +     fn = mk.FusedMoEModularKernel(         MoEPrepareAndFinalizeNoEP(),         CutlassExpertsFp4( -            g1_alphas, -            g2_alphas, -            a1_gscale, -            a2_gscale,             max_experts_per_worker=e,             out_dtype=a.dtype, -            per_act_token_quant=False, -            per_out_ch_quant=False, +            quant_config=quant_config,             use_batched_format=False,         ),     ) @@ -830,10 +792,6 @@ def cutlass_moe_fp4(         activation="silu",         global_num_experts=e,         expert_map=None, -        w1_scale=w1_blockscale, -        w2_scale=w2_blockscale, -        a1_scale=None, -        a2_scale=None,         apply_router_weight_on_input=apply_router_weight_on_input,     ) @@ -891,6 +849,7 @@ def _valid_cutlass_block_scaled_grouped_gemm(     return True +# TODO(bnell): would be nice to combine/integrate with regular cutlass_fp8. def run_cutlass_block_scaled_fused_experts(         a: torch.Tensor,         w1: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index c0bfda73eee0d..8830b95df7cf0 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools from typing import Optional import torch @@ -9,9 +8,11 @@ from tqdm import tqdm import vllm.envs as env import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.config import ( +    FusedMoEQuantConfig, fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( -    compute_aligned_M, deepgemm_moe_permute, deepgemm_unpermute_and_reduce) +    compute_aligned_M, deep_gemm_block_shape, deepgemm_moe_permute, +    deepgemm_unpermute_and_reduce) from vllm.model_executor.layers.fused_moe.prepare_finalize import (     MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -25,14 +26,6 @@ from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous logger = init_logger(__name__) -@functools.cache -def deep_gemm_block_shape() -> list[int]: -    # Lazy import to avoid CUDA initialization problems.
- import deep_gemm as dg - block = dg.get_m_alignment_for_contiguous_layout() - return [block, block] - - def _valid_deep_gemm_shape(M: int, N: int, K: int) -> bool: align = deep_gemm_block_shape()[0] return align <= M and N % align == 0 and K % align == 0 @@ -163,13 +156,12 @@ def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor, class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): - def __init__(self): - super().__init__( - FusedMoEQuantConfig( - quant_dtype=torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=deep_gemm_block_shape(), - )) + def __init__(self, quant_config: FusedMoEQuantConfig): + super().__init__(quant_config) + assert quant_config.block_shape == deep_gemm_block_shape() + assert quant_config.quant_dtype == torch.float8_e4m3fn + assert not quant_config.per_act_token_quant + assert not quant_config.per_out_ch_quant @property def activation_formats( @@ -221,21 +213,17 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): - assert self.block_shape is not None assert a1q_scale is not None - assert w1_scale is not None - assert w2_scale is not None + assert self.a2_scale is None + assert self.block_shape is not None + assert self.w1_scale is not None + assert self.w2_scale is not None a1q = hidden_states _, N, K = w1.size() @@ -270,7 +258,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): aq_out=a1q_perm) assert a1q.size(0) == M_sum - m_grouped_fp8_gemm_nt_contiguous((a1q, a1q_scale), (w1, w1_scale), + m_grouped_fp8_gemm_nt_contiguous((a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids) self.activation(activation, act_out, mm1_out.view(-1, N)) @@ -281,7 +269,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): column_major_scales=True, out_q=quant_out) - m_grouped_fp8_gemm_nt_contiguous((a2q, a2q_scale), (w2, w2_scale), + m_grouped_fp8_gemm_nt_contiguous((a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids) if apply_router_weight_on_input: @@ -348,9 +336,16 @@ def deep_gemm_moe_fp8( Returns: - torch.Tensor: The bfloat16 output tensor after applying the MoE layer. 
""" + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=deep_gemm_block_shape()) + fn = mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), - DeepGemmExperts(), + DeepGemmExperts(quant_config), ) return fn( hidden_states, @@ -358,13 +353,9 @@ def deep_gemm_moe_fp8( w2, topk_weights, topk_ids, - inplace, - activation, - global_num_experts, - expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 92cbb1742974c..5d6b9c87a6b76 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -183,8 +183,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare_async( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -204,7 +202,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # Quant and Dispatch a1q, a1q_scale = moe_kernel_quantize_input( a1, - a1_scale, + quant_config.a1_scale, quant_dtype=quant_config.quant_dtype, per_act_token_quant=quant_config.per_act_token_quant, block_shape=quant_config.block_shape, @@ -215,7 +213,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): else: a1q = a1 a1q_scale = None - a1_post_scale = a1_scale + a1_post_scale = quant_config.a1_scale return (lambda *args: None, self._do_dispatch(tokens=a1q, @@ -229,8 +227,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -238,9 +234,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - (_, receiver) = self.prepare_async(a1, a1_scale, a2_scale, - topk_weights, topk_ids, num_experts, - expert_map, + (_, receiver) = self.prepare_async(a1, topk_weights, topk_ids, + num_experts, expert_map, apply_router_weight_on_input, quant_config) return receiver() diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 61f8297f0f148..01df7770463d0 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -77,15 +77,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def _do_quant( self, x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - a1_scale: Optional[torch.Tensor], a1_dtype: torch.dtype, - quant_dtype: Union[torch.dtype, str, None], - per_act_token_quant: bool, - block_shape: Optional[list[int]], + quant_config: FusedMoEQuantConfig, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - block_k = block_shape[1] if block_shape is not None else None if self.use_fp8_dispatch: + block_k = quant_config.block_shape[ + 1] if quant_config.block_shape is not None else None if block_k == 
DEEPEP_QUANT_BLOCK_SIZE: # DeepEP kernels did the quantization for us. x, x_scales = x @@ -101,12 +99,12 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # TODO (varun): Optimization - Use a batched version of quant x = x.view((-1, hidden_dim)) - x, x_scales = moe_kernel_quantize_input(x, a1_scale, quant_dtype, - per_act_token_quant, - block_shape) + x, x_scales = moe_kernel_quantize_input( + x, quant_config.a1_scale, quant_config.quant_dtype, + quant_config.per_act_token_quant, quant_config.block_shape) x = x.view((num_experts, -1, hidden_dim)) - if quant_dtype is not None: + if quant_config.quant_dtype is not None: assert x_scales is not None x_scales = normalize_batched_scales_shape(x_scales, num_experts) @@ -118,8 +116,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare_async( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -139,9 +135,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): assert hidden_size % 128 == 0, \ "DeepEP kernels quantize the inputs in blocks of shape 128" - has_per_token_scales = a1_scale.numel( - ) != 1 if a1_scale is not None else ( - a2_scale.numel() != 1 if a2_scale is not None else False) + has_per_token_scales = quant_config.a1_scale.numel( + ) != 1 if quant_config.a1_scale is not None else ( + quant_config.a2_scale.numel() != 1 + if quant_config.a2_scale is not None else False) assert not has_per_token_scales, ( "low_latency kernels doesn't support dispatching per-token scales") @@ -163,20 +160,21 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return_recv_hook=True) self.handles[a2a_idx] = handle - return (hook, lambda: self._receiver(expert_x, expert_num_tokens, - a1_scale, a1.dtype, quant_config)) + return ( + hook, + lambda: self._receiver(expert_x, expert_num_tokens, quant_config. 
+ a1_scale, a1.dtype, quant_config)) def _receiver( self, expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], expert_num_tokens: torch.Tensor, - a1_scale, - a1_dtype, + a1_scale: Optional[torch.Tensor], + a1_dtype: torch.dtype, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - expert_x, expert_x_scale = self._do_quant( - expert_x, a1_scale, a1_dtype, quant_config.quant_dtype, - quant_config.per_act_token_quant, quant_config.block_shape) + expert_x, expert_x_scale = self._do_quant(expert_x, a1_dtype, + quant_config) expert_tokens_meta = mk.ExpertTokensMetadata( expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None) @@ -186,8 +184,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -195,8 +191,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: - hook, receiver = self.prepare_async(a1, a1_scale, a2_scale, - topk_weights, topk_ids, + hook, receiver = self.prepare_async(a1, topk_weights, topk_ids, num_experts, expert_map, apply_router_weight_on_input, quant_config) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index feab3f74cac53..6eeec18a6ec87 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union +from typing import Optional import torch @@ -44,33 +44,20 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - g1_alphas: torch.Tensor, - g2_alphas: torch.Tensor, - a1_gscale: torch.Tensor, - a2_gscale: torch.Tensor, out_dtype: torch.dtype, - quant_dtype: Union[torch.dtype, str, None], + quant_config: FusedMoEQuantConfig, ep_rank: int = 0, ep_size: int = 1, tp_rank: int = 0, tp_size: int = 1, ): - super().__init__( - FusedMoEQuantConfig( - quant_dtype=quant_dtype, - per_act_token_quant=False, - block_shape=None, - )) - assert quant_dtype in ("nvfp4", torch.float8_e4m3fn), ( + super().__init__(quant_config) + assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn), ( "Only nvfp4,fp8 quantization are currently supported.") self.ep_rank = ep_rank self.ep_size = ep_size self.tp_rank = tp_rank self.tp_size = tp_size - self.g1_alphas = g1_alphas - self.g2_alphas = g2_alphas - self.a1_gscale = a1_gscale - self.a2_gscale = a2_gscale self.out_dtype = out_dtype @property @@ -141,12 +128,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], # Not used workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -162,17 +144,17 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): fc2_expert_weights = w2 else: # Ensure w1_scale and w2_scale are not None before calling view - assert w1_scale is not None and 
w2_scale is not None, ( + assert self.w1_scale is not None and self.w2_scale is not None, ( "w1_scale and w2_scale must not " "be None for FlashInferExperts") # Flashinfer CUTLASS kernel takes scalar global scales, # min because inv_scale. quant_scales = [ self.a1_gscale, - w1_scale.view(torch.int32), + self.w1_scale.view(torch.int32), self.g1_alphas, self.a2_gscale, - w2_scale.view(torch.int32), + self.w2_scale.view(torch.int32), self.g2_alphas, ] # FlashInfer API requires weight to be long for nvfp4 @@ -202,12 +184,7 @@ def flashinfer_cutlass_moe_fp4( w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - g1_alphas: torch.Tensor, - g2_alphas: torch.Tensor, - a1_gscale: torch.Tensor, - a2_gscale: torch.Tensor, + quant_config: FusedMoEQuantConfig, inplace: bool = False, activation: str = "silu", global_num_experts: int = -1, @@ -216,15 +193,10 @@ def flashinfer_cutlass_moe_fp4( ) -> torch.Tensor: fused_experts = mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize(use_dp=False, - a1_gscale=a1_gscale), + FlashInferCutlassMoEPrepareAndFinalize(use_dp=False), FlashInferExperts( - g1_alphas=g1_alphas, - g2_alphas=g2_alphas, - a1_gscale=a1_gscale, - a2_gscale=a2_gscale, out_dtype=hidden_states.dtype, - quant_dtype="nvfp4", + quant_config=quant_config, )) return fused_experts( @@ -237,7 +209,5 @@ def flashinfer_cutlass_moe_fp4( activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 157cb36d4ffd3..8c7eff59f3cd1 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -22,13 +22,11 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__( self, use_dp: bool, - a1_gscale: Optional[torch.Tensor], num_dispatchers: int = 1, ): super().__init__() self.num_dispatchers_ = num_dispatchers self.use_dp = use_dp - self.a1_gscale = a1_gscale self.local_tokens = None @property @@ -47,14 +45,11 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], # Not used - a2_scale: Optional[torch.Tensor], # Not used topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - # TODO(bnell): use quant_config + scales instead of ctor args quant_config: FusedMoEQuantConfig, ) -> mk.PrepareResultType: @@ -67,7 +62,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1q, a1q_scale = moe_kernel_quantize_input( a1, - self.a1_gscale, + quant_config.a1_gscale, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py new file mode 100644 index 0000000000000..e358143fac7c7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import List # noqa: UP035 +from typing 
import Optional + +import torch + +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + calculate_tile_tokens_dim) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.utils import direct_register_custom_op + + +def flashinfer_fused_moe_blockscale_fp8( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: List[int], #noqa: UP006 + routed_scaling: float = 1.0) -> torch.Tensor: + from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe + assert top_k <= global_num_experts + assert top_k <= 8 + assert topk_group <= 4 + assert global_num_experts > num_expert_group + assert global_num_experts % num_expert_group == 0 + assert global_num_experts % 4 == 0 + assert top_k < (topk_group * global_num_experts / num_expert_group) + assert block_shape == [128, 128] + + a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1]) + # NOTE: scales of hidden states have to be transposed! + a_sf_t = a_sf.t().contiguous() + return flashinfer_trtllm_fp8_block_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=a_q, + hidden_states_scale=a_sf_t, + gemm1_weights=w13_weight, + gemm1_weights_scale=w13_weight_scale_inv, + gemm2_weights=w2_weight, + gemm2_weights_scale=w2_weight_scale_inv, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling, + tile_tokens_dim=calculate_tile_tokens_dim(x.shape[0], top_k, + global_num_experts), + routing_method_type=2, # DeepSeek-styled routing method + use_shuffled_weight=False, + ) + + +def flashinfer_fused_moe_blockscale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: list[int], + routed_scaling: float = 1.0) -> torch.Tensor: + return torch.empty_like(x) + + +# TODO(bnell): Does this really need to be a torch.op? 
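+# Registration makes the op callable through the torch dispatcher. A hedged
+# usage sketch (assuming direct_register_custom_op's default "vllm" op
+# namespace; all tensor arguments are placeholders):
+#
+#   out = torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
+#       routing_logits, routing_bias, x, w13_weight, w13_weight_scale_inv,
+#       w2_weight, w2_weight_scale_inv, global_num_experts, top_k,
+#       num_expert_group, topk_group, intermediate_size, expert_offset,
+#       local_num_experts, [128, 128])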
+direct_register_custom_op( + op_name="flashinfer_fused_moe_blockscale_fp8", + op_func=flashinfer_fused_moe_blockscale_fp8, + mutates_args=[], + fake_impl=flashinfer_fused_moe_blockscale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order, ), +) + + +def flashinfer_fused_moe_per_tensor_scale_fp8( + routing_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + hidden_states: torch.Tensor, + input_scale: torch.Tensor, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + output2_scales_scalar: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: Optional[int], + topk_group: Optional[int], + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0) -> torch.Tensor: + num_expert_group = num_expert_group if num_expert_group is not None else 0 + topk_group = topk_group if topk_group is not None else 0 + + quant_hidden_states, _ = moe_kernel_quantize_input( + hidden_states, + input_scale, + quant_dtype=torch.float8_e4m3fn, + per_act_token_quant=False) + + from vllm.utils.flashinfer import ( + flashinfer_trtllm_fp8_per_tensor_scale_moe) + return flashinfer_trtllm_fp8_per_tensor_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=quant_hidden_states, + gemm1_weights=gemm1_weights, + output1_scales_scalar=output1_scales_scalar, + output1_scales_gate_scalar=output1_scales_gate_scalar, + gemm2_weights=gemm2_weights, + output2_scales_scalar=output2_scales_scalar, + num_experts=num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=local_expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling_factor, + use_routing_scales_on_input=use_routing_scales_on_input, + tile_tokens_dim=calculate_tile_tokens_dim(hidden_states.shape[0], + top_k, num_experts), + routing_method_type=routing_method_type) + + +def flashinfer_fused_moe_per_tensor_scale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + hidden_states: torch.Tensor, + input_scale: torch.Tensor, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + output2_scales_scalar: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: Optional[int], + topk_group: Optional[int], + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +# TODO(bnell): Does this really need to be a torch.op? 
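+# As above, the _fake variant exists so torch.compile can infer the output's
+# shape and dtype (a tensor shaped like hidden_states) without launching the
+# FlashInfer kernel.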
+direct_register_custom_op( + op_name="flashinfer_fused_moe_per_tensor_scale_fp8", + op_func=flashinfer_fused_moe_per_tensor_scale_fp8, + mutates_args=["hidden_states"], + fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order, ), +) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 88063668e9188..fe6ac458a9593 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -8,7 +8,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.fused_moe import ( - get_config_dtype_str, try_get_optimal_moe_config) + try_get_optimal_moe_config) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNaiveBatched) from vllm.model_executor.layers.fused_moe.utils import ( @@ -498,8 +498,6 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -545,14 +543,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): dtype=torch.float32, device=a1.device) else: - assert a1_scale is None + assert quant_config.a1_scale is None b_a1_scale = None first_expert = num_local_experts * self.rank last_expert = first_expert + num_local_experts - a1_scale = normalize_scales_shape(a1_scale) - a2_scale = normalize_scales_shape(a2_scale) + a1_scale = normalize_scales_shape(quant_config.a1_scale) for expert_id in range(first_expert, last_expert): topks = torch.any(topk_ids == expert_id, dim=1).flatten() @@ -623,28 +620,13 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): self, max_num_tokens: int, num_dispatchers: int, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - block_shape: Optional[list[int]] = None, - per_act_token_quant: bool = False, + quant_config: FusedMoEQuantConfig, ): - super().__init__( - FusedMoEQuantConfig.make( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - )) - assert not use_int8_w8a8, "NYI" - assert not use_int8_w8a16, "NYI" - assert not use_int4_w4a16, "NYI" - assert not use_mxfp4_w4a4, "NYI" + super().__init__(quant_config) + assert not self.quant_config.use_int8_w8a8, "NYI" + assert not self.quant_config.use_int8_w8a16, "NYI" + assert not self.quant_config.use_int4_w4a16, "NYI" + assert not self.quant_config.use_mxfp4_w4a4, "NYI" self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers @@ -705,12 +687,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ 
-740,10 +717,10 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): tmp = _resize_cache(workspace2, (num, N)) if self.quant_config.is_quantized: - assert a1q_scale is not None and w1_scale is not None + assert a1q_scale is not None and self.w1_scale is not None input = self.dequant(hidden_states[expert, :, :], a1q_scale[expert]) - w1_dq = self.dequant(w1[expert], w1_scale[expert]) + w1_dq = self.dequant(w1[expert], self.w1_scale[expert]) input = input[:num] @ w1_dq.transpose(0, 1) else: input = hidden_states[expert, :num, :] @ w1[expert].transpose( @@ -752,8 +729,8 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): self.activation(activation, tmp, input.to(tmp.dtype)) if self.quant_config.is_quantized: - assert w2_scale is not None - w2_dq = self.dequant(w2[expert], w2_scale[expert]) + assert self.w2_scale is not None + w2_dq = self.dequant(w2[expert], self.w2_scale[expert]) else: w2_dq = w2[expert] @@ -840,35 +817,15 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): self, max_num_tokens: int, num_dispatchers: int, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, ): - super().__init__( - FusedMoEQuantConfig.make( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - )) - assert not use_int8_w8a8, "NYI" - assert not use_int8_w8a16, "NYI" - assert not use_int4_w4a16, "NYI" - assert not use_mxfp4_w4a4, "NYI" + super().__init__(quant_config) + assert not self.quant_config.use_int8_w8a8, "NYI" + assert not self.quant_config.use_int8_w8a16, "NYI" + assert not self.quant_config.use_int4_w4a16, "NYI" + assert not self.quant_config.use_mxfp4_w4a4, "NYI" assert max_num_tokens > 0 assert num_dispatchers > 0 - self.use_fp8_w8a8 = use_fp8_w8a8 - self.use_int8_w8a8 = use_int8_w8a8 - self.use_int4_w4a16 = use_int4_w4a16 - self.use_int8_w8a16 = use_int8_w8a16 - self.use_mxfp4_w4a4 = use_mxfp4_w4a4 self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers @@ -921,19 +878,14 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): # Check constraints. 
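+        # (int4 weights pack two 4-bit values per byte, so w1's last
+        # dimension is half the activation hidden size.)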
- if self.use_int4_w4a16: + if self.quant_config.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( "Hidden size mismatch") else: @@ -958,11 +910,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): assert w1.size(0) == E assert w2.size(0) == E - config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - use_mxfp4_w4a4=self.use_mxfp4_w4a4, - dtype=hidden_states.dtype) + config_dtype = self.quant_config.config_name(hidden_states.dtype) config = try_get_optimal_moe_config( w1.size(), @@ -992,7 +940,8 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2)) - if self.use_fp8_w8a8: + # TODO(bnell): should this be done for any quantized type? + if self.quant_config.use_fp8_w8a8: intermediate_cache1.fill_(0) a1q_scale = normalize_batched_scales_shape(a1q_scale, E) @@ -1005,11 +954,11 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_num_tokens=expert_num_tokens, compute_type=compute_type, A_scale=a1q_scale, - B_scale=w1_scale, - B_zp=w1_zp, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, + B_scale=self.w1_scale, + B_zp=self.w1_zp, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, config=config, per_act_token_quant=self.per_act_token_quant, block_shape=self.block_shape) @@ -1021,7 +970,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache1.view(-1, N)) qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input( - intermediate_cache2, a2_scale, max_num_tokens, E, N, + intermediate_cache2, self.a2_scale, max_num_tokens, E, N, expert_num_tokens, self.quant_dtype, self.per_act_token_quant, self.block_shape) @@ -1032,11 +981,11 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_num_tokens=expert_num_tokens, compute_type=compute_type, A_scale=a2q_scale, - B_scale=w2_scale, - B_zp=w2_zp, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, + B_scale=self.w2_scale, + B_zp=self.w2_zp, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, config=config, per_act_token_quant=self.per_act_token_quant, block_shape=self.block_shape) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 36c2ab8b2d5f3..d4de3f640865e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Fused MoE kernel.""" +"""Fused MoE Triton kernels.""" import functools import json import os # torch.compile needs typing.List. 
It will fail torch.library.infer_schema # otherwise from typing import List # noqa: UP035 -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -18,7 +18,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger # yapf: disable from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, get_config_quant_dtype) + FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig, _get_config_dtype_str) from vllm.model_executor.layers.fused_moe.cutlass_moe import ( _valid_cutlass_block_scaled_grouped_gemm, run_cutlass_block_scaled_fused_experts) @@ -32,11 +32,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import ( - _resize_cache, moe_kernel_quantize_input) -from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( - calculate_tile_tokens_dim) -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) + _resize_cache, activation_without_mul, moe_kernel_quantize_input) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( dequant_mxfp4) from vllm.platforms import current_platform @@ -1049,87 +1045,66 @@ def fused_grouped_topk( return topk_values.to(torch.float32), topk_indices.to(torch.int32) -def get_config_dtype_str( - dtype: torch.dtype, - use_int4_w4a16: Optional[bool] = False, - use_int8_w8a16: Optional[bool] = False, - use_fp8_w8a8: Optional[bool] = False, - use_mxfp4_w4a4: Optional[bool] = False) -> Optional[str]: - if use_fp8_w8a8: - return "fp8_w8a8" - elif use_int8_w8a16: - return "int8_w8a16" - elif use_int4_w4a16: - return "int4_w4a16" - elif use_mxfp4_w4a4: - return "mxfp4_w4a4" - elif dtype == torch.float: - # avoiding cases where kernel fails when float32 MoE - # use fp16/bfloat16 configs - return "float32" - return None - - def inplace_fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, #noqa: UP006 - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> None: + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + 
w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, +) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, - activation, is_act_and_mul, - apply_router_weight_on_input, use_fp8_w8a8, + activation, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias) -def inplace_fused_experts_fake(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> None: +def inplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, +) -> None: pass @@ -1143,175 +1118,6 @@ direct_register_custom_op( ) -def flashinfer_fused_moe_blockscale_fp8( - routing_logits: torch.Tensor, - routing_bias: torch.Tensor, - x: torch.Tensor, - w13_weight: torch.Tensor, - w13_weight_scale_inv: torch.Tensor, - w2_weight: torch.Tensor, - w2_weight_scale_inv: torch.Tensor, - global_num_experts: int, - top_k: int, - num_expert_group: int, - topk_group: int, - intermediate_size: int, - expert_offset: int, - local_num_experts: int, - block_shape: List[int], #noqa: UP006 - routed_scaling: float = 1.0) -> torch.Tensor: - from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe - assert top_k <= global_num_experts - assert top_k <= 8 - assert topk_group <= 4 - assert global_num_experts > num_expert_group - assert global_num_experts % num_expert_group == 0 - assert global_num_experts % 4 == 0 - assert top_k < (topk_group * global_num_experts / num_expert_group) - assert block_shape == [128, 128] - - a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1]) - # NOTE: scales of hidden 
states have to be transposed! - a_sf_t = a_sf.t().contiguous() - return flashinfer_trtllm_fp8_block_scale_moe( - routing_logits=routing_logits, - routing_bias=routing_bias, - hidden_states=a_q, - hidden_states_scale=a_sf_t, - gemm1_weights=w13_weight, - gemm1_weights_scale=w13_weight_scale_inv, - gemm2_weights=w2_weight, - gemm2_weights_scale=w2_weight_scale_inv, - num_experts=global_num_experts, - top_k=top_k, - n_group=num_expert_group, - topk_group=topk_group, - intermediate_size=intermediate_size, - local_expert_offset=expert_offset, - local_num_experts=local_num_experts, - routed_scaling_factor=routed_scaling, - tile_tokens_dim=calculate_tile_tokens_dim(x.shape[0], top_k, - global_num_experts), - routing_method_type=2, # DeepSeek-styled routing method - use_shuffled_weight=False, - ) - - -def flashinfer_fused_moe_blockscale_fp8_fake( - routing_logits: torch.Tensor, - routing_bias: torch.Tensor, - x: torch.Tensor, - w13_weight: torch.Tensor, - w13_weight_scale_inv: torch.Tensor, - w2_weight: torch.Tensor, - w2_weight_scale_inv: torch.Tensor, - global_num_experts: int, - top_k: int, - num_expert_group: int, - topk_group: int, - intermediate_size: int, - expert_offset: int, - local_num_experts: int, - block_shape: list[int], - routed_scaling: float = 1.0) -> torch.Tensor: - return torch.empty_like(x) - - -direct_register_custom_op( - op_name="flashinfer_fused_moe_blockscale_fp8", - op_func=flashinfer_fused_moe_blockscale_fp8, - mutates_args=[], - fake_impl=flashinfer_fused_moe_blockscale_fp8_fake, - tags=(torch.Tag.needs_fixed_stride_order, ), -) - - -def flashinfer_fused_moe_per_tensor_scale_fp8( - routing_logits: torch.Tensor, - routing_bias: Optional[torch.Tensor], - hidden_states: torch.Tensor, - input_scale: torch.Tensor, - gemm1_weights: torch.Tensor, - gemm2_weights: torch.Tensor, - output1_scales_scalar: torch.Tensor, - output1_scales_gate_scalar: torch.Tensor, - output2_scales_scalar: torch.Tensor, - num_experts: int, - top_k: int, - num_expert_group: Optional[int], - topk_group: Optional[int], - intermediate_size: int, - local_expert_offset: int, - local_num_experts: int, - use_routing_scales_on_input: bool, - routing_method_type: int, - routed_scaling_factor: float = 1.0) -> torch.Tensor: - num_expert_group = num_expert_group if num_expert_group is not None else 0 - topk_group = topk_group if topk_group is not None else 0 - - quant_hidden_states, _ = moe_kernel_quantize_input( - hidden_states, - input_scale, - quant_dtype=torch.float8_e4m3fn, - per_act_token_quant=False) - - from vllm.utils.flashinfer import ( - flashinfer_trtllm_fp8_per_tensor_scale_moe) - return flashinfer_trtllm_fp8_per_tensor_scale_moe( - routing_logits=routing_logits, - routing_bias=routing_bias, - hidden_states=quant_hidden_states, - gemm1_weights=gemm1_weights, - output1_scales_scalar=output1_scales_scalar, - output1_scales_gate_scalar=output1_scales_gate_scalar, - gemm2_weights=gemm2_weights, - output2_scales_scalar=output2_scales_scalar, - num_experts=num_experts, - top_k=top_k, - n_group=num_expert_group, - topk_group=topk_group, - intermediate_size=intermediate_size, - local_expert_offset=local_expert_offset, - local_num_experts=local_num_experts, - routed_scaling_factor=routed_scaling_factor, - use_routing_scales_on_input=use_routing_scales_on_input, - tile_tokens_dim=calculate_tile_tokens_dim(hidden_states.shape[0], - top_k, num_experts), - routing_method_type=routing_method_type) - - -def flashinfer_fused_moe_per_tensor_scale_fp8_fake( - routing_logits: torch.Tensor, - routing_bias: 
Optional[torch.Tensor], - hidden_states: torch.Tensor, - input_scale: torch.Tensor, - gemm1_weights: torch.Tensor, - gemm2_weights: torch.Tensor, - output1_scales_scalar: torch.Tensor, - output1_scales_gate_scalar: torch.Tensor, - output2_scales_scalar: torch.Tensor, - num_experts: int, - top_k: int, - num_expert_group: Optional[int], - topk_group: Optional[int], - intermediate_size: int, - local_expert_offset: int, - local_num_experts: int, - use_routing_scales_on_input: bool, - routing_method_type: int, - routed_scaling_factor: float = 1.0) -> torch.Tensor: - pass - - -direct_register_custom_op( - op_name="flashinfer_fused_moe_per_tensor_scale_fp8", - op_func=flashinfer_fused_moe_per_tensor_scale_fp8, - mutates_args=["hidden_states"], - fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake, - tags=(torch.Tag.needs_fixed_stride_order, ), -) - - def outplace_fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -1319,7 +1125,6 @@ def outplace_fused_experts( topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", - is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1341,37 +1146,37 @@ def outplace_fused_experts( ) -> torch.Tensor: return fused_experts_impl( hidden_states, w1, w2, topk_weights, topk_ids, False, activation, - is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, - use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, - per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, - w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias) + apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, + use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, + global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, + a1_scale, a2_scale, block_shape, w1_bias, w2_bias) def outplace_fused_experts_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: 
Optional[torch.Tensor] = None, +) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1403,45 +1208,36 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: # TODO (bnell): replace this with modular op. Can get rid of inplace/outplace # torch ops. -def fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - allow_deep_gemm: bool = False, - allow_cutlass_block_scaled_grouped_gemm: bool = False, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + quant_config: Optional[FusedMoEQuantConfig] = None, + allow_deep_gemm: bool = False, + allow_cutlass_block_scaled_grouped_gemm: bool = False, +) -> torch.Tensor: + + if quant_config is None: + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + use_fp8_w8a8 = quant_config.use_fp8_w8a8 + # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. # However, on B200, we use DeepGemm for all cases because they only support # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. 
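# Illustrative sketch, not part of the original change: the backend
# precedence the branches below implement, written as a pure function.
# `valid_deep_gemm` / `valid_cutlass` stand in for the
# _valid_deep_gemm(...) and _valid_cutlass_block_scaled_grouped_gemm(...)
# checks used in this file.
def _select_fused_experts_backend(use_fp8_w8a8: bool, allow_deep_gemm: bool,
                                  allow_cutlass: bool, valid_deep_gemm: bool,
                                  valid_cutlass: bool) -> str:
    if allow_deep_gemm and use_fp8_w8a8 and valid_deep_gemm:
        return "deep_gemm"  # preferred for block-scaled fp8 (always on B200)
    if allow_cutlass and use_fp8_w8a8 and valid_cutlass:
        return "cutlass_block_scaled"
    return "triton"  # generic fallback path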
- if (allow_deep_gemm and use_fp8_w8a8 and + if (allow_deep_gemm and quant_config.use_fp8_w8a8 and (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))): + assert quant_config is not None assert apply_router_weight_on_input is False - assert is_act_and_mul, ( - "DeepGemm only supports is_act_and_mul=True for now.") return deep_gemm_moe_fp8( hidden_states=hidden_states, w1=w1, @@ -1452,22 +1248,23 @@ def fused_experts(hidden_states: torch.Tensor, activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + a1_scale=quant_config.a1_scale, + a2_scale=quant_config.a2_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8 and _valid_cutlass_block_scaled_grouped_gemm( w1, w2, inplace, activation, apply_router_weight_on_input, expert_map)): + assert quant_config is not None return run_cutlass_block_scaled_fused_experts( a=hidden_states, w1=w1, w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, topk_weights=topk_weights, topk_ids=topk_ids) else: @@ -1478,26 +1275,49 @@ def fused_experts(hidden_states: torch.Tensor, topk_weights=topk_weights, topk_ids=topk_ids, activation=activation, - is_act_and_mul=is_act_and_mul, apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_channel_quant=per_channel_quant, + use_fp8_w8a8=quant_config.use_fp8_w8a8, + use_int8_w8a8=quant_config.use_int8_w8a8, + use_int8_w8a16=quant_config.use_int8_w8a16, + use_int4_w4a16=quant_config.use_int4_w4a16, + use_mxfp4_w4a4=quant_config.use_mxfp4_w4a4, + per_channel_quant=quant_config.per_act_token_quant, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_shape, - w1_bias=w1_bias, - w2_bias=w2_bias, - ) + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + w1_zp=quant_config.w1_zp, + w2_zp=quant_config.w2_zp, + a1_scale=quant_config.a1_scale, + a2_scale=quant_config.a2_scale, + block_shape=quant_config.block_shape, + w1_bias=quant_config.w1_bias, + w2_bias=quant_config.w2_bias) + + +SILU_NO_MUL: str = activation_without_mul("silu") +GELU_NO_MUL: str = activation_without_mul("gelu") + + +def _get_config_quant_dtype( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_mxfp4_w4a4: bool, +) -> Union[None, torch.dtype, str]: + """ + Get the quantization type based on the quantization strategy flags. + We don't have a quant_config at this point so we need to work backwards. + A return type of None means no quantization is required because the + input is unquantized or has been quantized prior to calling + fused_experts_impl. 
+ """ + if use_fp8_w8a8: + return torch.float8_e4m3fn + elif use_int8_w8a8: + return torch.int8 + elif use_mxfp4_w4a4: + return "mxfp4" + return None def fused_experts_impl( @@ -1508,7 +1328,6 @@ def fused_experts_impl( topk_ids: torch.Tensor, inplace: bool = False, activation: str = "silu", - is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1557,17 +1376,18 @@ def fused_experts_impl( # https://github.com/vllm-project/vllm/issues/5938 CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE M = min(num_tokens, CHUNK_SIZE) - config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - dtype=hidden_states.dtype) - qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4) + config_dtype = _get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4, + dtype=hidden_states.dtype) + + # Note: for use_int8_w8a16 or use_int4_w4a16, the activations are + # quantized prior to calling fused_experts. + quant_dtype = _get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_mxfp4_w4a4=use_mxfp4_w4a4) get_config_func = functools.partial( try_get_optimal_moe_config, @@ -1640,7 +1460,7 @@ def fused_experts_impl( qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( A=curr_hidden_states, A_scale=a1_scale, - quant_dtype=qtype, + quant_dtype=quant_dtype, per_act_token_quant=per_channel_quant, block_shape=block_shape) @@ -1671,30 +1491,29 @@ def fused_experts_impl( B_bias=w1_bias) # Activation function with multiplication - if activation == "silu" and is_act_and_mul: + if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) - elif activation == "gelu" and is_act_and_mul: + elif activation == "gelu": torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) - elif activation == "swigluoai" and is_act_and_mul: + elif activation == "swigluoai": # alpha = 1.702, limit = 7.0 torch.ops._C.swigluoai_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) # Activation function without multiplication - elif activation == "silu": + elif activation == SILU_NO_MUL: intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) - elif activation == "gelu": + elif activation == GELU_NO_MUL: intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) else: - raise ValueError(f"Unsupported FusedMoe activation: {activation}, " - f"with is_act_and_mul={is_act_and_mul}.") + raise ValueError(f"Unsupported FusedMoe activation: {activation}.") qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( A=intermediate_cache2, A_scale=a2_scale, - quant_dtype=qtype, + quant_dtype=quant_dtype, per_act_token_quant=per_channel_quant, block_shape=block_shape) @@ -1726,164 +1545,13 @@ def fused_experts_impl( return out_hidden_states -def fused_moe( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - gating_output: torch.Tensor, - topk: int, - renormalize: bool, - inplace: bool = False, - activation: str = "silu", - is_act_and_mul: bool = True, - use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, - topk_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - use_fp8_w8a8: 
bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, -) -> torch.Tensor: - """ - This function computes a Mixture of Experts (MoE) layer using two sets of - weights, w1 and w2, and top-k gating mechanism. - - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w1 (torch.Tensor): The first set of expert weights. - - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation - (before softmax). - - topk (int): The number of top-k experts to select. - - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. - Defaults to False. - - activation (str): The activation function to apply after the first - MoE layer. - - is_act_and_mul (bool): If True, use activation-and-mul function for - activation (self-gated activation), otherwise use activation function - for activation (ungated activation). - - num_expert_group: Optional[int]: additional parameter for grouped_topk - - topk_group: Optional[int]: additional parameter for grouped_topk - - use_grouped_topk: If True, use grouped_topk instead of fused_topk - note: Deepseekv2 model uses grouped_topk - - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. - - use_int8_w8a8 (bool): If True, use int8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. - - use_int8_w8a16 (bool): If True, use matmul of int8 weight and bf16/fp16 - activation to compute the inner products for w1 and w2. - Defaults to False. - - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16 - activation to compute the inner products for w1 and w2. - Defaults to False. - - use_mxfp4_w4a4 (bool): If True, use matmul of OCP MXFP4 weight and - OCP MXFP4 activation to compute the inner products for w1 and w2. - Defaults to False. - - global_num_experts (int): The total number of experts in the global - expert space. - - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert - parallel shard. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for - w1. - - w2_scale (Optional[torch.Tensor]): Optional scale to be used for - w2. - - a1_scale (Optional[torch.Tensor]): Optional scale to be used for - a1. - - a2_scale (Optional[torch.Tensor]): Optional scale to be used for - a2. - - block_shape: (Optional[list[int]]): Optional block size for block-wise - quantization. - - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. 
- """ - if not is_act_and_mul: - assert inplace is False, ( - "is_act_and_mul=False is not supported with inplace=True") - - if use_grouped_topk: - assert num_expert_group is not None and topk_group is not None - topk_weights, topk_ids = grouped_topk(hidden_states, gating_output, - topk, renormalize, - num_expert_group, topk_group) - elif custom_routing_function is None: - topk_weights, topk_ids, token_expert_indices = fused_topk( - hidden_states, gating_output, topk, renormalize) - else: - topk_weights, topk_ids = custom_routing_function( - hidden_states, gating_output, topk, renormalize) - - return fused_experts(hidden_states, - w1, - w2, - topk_weights, - topk_ids, - inplace=inplace, - activation=activation, - is_act_and_mul=is_act_and_mul, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_channel_quant=per_channel_quant, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_shape, - w1_bias=w1_bias, - w2_bias=w2_bias) - - class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, ): - super().__init__( - FusedMoEQuantConfig.make( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - )) - - self.use_fp8_w8a8 = use_fp8_w8a8 - self.use_int4_w4a16 = use_int4_w4a16 - self.use_int8_w8a8 = use_int8_w8a8 - self.use_int8_w8a16 = use_int8_w8a16 - self.use_mxfp4_w4a4 = use_mxfp4_w4a4 + super().__init__(quant_config) @property def activation_formats( @@ -1929,19 +1597,14 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): # Check constraints. 
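# Illustrative sketch, assuming the usual two-per-byte int4 packing: the
# shape constraint checked just below ties the activation hidden size to
# w1's stored K dimension.
import torch

def _expected_hidden_size(w1: torch.Tensor, use_int4_w4a16: bool) -> int:
    k_packed = w1.size(2)  # stored K; holds two int4 values per element
    return 2 * k_packed if use_int4_w4a16 else k_packed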
- if self.use_int4_w4a16: + if self.quant_config.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( "Hidden size mismatch") else: @@ -1964,17 +1627,11 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): if global_num_experts == -1: global_num_experts = E - config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - use_mxfp4_w4a4=self.use_mxfp4_w4a4, - dtype=hidden_states.dtype) - config = try_get_optimal_moe_config( w1.size(), w2.size(), top_k_num, - config_dtype, + self.quant_config.config_name(hidden_states.dtype), num_tokens, block_shape=self.block_shape, ) @@ -2008,8 +1665,8 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w1, intermediate_cache1, a1q_scale, - w1_scale, - w1_zp, + self.w1_scale, + self.w1_zp, None, # topk_weights sorted_token_ids, expert_ids, @@ -2018,13 +1675,13 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): top_k_num, config, compute_type=compute_type, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a8=self.use_int8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a8=self.quant_config.use_int8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, per_channel_quant=self.per_act_token_quant, block_shape=self.block_shape, - B_bias=None # TODO support B_bias + B_bias=self.w1_bias, ) self.activation(activation, intermediate_cache2, @@ -2033,7 +1690,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): a2q_scale: Optional[torch.Tensor] = None qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( - intermediate_cache2, a2_scale, self.quant_dtype, + intermediate_cache2, self.a2_scale, self.quant_dtype, self.per_act_token_quant, self.block_shape) invoke_fused_moe_kernel( @@ -2041,8 +1698,8 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w2, intermediate_cache3, a2q_scale, - w2_scale, - w2_zp, + self.w2_scale, + self.w2_zp, topk_weights, sorted_token_ids, expert_ids, @@ -2051,36 +1708,21 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): 1, config, compute_type=compute_type, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a8=self.use_int8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a8=self.quant_config.use_int8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, per_channel_quant=self.per_act_token_quant, block_shape=self.block_shape, - B_bias=None # TODO support B_bias + B_bias=self.w2_bias, ) ops.moe_sum(intermediate_cache3, output) def modular_triton_fused_moe( - use_fp8_w8a8: bool, - use_int8_w8a8: bool, - use_int8_w8a16: bool, - use_int4_w4a16: bool, - use_mxfp4_w4a4: bool, - per_act_token_quant: bool, - block_shape: Optional[list[int]] = None, -) -> mk.FusedMoEModularKernel: + quant_config: FusedMoEQuantConfig) -> mk.FusedMoEModularKernel: return mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), - TritonExperts( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - ), + TritonExperts(quant_config), ) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py 
b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 312befe2c1d71..614a83ad1158c 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Optional +from typing import Optional import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.utils import has_triton_kernels @@ -23,9 +25,6 @@ if has_triton_kernels(): "Failed to import Triton kernels. Please make sure your triton " "version is compatible.") -if TYPE_CHECKING: - from triton_kernels.matmul_ogs import PrecisionConfig - def triton_kernel_moe_forward( hidden_states: torch.Tensor, @@ -35,20 +34,10 @@ def triton_kernel_moe_forward( topk: int, renormalize: bool, activation: str = "silu", + quant_config: Optional[FusedMoEQuantConfig] = None, apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - per_channel_quant: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, - w1_precision: Optional["PrecisionConfig"] = None, - w2_precision: Optional["PrecisionConfig"] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, ) -> torch.Tensor: routing_data, gather_idx, scatter_idx = routing(gating_output, @@ -64,20 +53,10 @@ def triton_kernel_moe_forward( gather_idx, scatter_idx, activation=activation, + quant_config=quant_config, apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=use_fp8_w8a8, - per_channel_quant=per_channel_quant, global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_bias=w1_bias, - w2_bias=w2_bias, - w1_precision=w1_precision, - w2_precision=w2_precision, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_shape) + expert_map=expert_map) # This is a triton implementation of the fused_experts function @@ -90,28 +69,23 @@ def triton_kernel_fused_experts( gather_indx, # GatherIndx scatter_indx, # ScatterIndx activation: str = "silu", + quant_config: Optional[FusedMoEQuantConfig] = None, swiglu_alpha: float = 1.702, swiglu_limit: float = 7.0, apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - per_channel_quant: bool = False, global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None, - w1_precision: Optional["PrecisionConfig"] = None, - w2_precision: Optional["PrecisionConfig"] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, + a1q_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if quant_config is None: + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG # type check, uint8 means mxfp4 assert 
hidden_states.dtype == torch.bfloat16 - assert w1_bias is None or w1_bias.dtype == torch.float32 - assert w2_bias is None or w2_bias.dtype == torch.float32 + assert (quant_config.w1_bias is None + or quant_config.w1_bias.dtype == torch.float32) + assert (quant_config.w2_bias is None + or quant_config.w2_bias.dtype == torch.float32) # Shape check, only check non-mxfp4 assert hidden_states.shape[-1] == w1.shape[-2] @@ -130,20 +104,20 @@ def triton_kernel_fused_experts( intermediate_cache1 = matmul_ogs( hidden_states, w1, - w1_bias, + quant_config.w1_bias, routing_data, gather_indx=gather_indx, - precision_config=w1_precision, + precision_config=quant_config.w1_precision, gammas=gammas if apply_router_weight_on_input else None, fused_activation=act) intermediate_cache3 = matmul_ogs( intermediate_cache1, w2, - w2_bias, + quant_config.w2_bias, routing_data, scatter_indx=scatter_indx, - precision_config=w2_precision, + precision_config=quant_config.w2_precision, gammas=None if apply_router_weight_on_input else gammas, y=output_tensor, ) @@ -154,21 +128,13 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - quant_config, max_num_tokens: int, num_dispatchers: int, - w1_precision: "PrecisionConfig", - w2_precision: "PrecisionConfig", - w1_bias: Optional[torch.Tensor], - w2_bias: Optional[torch.Tensor], + quant_config: FusedMoEQuantConfig, ): super().__init__(quant_config) self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers - self.w1_precision = w1_precision - self.w2_precision = w2_precision - self.w1_bias = w1_bias - self.w2_bias = w2_bias @property def activation_formats( @@ -212,12 +178,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -228,20 +189,12 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): hidden_states, w1, w2, - None, - None, - None, + routing_data=None, + gather_indx=None, + scatter_indx=None, activation=activation, + quant_config=self.quant_config, apply_router_weight_on_input=False, - use_fp8_w8a8=False, - per_channel_quant=False, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_bias=self.w1_bias, - w2_bias=self.w2_bias, - w1_precision=self.w1_precision, - w2_precision=self.w2_precision, - a1_scale=a1q_scale, - a2_scale=a2_scale) + a1q_scale=a1q_scale) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d22bb253f4a72..ae3b67a2b84e6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -22,7 +22,8 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp # yapf: disable from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, FusedMoEParallelConfig) + FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEConfig, FusedMoEParallelConfig, + FusedMoEQuantConfig, biased_moe_quant_config) # yapf: enable from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEActivationFormat, FusedMoEModularKernel, @@ -78,11 +79,11 @@ class FusedMoeWeightScaleSupported(Enum): 
class FusedMoEMethodBase(QuantizeMethodBase): - # TODO(bnell): also pass quant_config? def __init__(self, moe: FusedMoEConfig): super().__init__() self.moe = moe - self.fused_experts: Optional[Callable] = None + self.moe_quant_config: Optional[FusedMoEQuantConfig] = None + self.fused_experts: Optional[FusedMoEModularKernel] = None self.topk_indices_dtype = None @abstractmethod @@ -103,23 +104,28 @@ class FusedMoEMethodBase(QuantizeMethodBase): @staticmethod def _maybe_make_prepare_finalize( - moe: FusedMoEConfig, ) -> Optional[FusedMoEPrepareAndFinalize]: + moe: FusedMoEConfig, + quant_config: Optional[FusedMoEQuantConfig], + ) -> Optional[FusedMoEPrepareAndFinalize]: all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None + # TODO: could allow this now assert not moe.use_flashinfer_cutlass_kernels, \ "Must be created in modelopt.py" if moe.use_pplx_kernels: + assert quant_config is not None + hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes( moe.max_num_tokens, moe.hidden_dim, moe.in_dtype, - moe.quant_dtype, - per_act_token_quant=moe.per_act_token_quant, - block_shape=moe.block_shape, + quant_config.quant_dtype, + per_act_token_quant=quant_config.per_act_token_quant, + block_shape=quant_config.block_shape, ) all_to_all_args = dict( @@ -165,6 +171,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ) elif moe.use_deepep_ll_kernels: + assert quant_config is not None all_to_all_args = dict( max_num_tokens_per_dp_rank=moe.max_num_tokens, token_hidden_size=moe.hidden_dim, @@ -174,13 +181,11 @@ class FusedMoEMethodBase(QuantizeMethodBase): all2all_manager.world_size) handle = all2all_manager.get_handle(all_to_all_args) - # Note : We may want to use FP8 dispatch even otherwise just to - # reduce datamovement - use_fp8_dispatch = (moe.quant_config is not None - and moe.quant_config.quant_dtype - == current_platform.fp8_dtype() - and moe.quant_config.block_shape - == DEEPEP_QUANT_BLOCK_SHAPE) + # Note: We may want to use FP8 dispatch just to reduce + # data movement. + use_fp8_dispatch = ( + quant_config.quant_dtype == current_platform.fp8_dtype() + and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE) prepare_finalize = DeepEPLLPrepareAndFinalize( handle, @@ -192,11 +197,10 @@ class FusedMoEMethodBase(QuantizeMethodBase): return prepare_finalize def maybe_make_prepare_finalize( - self, - moe: FusedMoEConfig, - ) -> Optional[FusedMoEPrepareAndFinalize]: - if moe.moe_parallel_config.use_all2all_kernels: - return FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + self) -> Optional[FusedMoEPrepareAndFinalize]: + if self.moe.moe_parallel_config.use_all2all_kernels: + return FusedMoEMethodBase._maybe_make_prepare_finalize( + self.moe, self.moe_quant_config) else: return None @@ -204,7 +208,13 @@ class FusedMoEMethodBase(QuantizeMethodBase): # prepare_communication_buffer_for_model. def init_prepare_finalize(self, layer: torch.nn.Module): assert self.moe is not None - prepare_finalize = self.maybe_make_prepare_finalize(self.moe) + + # We must get the quant config here so that the layer is + # completely initialized, i.e. all weights loaded and post + # processed. 
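# Illustrative sketch of the ordering contract described above; `method`
# is a FusedMoEMethodBase-like object and `make_kernel` stands in for the
# FusedMoEModularKernel constructor (a sketch, not a drop-in helper).
def _init_modular_kernel(method, layer, make_kernel):
    # 1) weights are fully loaded and post-processed before this runs
    method.moe_quant_config = method.get_fused_moe_quant_config(layer)
    # 2) the all2all prepare/finalize may now consult the quant config
    prepare_finalize = method.maybe_make_prepare_finalize()
    if prepare_finalize is not None:
        # 3) pick the gemm experts impl and assemble the modular kernel
        experts = method.select_gemm_impl(prepare_finalize, layer)
        method.fused_experts = make_kernel(prepare_finalize, experts)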
+ self.moe_quant_config = self.get_fused_moe_quant_config(layer) + + prepare_finalize = self.maybe_make_prepare_finalize() if prepare_finalize is not None: logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__, @@ -213,7 +223,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): assert self.fused_experts is None, \ f"Attempt to override experts for {id(self)}!" self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() - experts = self.select_gemm_impl(prepare_finalize, self.moe, layer) + experts = self.select_gemm_impl(prepare_finalize, layer) self.fused_experts = FusedMoEModularKernel( prepare_finalize, experts, @@ -223,7 +233,6 @@ class FusedMoEMethodBase(QuantizeMethodBase): def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: # based on the all2all implementation, select the appropriate @@ -232,6 +241,11 @@ class FusedMoEMethodBase(QuantizeMethodBase): f"{self.__class__.__name__} must select appropriate gemm " "implementation based on the prepare_finalize") + @abstractmethod + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + raise NotImplementedError + @abstractmethod def apply( self, @@ -265,7 +279,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def __init__(self, moe: FusedMoEConfig): super().__init__(moe) - self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: from .rocm_aiter_fused_moe import rocm_aiter_fused_experts @@ -273,23 +286,30 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: self.rocm_aiter_fused_experts = None # type: ignore + def maybe_make_prepare_finalize( + self) -> Optional[FusedMoEPrepareAndFinalize]: + if self.rocm_aiter_moe_enabled: + return None + else: + return super().maybe_make_prepare_finalize() + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, - # TODO(bnell): Remove. Every layer should have an moe config object. 
- moe: FusedMoEConfig, layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): logger.debug("BatchedTritonExperts %s", self.moe) return BatchedTritonExperts( max_num_tokens=self.moe.max_num_tokens, num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, ) else: logger.debug("TritonExperts %s", self.moe) - return TritonExperts() + return TritonExperts(self.moe_quant_config) def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -303,7 +323,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - if self.has_bias: + if self.moe.has_bias: w13_bias = torch.nn.Parameter(torch.zeros( num_experts, 2 * intermediate_size_per_partition, @@ -320,7 +340,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - if self.has_bias: + if self.moe.has_bias: w2_bias = torch.nn.Parameter(torch.zeros(num_experts, hidden_size, dtype=params_dtype), @@ -442,6 +462,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): logical_replica_count=logical_replica_count, ) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if self.moe.has_bias: + return biased_moe_quant_config( + layer.w13_bias, + layer.w2_bias, + ) + else: + return FUSED_MOE_UNQUANTIZED_CONFIG + def forward_cuda( self, layer: torch.nn.Module, @@ -486,6 +516,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): logical_replica_count=logical_replica_count) if self.rocm_aiter_moe_enabled: + assert self.fused_experts is None return self.rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -496,7 +527,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) elif self.fused_experts is not None: - if self.has_bias: + if self.moe.has_bias: raise ValueError( "FusedMoEModularKernel does not support bias.") return self.fused_experts( @@ -517,12 +548,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_bias=layer.w13_bias if self.has_bias else None, - w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, activation=activation, + quant_config=self.moe_quant_config, apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map, @@ -933,16 +963,18 @@ class FusedMoE(CustomOp): # since model_config is not set in the pytest test. 
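# Illustrative sketch of the rule get_fused_moe_quant_config encodes
# earlier in this file's diff: even the unquantized path returns a config
# object so kernels read biases uniformly. `make_biased` and `unquantized`
# stand in for biased_moe_quant_config and FUSED_MOE_UNQUANTIZED_CONFIG.
def _unquantized_moe_config(has_bias, w13_bias, w2_bias, make_biased,
                            unquantized):
    return make_biased(w13_bias, w2_bias) if has_bias else unquantized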
model_dtype = params_dtype - moe = FusedMoEConfig.make(num_experts=self.global_num_experts, - experts_per_token=top_k, - hidden_dim=hidden_size, - num_local_experts=self.local_num_experts, - moe_parallel_config=self.moe_parallel_config, - in_dtype=model_dtype, - max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - quant_config=quant_config, - has_bias=has_bias) + moe = FusedMoEConfig( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + in_dtype=model_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + has_bias=has_bias, + ) self.moe_config = moe + self.moe_quant_config: Optional[FusedMoEQuantConfig] = None self.quant_config = quant_config # Note: get_quant_method will look at the layer's local_num_experts @@ -990,6 +1022,9 @@ class FusedMoE(CustomOp): # Chunked all2all staging tensor self.batched_hidden_states: Optional[torch.Tensor] = None self.batched_router_logits: Optional[torch.Tensor] = None + + # TODO(bnell): flashinfer uses non-batched format. + # Does it really need a batched buffer? if (self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels or self.moe_config.use_flashinfer_cutlass_kernels): @@ -1062,7 +1097,9 @@ class FusedMoE(CustomOp): @property def use_flashinfer_cutlass_kernels(self): - return self.moe_config.use_flashinfer_cutlass_kernels + return (self.moe_quant_config is not None + and self.moe_quant_config.quant_dtype == "nvfp4" + and self.moe_config.use_flashinfer_cutlass_kernels) def update_expert_map(self): # ep_size and ep_rank should already be updated @@ -1492,6 +1529,11 @@ class FusedMoE(CustomOp): self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] self.logical_replica_count = logical_replica_count[moe_layer_idx] + def ensure_moe_quant_config(self): + if self.quant_method.moe_quant_config is None: + self.quant_method.moe_quant_config = ( + self.quant_method.get_fused_moe_quant_config(self)) + @staticmethod def select_experts( hidden_states: torch.Tensor, @@ -1711,6 +1753,8 @@ class FusedMoE(CustomOp): assert ( self.batched_router_logits.size(-1) == full_router_logits.size(-1)) + self.ensure_moe_quant_config() + full_fused_final_hidden_states = torch.empty_like(full_hidden_states) if self.shared_experts is not None: full_shared_final_hidden_states = torch.empty_like( @@ -1825,14 +1869,17 @@ class FusedMoE(CustomOp): router_logits: torch.Tensor, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.quant_method is not None + + self.ensure_moe_quant_config() + # Route to the chunked forward path using the FlashInfer Cutlass kernel # only when data parallelism (DP) is enabled. 
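# Illustrative sketch of ensure_moe_quant_config's memoization defined
# above: the config is derived once from the fully loaded layer and cached
# on the quant method, so both forward paths can rely on it.
def _ensure_quant_config(quant_method, layer):
    if quant_method.moe_quant_config is None:
        quant_method.moe_quant_config = (
            quant_method.get_fused_moe_quant_config(layer))
    return quant_method.moe_quant_config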
- use_flashinfer_cutlass_kernels = ( - self.dp_size > 1 - and self.moe_config.use_flashinfer_cutlass_kernels) + _use_flashinfer_cutlass_kernels = (self.dp_size > 1 and + self.use_flashinfer_cutlass_kernels) + if (self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels - or use_flashinfer_cutlass_kernels): + or _use_flashinfer_cutlass_kernels): return self.forward_impl_chunked(hidden_states, router_logits) do_naive_dispatch_combine: bool = ( diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index efaa9cc058e41..58cd0294c8c44 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -177,8 +177,6 @@ class FusedMoEPrepareAndFinalize(ABC): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -189,9 +187,6 @@ class FusedMoEPrepareAndFinalize(ABC): """ Perform any quantization (and/or) dispatching needed for this kernel. - a1: The (unquantized) input to the MoE layer. - - a1_scale: Optional scales for a1 - - a2_scale: Optional scales for the second MoE gemm. Required to make - sure the quantization is consistent for both gemms. - topk_ids: The topk ids. - topk_weights: The topk weights. - num_experts: The total number of experts in the global expert space. @@ -199,10 +194,11 @@ class FusedMoEPrepareAndFinalize(ABC): space to the local expert space of the expert parallel shard. - apply_router_weight_on_input: When True, apply the weights to the activations, before quantization + dispatching. + - quant_config: Quantization info provided by the fused experts. Returns a tuple of: - quantized + dispatched a. - - quantized + dispatched a1_scales. + - Optional quantized + dispatched a1_scales. - Optional ExpertTokensMetadata containing gpu/cpu tensors as big as the number of local experts with the information about the number of tokens assigned to each local expert. @@ -220,8 +216,6 @@ class FusedMoEPrepareAndFinalize(ABC): def prepare_async( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -316,6 +310,7 @@ class FusedMoEPrepareAndFinalize(ABC): raise NotImplementedError +# TODO: add supported activations method (return string) class FusedMoEPermuteExpertsUnpermute(ABC): """ An abstract base class for the [Permute-Experts-Unpermute] step described @@ -324,12 +319,12 @@ class FusedMoEPermuteExpertsUnpermute(ABC): def __init__( self, - quant_config: Optional[FusedMoEQuantConfig], + quant_config: FusedMoEQuantConfig, ): - if quant_config is not None: - self.quant_config = quant_config - else: - self.quant_config = FusedMoEQuantConfig() + """ + quant_config: Quantization parameters for this experts instance. + """ + self.quant_config = quant_config @property @abstractmethod @@ -341,6 +336,11 @@ class FusedMoEPermuteExpertsUnpermute(ABC): """ raise NotImplementedError + # + # Various helpers for accessing quantization parameters from the + # quant_config. 
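# Illustrative miniature of the accessor pattern added just below: experts
# classes expose read-only properties that forward to an immutable quant
# config, replacing the long per-call tensor argument lists. `_Cfg` and
# `_Experts` are stand-ins, not classes from this patch.
from dataclasses import dataclass
from typing import Optional
import torch

@dataclass(frozen=True)
class _Cfg:
    w1_scale: Optional[torch.Tensor] = None
    a2_scale: Optional[torch.Tensor] = None

class _Experts:
    def __init__(self, quant_config: _Cfg):
        self.quant_config = quant_config

    @property
    def w1_scale(self) -> Optional[torch.Tensor]:
        return self.quant_config.w1_scale

    @property
    def a2_scale(self) -> Optional[torch.Tensor]:
        return self.quant_config.a2_scale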
+ # + @property def quant_dtype(self) -> Optional[torch.dtype]: return self.quant_config.quant_dtype @@ -357,6 +357,54 @@ class FusedMoEPermuteExpertsUnpermute(ABC): def per_out_ch_quant(self) -> bool: return self.quant_config.per_out_ch_quant + @property + def a1_scale(self) -> Optional[torch.Tensor]: + return self.quant_config.a1_scale + + @property + def a2_scale(self) -> Optional[torch.Tensor]: + return self.quant_config.a2_scale + + @property + def a1_gscale(self) -> Optional[torch.Tensor]: + return self.quant_config.a1_gscale + + @property + def a2_gscale(self) -> Optional[torch.Tensor]: + return self.quant_config.a2_gscale + + @property + def w1_scale(self) -> Optional[torch.Tensor]: + return self.quant_config.w1_scale + + @property + def w2_scale(self) -> Optional[torch.Tensor]: + return self.quant_config.w2_scale + + @property + def w1_zp(self) -> Optional[torch.Tensor]: + return self.quant_config.w1_zp + + @property + def w2_zp(self) -> Optional[torch.Tensor]: + return self.quant_config.w2_zp + + @property + def w1_bias(self) -> Optional[torch.Tensor]: + return self.quant_config.w1_bias + + @property + def w2_bias(self) -> Optional[torch.Tensor]: + return self.quant_config.w2_bias + + @property + def g1_alphas(self) -> Optional[torch.Tensor]: + return self.quant_config.g1_alphas + + @property + def g2_alphas(self) -> Optional[torch.Tensor]: + return self.quant_config.g2_alphas + # TODO (bnell): make this return a CHUNK_SIZE or None instead? @abstractmethod def supports_chunking(self) -> bool: @@ -433,12 +481,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[ExpertTokensMetadata], @@ -455,7 +498,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - topk_weights: A map of row to expert weights. Some implementations - choose to do weight application. + choose to do weight application. - topk_ids (torch.Tensor): A map of row to expert id. - activation (str): The activation function to apply after the first MoE layer. @@ -464,15 +507,9 @@ class FusedMoEPermuteExpertsUnpermute(ABC): - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. - - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. - - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for - w1. - - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for - w2. - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be - used for a1. - - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. + used for a1. Result of quantization from prepare/finalize and not + from the FusedMoEQuantConfig. - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs must be large enough to hold output of either MoE gemm. 
- workspace2 (torch.Tensor): A scratch tensor used for the activation @@ -559,12 +596,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts: int, local_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, ) -> torch.Tensor: @@ -601,12 +633,7 @@ class FusedMoEModularKernel(torch.nn.Module): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, a1q_scale=a1q_scale, - a2_scale=a2_scale, workspace13=workspace13, workspace2=workspace2, expert_tokens_meta=expert_tokens_meta, @@ -627,12 +654,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts: int, local_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, ) -> torch.Tensor: @@ -658,12 +680,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts=global_num_experts, local_num_experts=local_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, a1q_scale=a1q_scale, - a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -685,9 +702,13 @@ class FusedMoEModularKernel(torch.nn.Module): Optional[torch.Tensor], torch.Tensor, torch.Tensor]: s = chunk_idx * CHUNK_SIZE e = min(s + CHUNK_SIZE, M) - return (a1q[s:e], _chunk_scales(a1q_scale, s, e), - _chunk_scales(a2_scale, s, - e), topk_ids[s:e], topk_weights[s:e]) + return ( + a1q[s:e], + _chunk_scales(a1q_scale, s, e), + _chunk_scales(self.fused_experts.a2_scale, s, e), + topk_ids[s:e], + topk_weights[s:e], + ) def slice_output_tensor(chunk_idx: int) -> torch.Tensor: assert fused_out.size(0) % M == 0, ( @@ -744,12 +765,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts=global_num_experts, local_num_experts=local_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, a1q_scale=c_a1q_scale, - a2_scale=c_a2_scale, expert_tokens_meta=c_expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -767,12 +783,6 @@ class FusedMoEModularKernel(torch.nn.Module): activation: str = "silu", global_num_experts: int = -1, expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """ @@ -795,14 +805,6 @@ class FusedMoEModularKernel(torch.nn.Module): - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. 
- - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. - - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for - w1. - - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for - w2. - - a1_scale (Optional[torch.Tensor]): Optional scale to be used for a1. - - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. - apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1. @@ -832,8 +834,6 @@ class FusedMoEModularKernel(torch.nn.Module): (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, _expert_topk_weights) = self.prepare_finalize.prepare( a1, - a1_scale, - a2_scale, topk_weights, topk_ids, global_num_experts, @@ -846,8 +846,6 @@ class FusedMoEModularKernel(torch.nn.Module): dbo_maybe_run_recv_hook() hook, receiver = self.prepare_finalize.prepare_async( a1, - a1_scale, - a2_scale, topk_weights, topk_ids, global_num_experts, @@ -897,12 +895,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts=global_num_experts, local_num_experts=local_num_experts, expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, a1q_scale=a1q_scale, - a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index b8c1c14317c46..32d12476dd01a 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -95,8 +95,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare_async( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -130,8 +128,10 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): repeat_cols = 4 repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0) + # TODO(bnell): always pass quant_config.a1_scale? 
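# Illustrative sketch of the scale choice made in the call below: under
# per-act-token quantization the scale is computed per token at dispatch
# time, so the static config scale must be suppressed.
def _a1_scale_for_dispatch(quant_config):
    if quant_config.per_act_token_quant:
        return None  # computed on the fly, per token
    return quant_config.a1_scale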
a1q, a1q_scale = moe_kernel_quantize_input( - a1, (None if quant_config.per_act_token_quant else a1_scale), + a1, (None if quant_config.per_act_token_quant else + quant_config.a1_scale), quant_dtype=quant_config.quant_dtype, per_act_token_quant=quant_config.per_act_token_quant, block_shape=quant_config.block_shape) @@ -253,8 +253,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -264,8 +262,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) -> mk.PrepareResultType: hook, receiver = self.prepare_async( a1, - a1_scale, - a2_scale, topk_weights, topk_ids, num_experts, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index bd9f7d4a06b17..588e5de865dd9 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -30,8 +30,6 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): def prepare( self, a1: torch.Tensor, - a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, @@ -48,7 +46,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): a1.mul_(topk_weights.to(a1.dtype)) a1q, a1q_scale = moe_kernel_quantize_input( - a1, a1_scale, quant_config.quant_dtype, + a1, quant_config.a1_scale, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape) return a1q, a1q_scale, None, None, None diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 13c3ab4f06dd1..f4972ff5f9cb0 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -7,6 +7,8 @@ from typing import Optional import torch from vllm import envs +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig) from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op @@ -305,21 +307,18 @@ def rocm_aiter_grouped_topk( def rocm_aiter_fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - per_channel_quant: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - expert_map: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + expert_map: Optional[torch.Tensor] = None, + quant_config: Optional[FusedMoEQuantConfig] = None, +) -> torch.Tensor: + if quant_config is None: + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG activation_method = (ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU) @@ -333,7 +332,8 @@ def rocm_aiter_fused_experts( expert_mask = None # w8a8 per-channel quantization - if per_channel_quant and 
apply_router_weight_on_input and use_fp8_w8a8: + if (quant_config.per_act_token_quant and apply_router_weight_on_input + and quant_config.use_fp8_w8a8): # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer # rather than the second FC. @@ -349,8 +349,8 @@ def rocm_aiter_fused_experts( w2, topk_weights, topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, + fc1_scale=quant_config.w1_scale, + fc2_scale=quant_config.w2_scale, fc1_smooth_scale=None, fc2_smooth_scale=None, a16=False, @@ -362,14 +362,14 @@ def rocm_aiter_fused_experts( quant_method = QuantMethod.NO.value # w8a8 block-scaled - if block_shape is not None and use_fp8_w8a8: + if quant_config.block_shape is not None and quant_config.use_fp8_w8a8: assert not apply_router_weight_on_input, ( "apply_router_weight_on_input is\ not supported for block scaled moe") - assert w1_scale is not None - assert w2_scale is not None + assert quant_config.w1_scale is not None + assert quant_config.w2_scale is not None quant_method = QuantMethod.BLOCK_128x128.value - elif use_fp8_w8a8: + elif quant_config.use_fp8_w8a8: # Currently only per tensor quantization method is enabled. quant_method = QuantMethod.PER_TENSOR.value @@ -390,10 +390,10 @@ def rocm_aiter_fused_experts( expert_mask=expert_mask, quant_method=quant_method, activation_method=activation_method, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + a1_scale=quant_config.a1_scale, + a2_scale=quant_config.a2_scale, doweight_stage1=apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 6cd81d97f0298..b2dbc306a6148 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -7,7 +7,8 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, + DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape) +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used @@ -17,40 +18,19 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + quant_config: FusedMoEQuantConfig, allow_deep_gemm: bool = False, ): - super().__init__( - FusedMoEQuantConfig.make( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - )) - self.triton_expert = TritonExperts( - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int4_w4a16=use_int4_w4a16, - use_int8_w8a16=use_int8_w8a16, - use_mxfp4_w4a4=use_mxfp4_w4a4, - per_act_token_quant=per_act_token_quant, - block_shape=block_shape, - ) + 
super().__init__(quant_config) - self.allow_deep_gemm = (allow_deep_gemm and use_fp8_w8a8 and + self.triton_expert = TritonExperts(quant_config) + + self.allow_deep_gemm = (allow_deep_gemm + and self.quant_config.use_fp8_w8a8 and self.block_shape == deep_gemm_block_shape()) self.deep_gemm_expert = DeepGemmExperts( - ) if self.allow_deep_gemm else None + self.quant_config) if self.allow_deep_gemm else None @property def activation_formats( @@ -130,12 +110,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -158,12 +133,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation, global_num_experts, expert_map, - w1_scale, - w2_scale, - w1_zp, - w2_zp, a1q_scale, - a2_scale, workspace13, workspace2, expert_tokens_meta, diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 14dfce4b0e3aa..8e5f6acc9df63 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -5,7 +5,8 @@ from typing import Optional import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP) from vllm.utils import next_power_of_2 @@ -16,20 +17,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, moe: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, gemm1_alpha, gemm1_beta, gemm1_clamp_limit, - w13_bias, - w2_bias, max_capture_size, ): - super().__init__(moe.quant_config) + super().__init__(quant_config) self.moe = moe self.gemm1_alpha = gemm1_alpha self.gemm1_beta = gemm1_beta self.gemm1_clamp_limit = gemm1_clamp_limit - self.w13_bias = w13_bias - self.w2_bias = w2_bias self.max_capture_size = max_capture_size @property @@ -104,12 +102,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): activation: str, global_num_experts: int, expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -129,8 +122,8 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( torch.bfloat16).view(torch.int16) - assert w1_scale is not None - assert w2_scale is not None + assert self.w1_scale is not None + assert self.w2_scale is not None kwargs = { "topk_ids": packed_tensor, @@ -143,9 +136,9 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): "gemm1_weights": w1, "gemm1_weights_scale": - w1_scale, + self.w1_scale, "gemm1_bias": - self.w13_bias, + self.w1_bias, "gemm1_alpha": self.gemm1_alpha, "gemm1_beta": @@ -155,7 +148,7 @@ class 
TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): "gemm2_weights": w2, "gemm2_weights_scale": - w2_scale, + self.w2_scale, "gemm2_bias": self.w2_bias, "output1_scale_scalar": diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 1aeb3f92bc3ea..678942e568d86 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -268,3 +268,7 @@ def _validate_scale_shape( assert block_shape is not None expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" + + +def activation_without_mul(activation: str) -> str: + return activation + "_no_mul" diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index bf99f0823b745..060d6e84a944d 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -9,8 +9,10 @@ from torch.nn import Parameter import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -483,6 +485,10 @@ class AWQMoEMethod(FusedMoEMethodBase): if hasattr(layer, "w2_bias") and layer.w2_bias is not None: layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 2245c59af6fea..650dab8df87e3 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -6,8 +6,9 @@ from typing import Any, Callable, Optional, Union import torch from packaging import version +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, - FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -452,6 +453,10 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): **extra_weight_attrs, ) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, @@ -509,6 +514,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map, + quant_config=self.moe_quant_config, ) def _create_weights_4bit( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5470deb768450..85adae32f4cdc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -16,8 +16,11 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, - FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, - FusedMoeWeightScaleSupported) + FusedMoEPermuteExpertsUnpermute, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, + int4_w4a16_moe_quant_config, int8_w8a8_moe_quant_config, + int8_w8a16_moe_quant_config, nvfp4_moe_quant_config) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa @@ -122,7 +125,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): return CompressedTensorsWNA16MarlinMoEMethod( quant_config, layer.moe_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): - return CompressedTensorsW4A4MoeMethod(layer.moe_config, layer) + return CompressedTensorsW4A4MoeMethod(layer.moe_config) elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) or quant_config._is_fp8_w8a8(weight_quant, input_quant)): @@ -138,7 +141,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): - def __init__(self, moe: FusedMoEConfig, layer: torch.nn.Module): + def __init__(self, moe: FusedMoEConfig): from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) super().__init__(moe) @@ -147,7 +150,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 - self.layer = layer def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -305,37 +307,46 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): (layer.w2_input_global_scale), requires_grad=False) def maybe_make_prepare_finalize( - self, - moe: FusedMoEConfig, - ) -> Optional[mk.FusedMoEPrepareAndFinalize]: - if not self.allow_flashinfer: - return super().maybe_make_prepare_finalize(moe) + self) -> Optional[mk.FusedMoEPrepareAndFinalize]: + if self.use_marlin: + return None + elif not self.allow_flashinfer: + return super().maybe_make_prepare_finalize() prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( - moe, - a1_gscale=self.layer.w13_input_scale_quant, - ) + self.moe) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None """Return the appropriate GEMM experts implementation.""" experts = select_nvfp4_gemm_impl( - moe, - g1_alphas=self.layer.g1_alphas, - g2_alphas=self.layer.g2_alphas, - a1_gscale=self.layer.w13_input_scale_quant, - a2_gscale=self.layer.w2_input_scale_quant, + self.moe, + self.moe_quant_config, allow_flashinfer=self.allow_flashinfer, ) logger.debug_once("Using %s", experts.__class__.__name__) return experts + def get_fused_moe_quant_config( + self, layer: 
torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if self.use_marlin: + return None + + return nvfp4_moe_quant_config( + g1_alphas=layer.g1_alphas, + g2_alphas=layer.g2_alphas, + a1_gscale=layer.w13_input_scale_quant, + a2_gscale=layer.w2_input_scale_quant, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + ) + def apply( self, layer: torch.nn.Module, @@ -359,8 +370,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError("EPLB not supported for " "`CompressedTensorsW4A4MoeMethod` yet.") @@ -381,7 +390,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): indices_type=self.topk_indices_dtype, ) + # + # Note: the order here is important. self.fused_experts can override + # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin. + # if self.use_marlin: + assert self.fused_experts is None return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -401,8 +415,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): expert_map=expert_map, workspace=layer.workspace) - # FlashInfer fused experts path - if self.fused_experts is not None: + elif self.fused_experts is not None: assert is_valid_flashinfer_cutlass_fused_moe( x, layer.w13_weight, layer.w2_weight), ( "Flashinfer CUTLASS Fused MoE not applicable!") @@ -417,11 +430,10 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) + # FlashInfer fused experts path elif self.allow_flashinfer: from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 flashinfer_cutlass_moe_fp4) @@ -430,51 +442,46 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): x, layer.w13_weight, layer.w2_weight), ( "Flashinfer CUTLASS Fused MoE not applicable!") + assert self.moe_quant_config is not None + return flashinfer_cutlass_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, + quant_config=self.moe_quant_config, inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - g1_alphas=layer.g1_alphas, - g2_alphas=layer.g2_alphas, - a1_gscale=layer.w13_input_scale_quant, - a2_gscale=layer.w2_input_scale_quant, apply_router_weight_on_input=apply_router_weight_on_input, ) + else: + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_fp4) - assert expert_map is None, ("Expert Parallelism / expert_map " - "is currently not supported for " - "CompressedTensorsW4A4MoeMethod.") - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp4) + assert expert_map is None, ("Expert Parallelism / expert_map " + "is currently not supported for " + "CompressedTensorsW4A4MoeMethod.") + assert self.moe_quant_config is not None - # Cutlass moe takes in activations in BF16/Half precision - # and fp4 quantized weights loaded from the checkpoint - return cutlass_moe_fp4( - a=x, - w1_fp4=layer.w13_weight, - w2_fp4=layer.w2_weight, - 
w1_blockscale=layer.w13_weight_scale, - w2_blockscale=layer.w2_weight_scale, - g1_alphas=layer.g1_alphas, - g2_alphas=layer.g2_alphas, - a1_gscale=layer.w13_input_scale_quant, - a2_gscale=layer.w2_input_scale_quant, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=x.shape[0], - n=layer.w2_weight.shape[2] * 2, - k=x.shape[1], - e=layer.w13_weight.shape[0], - apply_router_weight_on_input=apply_router_weight_on_input).to( - x.dtype) + # Cutlass moe takes in activations in BF16/Half precision + # and fp4 quantized weights loaded from the checkpoint + return cutlass_moe_fp4( + a=x, + w1_fp4=layer.w13_weight, + w2_fp4=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + quant_config=self.moe_quant_config, + apply_router_weight_on_input=apply_router_weight_on_input, + # TODO(bnell): derive these from arguments + m=x.shape[0], + n=layer.w2_weight.shape[2] * 2, + k=x.shape[1], + e=layer.w13_weight.shape[0], + ).to(x.dtype) class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): @@ -692,16 +699,11 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) - self.rocm_aiter_fused_experts_func = rocm_aiter_fused_experts elif self.use_marlin: prepare_moe_fp8_layer_for_marlin(layer, False) # Activations not quantized for marlin. del layer.w13_input_scale del layer.w2_input_scale - self.fused_experts_func = None - else: - from vllm.model_executor.layers.fused_moe import fused_experts - self.fused_experts_func = fused_experts if self.use_cutlass: device = layer.w13_weight.device @@ -722,11 +724,20 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): device=device, dtype=torch.int64) + def maybe_make_prepare_finalize( + self) -> Optional[mk.FusedMoEPrepareAndFinalize]: + if self.use_marlin or self.rocm_aiter_moe_enabled: + return None + else: + return super().maybe_make_prepare_finalize() + def select_gemm_impl( - self, prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, - layer: torch.nn.Module) -> FusedMoEPermuteExpertsUnpermute: + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> FusedMoEPermuteExpertsUnpermute: # cutlass path + assert self.moe_quant_config is not None if self.use_cutlass: from vllm.model_executor.layers.fused_moe import ( CutlassBatchedExpertsFp8, CutlassExpertsFp8) @@ -740,26 +751,24 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__) experts = CutlassBatchedExpertsFp8( - moe.num_local_experts, + self.moe.num_local_experts, num_dispatchers, - moe.in_dtype, - self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + self.moe.in_dtype, ab_strides1=self.ab_strides1_c_strides2, ab_strides2=self.ab_strides2, c_strides1=self.c_strides1, c_strides2=self.ab_strides1_c_strides2, + quant_config=self.moe_quant_config, ) else: logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) experts = CutlassExpertsFp8( - moe.in_dtype, - self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + self.moe.in_dtype, ab_strides1=self.ab_strides1_c_strides2, ab_strides2=self.ab_strides2, c_strides1=self.c_strides1, c_strides2=self.ab_strides1_c_strides2, + quant_config=self.moe_quant_config, ) self.disable_expert_map = (num_dispatchers > 1 @@ -774,29 +783,40 @@ class 
CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): assert not self.rocm_aiter_moe_enabled and not self.use_marlin - logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__) - if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank( ) assert max_num_tokens_per_rank is not None + logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__) return BatchedTritonExperts( max_num_tokens=max_num_tokens_per_rank, num_dispatchers=prepare_finalize.num_dispatchers(), - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - per_act_token_quant=( - self.input_quant.strategy == QuantizationStrategy.TOKEN), + quant_config=self.moe_quant_config, ) else: - return TritonExperts( - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - per_act_token_quant=( - self.input_quant.strategy == QuantizationStrategy.TOKEN), - ) + logger.debug("TritonExperts(%s)", self.__class__.__name__) + return TritonExperts(self.moe_quant_config) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if self.use_marlin: + return None + + per_act_token = ( + self.input_quant.strategy == QuantizationStrategy.TOKEN) + per_channel_quant = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL) + + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_channel_quant, + ) def apply( self, @@ -841,92 +861,19 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): indices_type=self.topk_indices_dtype, ) - # cutlass path - if self.use_cutlass: - per_act_token = ( - self.input_quant.strategy == QuantizationStrategy.TOKEN) - per_channel_quant = ( - self.weight_quant.strategy == QuantizationStrategy.CHANNEL) + per_act_token = ( + self.input_quant.strategy == QuantizationStrategy.TOKEN) + per_channel_quant = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL) - # small-batch fallback on SM100 - if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: - from vllm.model_executor.layers.fused_moe import fused_experts - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=per_channel_quant, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale) - - if self.fused_experts is None: - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp8) - return cutlass_moe_fp8( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - per_act_token=per_act_token, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - ab_strides1=self.ab_strides1_c_strides2, - ab_strides2=self.ab_strides2, - c_strides1=self.c_strides1, - c_strides2=self.ab_strides1_c_strides2, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - ) - else: - return self.fused_experts( - x, - 
layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - ) - - if self.rocm_aiter_moe_enabled: - return self.rocm_aiter_fused_experts_func( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=self.weight_quant.strategy == - QuantizationStrategy.CHANNEL, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - expert_map=expert_map) + # + # Note: the order here is important. self.fused_experts can override + # cutlass fp8 or fused_experts but not marlin or rocm. + # if self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") + assert self.fused_experts is None return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -944,26 +891,95 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_map=expert_map, workspace=layer.workspace) - assert self.fused_experts_func is not None + elif self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa E501 + rocm_aiter_fused_experts) + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + assert self.fused_experts is None + return rocm_aiter_fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) - return self.fused_experts_func( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=self.weight_quant.strategy == - QuantizationStrategy.CHANNEL, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale) + elif self.fused_experts is not None: + return self.fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + ) + + # cutlass path + elif self.use_cutlass: + assert self.moe_quant_config is not None + + # small-batch fallback on SM100 + if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: + from vllm.model_executor.layers.fused_moe import fused_experts + assert per_act_token == per_channel_quant + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + quant_config=self.moe_quant_config, + ) + else: + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + 
cutlass_moe_fp8) + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + return cutlass_moe_fp8( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + quant_config=self.moe_quant_config, + activation=activation, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, + ) + + else: + from vllm.model_executor.layers.fused_moe import fused_experts + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): @@ -1049,6 +1065,16 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: pass + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return int8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=True, + ) + def apply( self, layer: torch.nn.Module, @@ -1104,14 +1130,10 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): inplace=True, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, - use_int8_w8a8=True, - per_channel_quant=True, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale) + quant_config=self.moe_quant_config, + ) class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): @@ -1355,6 +1377,10 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): layer.workspace = marlin_make_workspace_new(device, 4) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, @@ -1588,6 +1614,20 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): layer.w2_weight_scale.transpose(1, 2).contiguous(), requires_grad=False) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + assert self.num_bits == 4 or self.num_bits == 8 + config_builder = (int4_w4a16_moe_quant_config if self.num_bits == 4 + else int8_w8a16_moe_quant_config) + + return config_builder( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + w1_zp=None, + w2_zp=None, + block_shape=[0, self.group_size], + ) + def apply( self, layer: torch.nn.Module, @@ -1641,13 +1681,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): topk_ids=topk_ids, inplace=True, activation=activation, - use_int4_w4a16=self.num_bits == 4, - use_int8_w8a16=self.num_bits == 8, - global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - 
w2_scale=layer.w2_weight_scale, - w1_zp=None, - w2_zp=None, - block_shape=[0, self.group_size]) + quant_config=self.moe_quant_config, + ) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index b361fe9bea088..8555e9ff20346 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -8,6 +8,8 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, int8_w8a16_moe_quant_config) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -106,6 +108,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): requires_grad=False) layer.register_parameter("w2_scale", w2_scale) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return int8_w8a16_moe_quant_config(w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale, + w1_zp=None, + w2_zp=None) + def apply( self, layer: torch.nn.Module, @@ -159,12 +168,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): topk_ids=topk_ids, inplace=True, activation=activation, - use_int8_w8a16=True, - global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_scale, - w2_scale=layer.w2_scale) + quant_config=self.moe_quant_config, + ) @staticmethod def quantizing_weight_loader(layer, weight_loader): diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 254cc2be05ee6..e75094c54743c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -14,9 +14,11 @@ from vllm import _custom_ops as ops from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, + FusedMoE, FusedMoEActivationFormat, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, fp8_w8a8_moe_quant_config) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -575,20 +577,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): "CutlassBlockScaledGroupedGemm not supported on the current " "platform.") - def maybe_make_prepare_finalize( - self, - moe: FusedMoEConfig, - ) -> Optional[mk.FusedMoEPrepareAndFinalize]: - if self.flashinfer_moe_backend != FlashinferMoeBackend.CUTLASS: - return super().maybe_make_prepare_finalize(moe) - - prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( - moe, - layer=self.layer, - ) - logger.debug_once("%s", prepare_finalize.__class__.__name__) - return prepare_finalize - def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -928,10 +916,23 @@ class Fp8MoEMethod(FusedMoEMethodBase): 
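
The pattern repeated across these quantization methods is the heart of the refactor: the loose per-call kwargs (use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale, block_shape, ...) collapse into a single FusedMoEQuantConfig that each method builds once in get_fused_moe_quant_config() and then forwards to kernels as quant_config. A minimal, self-contained sketch of that shape; QuantConfigSketch and MoEMethodSketch are illustrative stand-ins, not the real vLLM classes:

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class QuantConfigSketch:
    # Stand-in for FusedMoEQuantConfig: one object carries the whole recipe.
    use_fp8_w8a8: bool = False
    per_act_token_quant: bool = False
    block_shape: Optional[list] = None
    w1_scale: Any = None
    w2_scale: Any = None

class MoEMethodSketch:
    # Stand-in for a FusedMoEMethodBase subclass.
    def get_fused_moe_quant_config(self, layer) -> Optional[QuantConfigSketch]:
        # The one place that knows how this checkpoint was quantized.
        return QuantConfigSketch(
            use_fp8_w8a8=True,
            w1_scale=getattr(layer, "w13_weight_scale", None),
            w2_scale=getattr(layer, "w2_weight_scale", None),
        )

    def apply(self, layer, kernel):
        # Kernels now receive the config object instead of a dozen kwargs.
        return kernel(quant_config=self.get_fused_moe_quant_config(layer))
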
layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor( layer.w2_weight_scale_inv) + def maybe_make_prepare_finalize( + self) -> Optional[mk.FusedMoEPrepareAndFinalize]: + if (self.rocm_aiter_moe_enabled or self.use_marlin + or self.flashinfer_moe_backend + == FlashinferMoeBackend.TENSORRT_LLM): + return None + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + prepare_finalize = ( + build_flashinfer_fp8_cutlass_moe_prepare_finalize(self.moe)) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize + else: + return super().maybe_make_prepare_finalize() + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> FusedMoEPermuteExpertsUnpermute: from vllm.model_executor.layers.fused_moe import ( @@ -940,6 +941,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet.") + assert self.moe_quant_config is not None + if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): max_num_tokens_per_rank = ( @@ -953,15 +956,13 @@ class Fp8MoEMethod(FusedMoEMethodBase): return BatchedTritonOrDeepGemmExperts( max_num_tokens=max_num_tokens_per_rank, num_dispatchers=prepare_finalize.num_dispatchers(), - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - per_act_token_quant=False, + quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: experts = select_cutlass_fp8_gemm_impl( - moe, - self.layer, + self.moe, + self.moe_quant_config, ) logger.debug_once("Using %s", experts.__class__.__name__) return experts @@ -971,11 +972,25 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.__class__.__name__, self.quant_config.weight_block_size, False) return TritonOrDeepGemmExperts( - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, + quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, ) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if self.use_marlin: + return None + + return fp8_w8a8_moe_quant_config( + w1_scale=(layer.w13_weight_scale_inv + if self.block_quant else layer.w13_weight_scale), + w2_scale=(layer.w2_weight_scale_inv + if self.block_quant else layer.w2_weight_scale), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + block_shape=self.quant_config.weight_block_size, + ) + def apply( self, layer: torch.nn.Module, @@ -1005,12 +1020,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): assert logical_replica_count is not None assert isinstance(layer, FusedMoE) - if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + if (self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + and self.fused_experts is None): assert activation == 'silu', ( f"Expected 'silu' activation but got {activation}") assert scoring_func == 'sigmoid', ( f"Expected 'sigmoid' scoring func but got {scoring_func}") if self.block_quant: + import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 assert (renormalize and use_grouped_topk and custom_routing_function is None) @@ -1066,9 +1083,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): logical_replica_count=logical_replica_count, ) + # + # Note: the order of checks is important since self.fused_experts + # can override fused_experts or cutlass but not rocm or marlin. 
+ # if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_fused_experts) + assert self.fused_experts is None return rocm_aiter_fused_experts( x, layer.w13_weight, @@ -1076,19 +1098,13 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, activation=activation, - use_fp8_w8a8=True, apply_router_weight_on_input=apply_router_weight_on_input, - w1_scale=(layer.w13_weight_scale_inv - if self.block_quant else layer.w13_weight_scale), - w2_scale=(layer.w2_weight_scale_inv - if self.block_quant else layer.w2_weight_scale), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, - expert_map=expert_map) + expert_map=expert_map, + quant_config=self.moe_quant_config) elif self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") + assert self.fused_experts is None return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -1105,40 +1121,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): global_num_experts=global_num_experts, expert_map=expert_map, workspace=layer.workspace) - elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert self.block_quant is None - assert (not renormalize and custom_routing_function is not None) - assert activation == 'silu', ( - f"Expected 'silu' activation but got {activation}") - assert scoring_func == 'sigmoid', ( - f"Expected 'sigmoid' scoring func but got {scoring_func}") - if self.fused_experts is not None: - return self.fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - else: - return flashinfer_cutlass_moe_fp8( - x, - layer, - topk_weights, - topk_ids, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - else: - common_kwargs = dict( + elif self.fused_experts: + return self.fused_experts( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -1149,26 +1133,43 @@ class Fp8MoEMethod(FusedMoEMethodBase): global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, expert_map=expert_map, - w1_scale=(layer.w13_weight_scale_inv - if self.block_quant else layer.w13_weight_scale), - w2_scale=(layer.w2_weight_scale_inv - if self.block_quant else layer.w2_weight_scale), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, ) + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + assert self.block_quant is None + assert (not renormalize and custom_routing_function is not None) + assert activation == 'silu', ( + f"Expected 'silu' activation but got {activation}") + assert scoring_func == 'sigmoid', ( + f"Expected 'sigmoid' scoring func but got {scoring_func}") - if self.fused_experts is not None: - return self.fused_experts(**common_kwargs) - else: - from vllm.model_executor.layers.fused_moe import fused_experts - return fused_experts( - **common_kwargs, - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - allow_cutlass_block_scaled_grouped_gemm=( - self.allow_cutlass_block_scaled_grouped_gemm), - ) + return flashinfer_cutlass_moe_fp8( + x, + layer, + topk_weights, + topk_ids, + inplace=False, + 
activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + allow_deep_gemm=self.allow_deep_gemm, + allow_cutlass_block_scaled_grouped_gemm=( + self.allow_cutlass_block_scaled_grouped_gemm)) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 01af1ccd9ae06..a631dfdab6544 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -10,8 +10,9 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, - FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -518,6 +519,10 @@ class GGUFMoEMethod(FusedMoEMethodBase): set_weight_attrs(w2_qweight_type, extra_weight_attrs) layer.register_parameter("w2_qweight_type", w2_qweight_type) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 76de3a59c8ca1..e06b974255f01 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -9,8 +9,10 @@ import torch import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) @@ -632,6 +634,10 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): if hasattr(layer, "w2_bias") and layer.w2_bias is not None: layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 5f9d4814274c8..c83b0b47a4b7e 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -11,6 +11,7 @@ from torch.nn.parameter import Parameter from vllm._ipex_ops import ipex_ops as ops from vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig 
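
Note the convention in the methods above: backends with fully bespoke kernels (GGUF, GPTQ-Marlin, AWQ-Marlin, and the marlin and TRT-LLM branches elsewhere) return None from get_fused_moe_quant_config, meaning no generic quant config exists for them and the modular path must stay out of the way. A hedged sketch of how a caller might honor that convention (the helper name is hypothetical):

def resolve_quant_config(method, layer):
    # Hypothetical helper: a None config signals that the method drives a
    # dedicated kernel (e.g. Marlin or GGUF), so the generic fused_experts
    # path receives no config for it.
    cfg = method.get_fused_moe_quant_config(layer)
    if cfg is None:
        return None  # caller falls back to the method's own apply() branch
    return cfg
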
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -375,6 +376,10 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): use_prepack=True, ) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return None + def apply( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 60a79e53e8141..7eac40825ac33 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -11,7 +11,9 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, + nvfp4_moe_quant_config) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.fused_moe.layer import ( @@ -294,8 +296,6 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): cutlass_fp8_supported) self.cutlass_fp8_supported = cutlass_fp8_supported() self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None - self.fused_experts: Optional[ - mk.FusedMoEModularKernel] = None # type: ignore if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe(): self.flashinfer_moe_backend = get_flashinfer_moe_backend() logger.info_once( @@ -303,29 +303,27 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) def maybe_make_prepare_finalize( - self, - moe: FusedMoEConfig, - ) -> Optional[mk.FusedMoEPrepareAndFinalize]: - if self.fused_experts is not None or \ - self.flashinfer_moe_backend != FlashinferMoeBackend.CUTLASS: - return super().maybe_make_prepare_finalize(moe) - - prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( - moe, - layer=self.layer, - ) - logger.debug_once("%s", prepare_finalize.__class__.__name__) - return prepare_finalize + self, ) -> Optional[mk.FusedMoEPrepareAndFinalize]: + # TRT LLM not supported with all2all yet. 
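
maybe_make_prepare_finalize loses its FusedMoEConfig parameter in this patch: methods read self.moe instead, and returning None now opts a backend (Marlin, ROCm AITER, TRT-LLM) out of the modular all2all pipeline entirely. A sketch of that control flow under stated assumptions; the backend strings and tuple payloads are illustrative, not the vLLM objects:

class PrepareFinalizeSketch:
    def __init__(self, moe, use_marlin=False, backend=None):
        self.moe = moe
        self.use_marlin = use_marlin
        self.backend = backend

    def maybe_make_prepare_finalize(self):
        if self.use_marlin or self.backend == "trtllm":
            # No prepare/finalize object: stay on the dedicated kernel path.
            return None
        if self.backend == "flashinfer_cutlass":
            # Built from self.moe; the old explicit `moe` argument is gone.
            return ("flashinfer_prepare_finalize", self.moe)
        return ("default_prepare_finalize", self.moe)
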
+ if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + return None + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + prepare_finalize = ( + build_flashinfer_fp8_cutlass_moe_prepare_finalize(self.moe)) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize + else: + return super().maybe_make_prepare_finalize() def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None experts = select_cutlass_fp8_gemm_impl( - moe, - self.layer, + self.moe, + self.moe_quant_config, ) logger.debug_once("Using %s", experts.__class__.__name__) return experts @@ -479,6 +477,19 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + return None + + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=False, + ) + def apply( self, layer: torch.nn.Module, @@ -507,6 +518,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): "EPLB not supported for `ModelOptFp8MoEMethod` yet.") if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + assert self.fused_experts is None assert activation == 'silu', ( f"Expected 'silu' activation but got {activation}") assert not renormalize @@ -537,55 +549,56 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + # + # Note: the order here is important. self.fused_experts can override + # cutlass or fused_experts. 
+ # + if self.fused_experts is not None: + return self.fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + inplace=False, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: assert not renormalize assert activation == 'silu', ( f"Expected 'silu' activation but got {activation}") - if self.fused_experts is not None: - return self.fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - else: - return flashinfer_cutlass_moe_fp8( - x, - layer, - topk_weights, - topk_ids, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts) - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - use_fp8_w8a8=True, - per_channel_quant=False, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - apply_router_weight_on_input=apply_router_weight_on_input, - ) + return flashinfer_cutlass_moe_fp8( + x, + layer, + topk_weights, + topk_ids, + inplace=False, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts) + assert self.moe_quant_config is not None + + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + quant_config=self.moe_quant_config, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) class ModelOptNvFp4Config(QuantizationConfig): @@ -1034,33 +1047,30 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): " for ModelOptNvFp4FusedMoE.") def maybe_make_prepare_finalize( - self, - moe: FusedMoEConfig, - ) -> Optional[mk.FusedMoEPrepareAndFinalize]: - if (self.allow_flashinfer and self.flashinfer_moe_backend - == FlashinferMoeBackend.CUTLASS): + self) -> Optional[mk.FusedMoEPrepareAndFinalize]: + if (self.use_marlin + or (self.allow_flashinfer and self.flashinfer_moe_backend + == FlashinferMoeBackend.TENSORRT_LLM)): + return None + elif (self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS): + # For now, fp4 moe only works with the flashinfer dispatcher. 
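
select_gemm_impl follows the same signature diet: the moe parameter disappears, and every implementation asserts self.moe_quant_config is not None before constructing its experts from self.moe plus the config. A compact sketch with stand-in classes (not the vLLM kernel types):

class ExpertsSketch:
    # Stand-in for a FusedMoEPermuteExpertsUnpermute implementation.
    def __init__(self, moe, quant_config):
        self.moe = moe
        self.quant_config = quant_config

class GemmSelectSketch:
    def __init__(self, moe, quant_config):
        self.moe = moe
        self.moe_quant_config = quant_config

    def select_gemm_impl(self, prepare_finalize, layer):
        # The config must be materialized before kernel selection.
        assert self.moe_quant_config is not None
        return ExpertsSketch(self.moe, self.moe_quant_config)
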
prepare_finalize = ( - build_flashinfer_fp4_cutlass_moe_prepare_finalize( - moe, - a1_gscale=self.layer.w13_input_scale_quant, - )) + build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize - - return super().maybe_make_prepare_finalize(moe) + else: + return super().maybe_make_prepare_finalize() def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None experts = select_nvfp4_gemm_impl( - moe, - g1_alphas=self.layer.g1_alphas, - g2_alphas=self.layer.g2_alphas, - a1_gscale=self.layer.w13_input_scale_quant, - a2_gscale=self.layer.w2_input_scale_quant, + self.moe, + self.moe_quant_config, allow_flashinfer=self.allow_flashinfer, ) logger.debug_once("Using %s", experts.__class__.__name__) @@ -1360,6 +1370,21 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + if (self.use_marlin or self.flashinfer_moe_backend + == FlashinferMoeBackend.TENSORRT_LLM): + return None + + return nvfp4_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + g1_alphas=layer.g1_alphas, + g2_alphas=layer.g2_alphas, + a1_gscale=layer.w13_input_scale_quant, + a2_gscale=layer.w2_input_scale_quant, + ) + def apply( self, layer: torch.nn.Module, @@ -1388,12 +1413,14 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.") assert activation == "silu", "Only SiLU activation is supported." - if self.allow_flashinfer and \ - self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + if (self.allow_flashinfer and self.flashinfer_moe_backend + == FlashinferMoeBackend.TENSORRT_LLM): import flashinfer from vllm.model_executor.models.llama4 import Llama4MoE + assert self.fused_experts is None + a1_gscale = layer.w13_input_scale_quant (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( @@ -1457,7 +1484,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) + # + # Note: the order here is important. self.fused_experts can override + # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin or + # trtllm. 
+ # if self.use_marlin: + assert self.fused_experts is None return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight, @@ -1477,7 +1510,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, workspace=layer.workspace) - if self.fused_experts is not None: + elif self.fused_experts is not None: assert self.allow_flashinfer and \ self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS @@ -1485,7 +1518,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): x, layer.w13_weight, layer.w2_weight), ( "Flashinfer CUTLASS Fused MoE not applicable!") - out = self.fused_experts( + return self.fused_experts( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -1495,28 +1528,22 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) elif (self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS): from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 flashinfer_cutlass_moe_fp4) + assert self.moe_quant_config is not None - out = flashinfer_cutlass_moe_fp4( + return flashinfer_cutlass_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - g1_alphas=layer.g1_alphas, - g2_alphas=layer.g2_alphas, - a1_gscale=layer.w13_input_scale_quant, - a2_gscale=layer.w2_input_scale_quant, - inplace=False, # TODO(shuw): fix later, now output is high prec + quant_config=self.moe_quant_config, + inplace=False, activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, @@ -1527,23 +1554,19 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): # only (no EP). 
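
The ordering notes repeated in these apply() bodies all encode one rule: a modular-kernel override (self.fused_experts) may replace the flashinfer-cutlass, cutlass fp4/fp8 and generic fused_experts branches, but never Marlin, ROCm AITER or TRT-LLM, which is why exactly those branches assert self.fused_experts is None. A condensed sketch of that precedence; the branch payloads are illustrative:

def moe_dispatch(use_marlin, fused_experts_override, backend_path, fallback):
    # Branches an override must never shadow come first and assert it unset.
    if use_marlin:
        assert fused_experts_override is None
        return "marlin"
    elif fused_experts_override is not None:
        return fused_experts_override()  # modular kernel wins from here on
    elif backend_path is not None:
        return backend_path()            # e.g. flashinfer cutlass
    else:
        return fallback()                # plain fused_experts
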
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4) - out = cutlass_moe_fp4( + assert self.moe_quant_config is not None + return cutlass_moe_fp4( a=x, w1_fp4=layer.w13_weight, w2_fp4=layer.w2_weight, - w1_blockscale=layer.w13_weight_scale, - w2_blockscale=layer.w2_weight_scale, - g1_alphas=layer.g1_alphas, - g2_alphas=layer.g2_alphas, - a1_gscale=layer.w13_input_scale_quant, - a2_gscale=layer.w2_input_scale_quant, topk_weights=topk_weights, topk_ids=topk_ids, + quant_config=self.moe_quant_config, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + # TODO: derive from arguments m=x.shape[0], n=layer.w2_weight.shape[2] * 2, k=x.shape[1], e=layer.w13_weight.shape[0], - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input) - - return out + ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index c25b3dd6080dc..145b614237fb3 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -6,6 +6,9 @@ from typing import Any, Callable, Optional, Union import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, int4_w4a16_moe_quant_config, + int8_w8a16_moe_quant_config) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, @@ -283,6 +286,22 @@ class MoeWNA16Method(FusedMoEMethodBase): layer.register_parameter(key, param) set_weight_attrs(param, extra_weight_attrs) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + weight_bits = self.quant_config.weight_bits + has_zp = self.quant_config.has_zp + assert weight_bits == 4 or weight_bits == 8 + config_builder = (int4_w4a16_moe_quant_config + if weight_bits == 4 else int8_w8a16_moe_quant_config) + + return config_builder( + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + w1_zp=layer.w13_qzeros if has_zp else None, + w2_zp=layer.w2_qzeros if has_zp else None, + block_shape=[0, layer.group_size], + ) + def apply( self, layer: torch.nn.Module, @@ -327,9 +346,6 @@ class MoeWNA16Method(FusedMoEMethodBase): e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) - weight_bits = self.quant_config.weight_bits - has_zp = self.quant_config.has_zp - return fused_experts( x, layer.w13_qweight, @@ -337,16 +353,11 @@ class MoeWNA16Method(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, - global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_scales, - w2_scale=layer.w2_scales, - w1_zp=layer.w13_qzeros if has_zp else None, - w2_zp=layer.w2_qzeros if has_zp else None, - block_shape=[0, layer.group_size]) + quant_config=self.moe_quant_config, + ) @staticmethod def get_weight_loader(layer, weight_loader): diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index f935bdd84124a..28c1e60ccd08a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -12,6 +12,8 @@ from 
vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.fused_moe import modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) @@ -629,10 +631,29 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return tile_tokens_dim + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + + if self.mxfp4_backend == Mxfp4Backend.MARLIN: + return None + + if self.mxfp4_backend == Mxfp4Backend.TRITON: + w1_scale = layer.w13_precision_config + w2_scale = layer.w2_precision_config + else: + w1_scale = layer.w13_weight_scale + w2_scale = layer.w2_weight_scale + + return mxfp4_w4a4_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, layer: torch.nn.Module, ) -> mk.FusedMoEPermuteExpertsUnpermute: if (prepare_finalize.activation_format == @@ -647,11 +668,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): "gemm1_alpha": layer.gemm1_alpha, "gemm1_beta": layer.gemm1_beta, "gemm1_clamp_limit": layer.gemm1_clamp_limit, - "w13_bias": layer.w13_bias, - "w2_bias": layer.w2_bias, + # TODO(bnell): part of quant_config "max_capture_size": self.max_capture_size, } - return TrtLlmGenExperts(moe, **kwargs) + assert self.moe_quant_config is not None + return TrtLlmGenExperts(self.moe, self.moe_quant_config, + **kwargs) else: # Use matmul_ogs from triton_kernels here! 
raise NotImplementedError( @@ -710,8 +732,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -941,10 +961,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): renormalize=renormalize, global_num_experts=global_num_experts, expert_map=expert_map, - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_precision=self.w13_precision_config, - w2_precision=self.w2_precision_config, + quant_config=self.moe_quant_config, apply_router_weight_on_input=apply_router_weight_on_input, ) else: diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index bc8ae980429a3..d2d990e46bcf3 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -11,6 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, + mxfp4_w4a4_moe_quant_config) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( is_rocm_aiter_moe_enabled) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( @@ -287,6 +290,16 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): from vllm.model_executor.layers.fused_moe import fused_experts self.fused_experts_func = fused_experts + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=self.weight_qscheme == "per_channel", + ) + def apply( self, layer: torch.nn.Module, @@ -339,12 +352,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): topk_ids=topk_ids, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=self.weight_qscheme == "per_channel", - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, + quant_config=self.moe_quant_config, expert_map=expert_map) if self.use_marlin: assert activation == "silu", ( @@ -376,14 +384,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): inplace=True, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=self.weight_qscheme == "per_channel", global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale) + quant_config=self.moe_quant_config) class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): @@ -487,6 +490,16 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): layer.register_parameter("w13_weight_scale", w13_weight_scale) layer.register_parameter("w2_weight_scale", w2_weight_scale) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + return mxfp4_w4a4_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=None, + a2_scale=None, + block_shape=None, + ) + def apply( self, layer: 
torch.nn.Module, @@ -539,15 +552,10 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - use_mxfp4_w4a4=True, + activation=activation, global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, expert_map=expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=None, - a2_scale=None, - block_shape=None, - activation=activation, + quant_config=self.moe_quant_config, ) return out diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 0d5fa05652b80..ed90e2e26460e 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -12,6 +12,9 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, int4_w4a16_moe_quant_config, + int8_w8a16_moe_quant_config) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -269,6 +272,21 @@ class RTNMoEMethod(FusedMoEMethodBase): fix_weights(layer, "w13_weight", weight_bits == 4) fix_weights(layer, "w2_weight", weight_bits == 4) + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]: + weight_bits = self.quant_config.weight_bits + group_size = self.quant_config.group_size + assert weight_bits == 4 or weight_bits == 8 + config_builder = (int4_w4a16_moe_quant_config + if weight_bits == 4 else int8_w8a16_moe_quant_config) + return config_builder( + w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale, + w1_zp=None, + w2_zp=None, + block_shape=[0, group_size], + ) + def apply( self, layer: torch.nn.Module, @@ -314,10 +332,7 @@ class RTNMoEMethod(FusedMoEMethodBase): e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype) - weight_bits = self.quant_config.weight_bits - group_size = self.quant_config.group_size - - ret = fused_experts( + return fused_experts( x, layer.w13_weight, layer.w2_weight, @@ -325,16 +340,11 @@ class RTNMoEMethod(FusedMoEMethodBase): topk_ids=topk_ids, inplace=True, activation=activation, - use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, - global_num_experts=global_num_experts, - w1_scale=layer.w13_scale, - w2_scale=layer.w2_scale, apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, expert_map=expert_map, - block_shape=[0, group_size]) - - return ret + quant_config=self.moe_quant_config, + ) def rtn_quantize(tensor: torch.Tensor, num_bits: int, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index f5d7c57fe2a87..fabf855b36e68 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -7,7 +7,8 @@ import torch import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts) 
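The hunks above all follow one refactor: loose per-call quantization kwargs (w1_scale, w2_scale, a1_scale, a2_scale, use_fp8_w8a8, per_channel_quant, and friends) are folded into a single FusedMoEQuantConfig that each quant method builds once in get_fused_moe_quant_config() and threads through as quant_config. A minimal sketch of the resulting call shape, using the config builders named in the diffs; the tensor attributes and values here are illustrative, not tied to any one method:

    # Hypothetical quant method, mirroring QuarkW8A8Fp8MoEMethod above.
    def get_fused_moe_quant_config(self, layer):
        return fp8_w8a8_moe_quant_config(
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            per_act_token_quant=False,  # assumed per-tensor for this sketch
        )

    # Callers then pass one object instead of six loose kwargs:
    out = fused_experts(x, layer.w13_weight, layer.w2_weight,
                        topk_weights=topk_weights, topk_ids=topk_ids,
                        inplace=True, quant_config=self.moe_quant_config)
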
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 @@ -47,32 +48,23 @@ def reorder_w1w3_to_w3w1(weight: torch.Tensor, def build_flashinfer_fp4_cutlass_moe_prepare_finalize( - moe: FusedMoEConfig, - a1_gscale: torch.Tensor, -) -> mk.FusedMoEPrepareAndFinalize: + moe: FusedMoEConfig) -> mk.FusedMoEPrepareAndFinalize: """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" use_dp = moe.moe_parallel_config.dp_size > 1 - return FlashInferCutlassMoEPrepareAndFinalize(use_dp, a1_gscale=a1_gscale) + return FlashInferCutlassMoEPrepareAndFinalize(use_dp) def select_nvfp4_gemm_impl( moe: FusedMoEConfig, - g1_alphas: torch.Tensor, - g2_alphas: torch.Tensor, - a1_gscale: torch.Tensor, - a2_gscale: torch.Tensor, + moe_quant_config: FusedMoEQuantConfig, allow_flashinfer: bool, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" if allow_flashinfer: return FlashInferExperts( - g1_alphas=g1_alphas, - g2_alphas=g2_alphas, - a1_gscale=a1_gscale, - a2_gscale=a2_gscale, out_dtype=moe.in_dtype, - quant_dtype="nvfp4", + quant_config=moe_quant_config, ep_rank=moe.moe_parallel_config.ep_rank, ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 9889808f0760f..aa66a42c588a7 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -8,7 +8,8 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import envs from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 @@ -99,6 +100,8 @@ def apply_flashinfer_per_tensor_scale_fp8( apply_router_weight_on_input: bool, ) -> torch.Tensor: from flashinfer.fused_moe import RoutingMethodType + + import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 assert layer.output1_scales_scalar is not None, ( "Expected output1_scales_scalar to be initialized") assert layer.output1_scales_scalar is not None, ( @@ -167,34 +170,23 @@ def register_moe_scaling_factors(layer: torch.nn.Module) -> None: def build_flashinfer_fp8_cutlass_moe_prepare_finalize( - moe: Optional[FusedMoEConfig], - layer: torch.nn.Module, -) -> mk.FusedMoEPrepareAndFinalize: + moe: Optional[FusedMoEConfig], ) -> mk.FusedMoEPrepareAndFinalize: """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False - return FlashInferCutlassMoEPrepareAndFinalize( - use_dp, a1_gscale=layer.w13_input_scale) + return FlashInferCutlassMoEPrepareAndFinalize(use_dp) def select_cutlass_fp8_gemm_impl( moe: Optional[FusedMoEConfig], - layer: torch.nn.Module, + quant_config: FusedMoEQuantConfig, out_dtype: Optional[torch.dtype] = None, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for fused-MoE layers""" - from vllm.model_executor.models.llama4 import Llama4MoE - assert layer.custom_routing_function == 
Llama4MoE.custom_routing_function, \ - "FusedMoE flashinfer kernels are only supported for Llama4" - if moe is not None: return FlashInferExperts( - g1_alphas=layer.output1_scales_gate_scalar, - g2_alphas=layer.output2_scales_scalar, - a1_gscale=layer.w13_input_scale, - a2_gscale=layer.w2_input_scale_inv, out_dtype=moe.in_dtype, - quant_dtype=torch.float8_e4m3fn, + quant_config=quant_config, ep_rank=moe.moe_parallel_config.ep_rank, ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, @@ -204,12 +196,8 @@ def select_cutlass_fp8_gemm_impl( assert out_dtype is not None, ( "If moe config is None, out_dtype must be passed") return FlashInferExperts( - g1_alphas=layer.output1_scales_gate_scalar, - g2_alphas=layer.output2_scales_scalar, - a1_gscale=layer.w13_input_scale, - a2_gscale=layer.w2_input_scale_inv, out_dtype=out_dtype, - quant_dtype=torch.float8_e4m3fn, + quant_config=quant_config, ) @@ -224,11 +212,13 @@ def flashinfer_cutlass_moe_fp8( expert_map: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, ) -> torch.Tensor: + quant_config = layer.quant_method.get_fused_moe_quant_config(layer) + assert quant_config is not None + fused_experts = mk.FusedMoEModularKernel( - build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe=None, - layer=layer), + build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe=None), select_cutlass_fp8_gemm_impl(moe=None, - layer=layer, + quant_config=quant_config, out_dtype=hidden_states.dtype)) return fused_experts( diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index e3e9635132d68..bbe0c6f6d38ec 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -411,6 +411,7 @@ def per_token_group_quant_fp8( x_s = torch.empty(shape, device=x.device, dtype=torch.float32) # prefer CUDA kernel if available + # TODO(bnell): this causes some fp8 moe test to fail. 
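For reference, the semantics that per_token_group_quant_fp8 implements (whether via the CUDA op below or a fallback) can be sketched in plain PyTorch. This is a simplification: column-major scale layout and the ue8m0 mode are elided.

    import torch

    def per_token_group_quant_fp8_ref(x: torch.Tensor, group_size: int,
                                      eps: float = 1e-10):
        # One fp32 scale per contiguous group of `group_size` elements
        # along the last dimension.
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        assert x.shape[-1] % group_size == 0
        xg = x.float().reshape(*x.shape[:-1], -1, group_size)
        x_s = xg.abs().amax(dim=-1, keepdim=True).clamp_min(eps) / fp8_max
        x_q = (xg / x_s).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
        return x_q.reshape(x.shape), x_s.squeeze(-1)
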
if current_platform.is_cuda() and x.is_contiguous(): torch.ops._C.per_token_group_fp8_quant(x, x_q, x_s, group_size, eps, fp8_min, fp8_max, use_ue8m0) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index b758cbf28d893..bfc1408ddf880 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -15,8 +15,8 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) -from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_topk, torch_vllm_outplace_fused_experts) +from vllm.model_executor.layers.fused_moe import (activation_without_mul, + fused_topk) from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -230,7 +230,7 @@ class NomicMoE(nn.Module): self.hidden_size = hidden_size self.total_intermediate_size = intermediate_size self.intermediate_size = divide(intermediate_size, self.tp_size) - self.hidden_act = hidden_act + self.hidden_act = activation_without_mul(hidden_act) if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -297,14 +297,14 @@ class NomicMoE(nn.Module): router_logits, self.top_k, renormalize=False) - final_hidden_states = torch_vllm_outplace_fused_experts( + + final_hidden_states = torch.ops.vllm.outplace_fused_experts( hidden_states=hidden_states, w1=self.w1, w2=self.w2, topk_weights=topk_weights, topk_ids=topk_ids, activation=self.hidden_act, - is_act_and_mul=False, ) if self.tp_size > 1: diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 4395b11b7d0f0..59c9921881497 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -37,7 +37,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, @@ -163,13 +163,19 @@ class DeepseekMoE(nn.Module): shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w1, - self.w2, - router_logits, - self.top_k, - renormalize=self.config.norm_topk_prob, - inplace=True) + + topk_weights, topk_ids, _ = fused_topk( + hidden_states, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob) + + final_hidden_states = fused_experts(hidden_states, + self.w1, + self.w2, + topk_weights, + topk_ids, + inplace=True) if self.config.n_shared_experts is not None: final_hidden_states = final_hidden_states + shared_output diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index c7be7f76dba15..240c23ea2b25d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -39,7 +39,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import FatreluAndMul, 
SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, @@ -136,13 +136,18 @@ class MiniCPMMoE(nn.Module): hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.ws, - self.w2s, - router_logits, - self.top_k, - renormalize=True, - inplace=True) + + topk_weights, topk_ids, _ = fused_topk(hidden_states, + router_logits, + self.top_k, + renormalize=True) + + final_hidden_states = fused_experts(hidden_states, + self.ws, + self.w2s, + topk_weights, + topk_ids, + inplace=True) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f66e8b0b454bf..029309c49efd4 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -702,4 +702,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, return loader.load_weights(weights) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() + return self.model.get_expert_mapping() \ No newline at end of file diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index a25ef86a989db..a636a714145cf 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -81,9 +81,14 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool: def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: - if not (isinstance(module, FusedMoE) - and module.moe_config.quant_dtype == torch.float8_e4m3fn - and module.moe_config.block_shape == deep_gemm_block_shape()): + if not isinstance(module, FusedMoE): + return False + + moe_quant_config = module.quant_method.get_fused_moe_quant_config(module) + + if (moe_quant_config is None + or moe_quant_config.quant_dtype != torch.float8_e4m3fn + or moe_quant_config.block_shape != deep_gemm_block_shape()): return False if not isinstance(module.quant_method.fused_experts, From 2c3c1bd07aad253a34c97563bc5d466adaecaa18 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 19:38:09 -0700 Subject: [PATCH 389/584] [V0 Deprecation] Remove V0 Engine tests (#25114) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/engine/conftest.py | 12 -- tests/engine/test_computed_prefix_blocks.py | 37 ---- tests/engine/test_executor.py | 111 ---------- tests/engine/test_multiproc_workers.py | 179 ---------------- tests/engine/test_options.py | 58 ----- tests/engine/test_short_mm_context.py | 1 + tests/engine/test_stop_checker.py | 225 -------------------- 7 files changed, 1 insertion(+), 622 deletions(-) delete mode 100644 tests/engine/conftest.py delete mode 100644 tests/engine/test_computed_prefix_blocks.py delete mode 100644 tests/engine/test_executor.py delete mode 100644 tests/engine/test_multiproc_workers.py delete mode 100644 tests/engine/test_options.py delete mode 100644 tests/engine/test_stop_checker.py diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/engine/conftest.py +++ 
/dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py deleted file mode 100644 index ac5a1f957dfe4..0000000000000 --- a/tests/engine/test_computed_prefix_blocks.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("block_size", [16]) -def test_computed_prefix_blocks(model: str, block_size: int): - # This test checks if we are able to run the engine to completion - # without triggering asserts. - # We are in a scenario where all blocks from the second request's prompt - # are full and already computed when the second request arrives. - prompt = ( - "You are a helpful assistant. How do I build a car from cardboard and " - "paper clips? Is there an easy to follow video tutorial available " - "online for free?") - prompt2 = ( - " Please recommend to me some resources where I can learn not only to " - "handle technical difficulties of building a car, but also " - "decoration.") - - engine_args = EngineArgs(model=model, - block_size=block_size, - enable_prefix_caching=True) - - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams() - - engine.add_request("0", prompt + prompt2, sampling_params) - engine.step() - engine.add_request("1", prompt, sampling_params) - engine.step() diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py deleted file mode 100644 index 67064aff3ae92..0000000000000 --- a/tests/engine/test_executor.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, Optional, Union - -import pytest - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.executor.uniproc_executor import UniProcExecutor -from vllm.sampling_params import SamplingParams - - -class Mock: - ... - - -class CustomUniExecutor(UniProcExecutor): - - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: - # Drop marker to show that this was run - with open(".marker", "w"): - ... 
- return super().collective_rpc(method, timeout, args, kwargs) - - -CustomUniExecutorAsync = CustomUniExecutor - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_type_checking(model): - with pytest.raises(ValueError): - engine_args = EngineArgs(model=model, - distributed_executor_backend=Mock) - LLMEngine.from_engine_args(engine_args) - with pytest.raises(ValueError): - engine_args = AsyncEngineArgs(model=model, - distributed_executor_backend=Mock) - AsyncLLMEngine.from_engine_args(engine_args) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = EngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutor, - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_async(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = AsyncEngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutorAsync, - enforce_eager=True, # reduce test time - ) - engine = AsyncLLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - async def t(): - stream = await engine.add_request("0", "foo", sampling_params) - async for x in stream: - ... - - asyncio.run(t()) - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_respect_ray(model): - # even for TP=1 and PP=1, - # if users specify ray, we should use ray. - # users might do this if they want to manage the - # resources using ray. 
- engine_args = EngineArgs( - model=model, - distributed_executor_backend="ray", - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - assert engine.model_executor.uses_ray diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py deleted file mode 100644 index b5381b61a020a..0000000000000 --- a/tests/engine/test_multiproc_workers.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from time import sleep -from typing import Any - -import pytest - -from vllm.config import VllmConfig -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) -from vllm.worker.worker_base import WorkerWrapperBase - - -class DummyWorkerWrapper(WorkerWrapperBase): - """Dummy version of vllm.worker.worker.Worker""" - - def worker_method(self, worker_input: Any) -> tuple[int, Any]: - sleep(0.05) - - if isinstance(worker_input, Exception): - # simulate error case - raise worker_input - - return self.rpc_rank, input - - -def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: - result_handler = ResultHandler() - vllm_config = VllmConfig() - workers = [ - ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, - rank) for rank in range(8) - ] - - worker_monitor = WorkerMonitor(workers, result_handler) - assert not worker_monitor.is_alive() - - result_handler.start() - worker_monitor.start() - assert worker_monitor.is_alive() - - return workers, worker_monitor - - -def test_local_workers() -> None: - """Test workers with sync task submission""" - - workers, worker_monitor = _start_workers() - - def execute_workers(worker_input: str) -> None: - worker_outputs = [ - worker.execute_method("worker_method", worker_input) - for worker in workers - ] - - for rank, output in enumerate(worker_outputs): - assert output.get() == (rank, input) - - executor = ThreadPoolExecutor(max_workers=4) - - # Test concurrent submission from different threads - futures = [ - executor.submit(partial(execute_workers, f"thread {thread_num}")) - for thread_num in range(4) - ] - - for future in futures: - future.result() - - # Test error case - exception = ValueError("fake error") - result = workers[0].execute_method("worker_method", exception) - try: - result.get() - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -def test_local_workers_clean_shutdown() -> None: - """Test clean shutdown""" - - workers, worker_monitor = _start_workers() - - assert worker_monitor.is_alive() - assert all(worker.process.is_alive() for worker in workers) - - # Clean shutdown - worker_monitor.close() - - worker_monitor.join(20) - - # Ensure everything is stopped - assert not 
worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -@pytest.mark.asyncio -async def test_local_workers_async() -> None: - """Test local workers with async task submission""" - - workers, worker_monitor = _start_workers() - - async def execute_workers(worker_input: str) -> None: - worker_coros = [ - worker.execute_method_async("worker_method", worker_input) - for worker in workers - ] - - results = await asyncio.gather(*worker_coros) - for rank, result in enumerate(results): - assert result == (rank, input) - - tasks = [ - asyncio.create_task(execute_workers(f"task {task_num}")) - for task_num in range(4) - ] - - for task in tasks: - await task - - # Test error case - exception = ValueError("fake error") - try: - _result = await workers[0].execute_method_async( - "worker_method", exception) - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = await workers[0].execute_method_async( - "worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py deleted file mode 100644 index 42e88e84770ab..0000000000000 --- a/tests/engine/test_options.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from contextlib import nullcontext - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. 
- llm = LLM( - model=model, - skip_tokenizer_init=True, - enforce_eager=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) -def test_enable_prompt_embeds(hf_runner, model: str, - enable_prompt_embeds: bool): - prompt = "abc" - - with hf_runner(model) as hf_model: - token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids - token_ids = token_ids.to(hf_model.model.device) - - embed_layer = hf_model.model.get_input_embeddings() - prompt_embeds = embed_layer(token_ids).squeeze(0) - - ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( - ValueError, match="set `--enable-prompt-embeds`")) - - llm = LLM( - model=model, - enable_prompt_embeds=enable_prompt_embeds, - enforce_eager=True, - ) - - with ctx: - llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index 9c62761d78afb..9eb3dfc09224e 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -25,6 +25,7 @@ def test_context_length_too_short(vllm_runner, image_assets, model): model, max_model_len=128, # LLaVA has a feature size of 576 enforce_eager=True, + load_format="dummy", ) with vllm_model: diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py deleted file mode 100644 index 34f4cb13ab0a5..0000000000000 --- a/tests/engine/test_stop_checker.py +++ /dev/null @@ -1,225 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -from transformers import AutoTokenizer - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus - -REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - - -class MockReasoningParser(ReasoningParser): - """Mock reasoning parser for testing purposes.""" - - def __init__(self, - tokenizer: AutoTokenizer, - reasoning_active: bool = False): - super().__init__(tokenizer) - self.reasoning_active = reasoning_active - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return not self.reasoning_active - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - return input_ids - - -class MockSequence(Sequence): - """Mock sequence for testing purposes.""" - - def __init__(self, token_ids, output_text="test_output", eos_token_id=0): - self.token_ids = token_ids - self.output_text = output_text - self.eos_token_id = eos_token_id - self.status = SequenceStatus.RUNNING - self.stop_reason = None - - def get_token_ids(self): - return self.token_ids - - def get_last_token_id(self): - return self.token_ids[-1] if self.token_ids else None - - def get_len(self): - return len(self.token_ids) - - def get_output_len(self): - return len(self.token_ids) - 1 # Simulating prompt + outputs - - -@pytest.fixture -def deepseek_r1_qwen_tokenizer(): - return 
AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) - - -@pytest.fixture -def stop_checker(): - return StopChecker(max_model_len=10) - - -@pytest.fixture -def stop_checker_with_reasoner(): - reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) - return StopChecker(max_model_len=10, reasoner=reasoner) - - -def test_eos_token_stopping(stop_checker): - """Test sequence stopping when EOS token is encountered.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_ignore_eos(stop_checker): - """Test sequence continuing when EOS token is ignored.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(ignore_eos=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_min_tokens(stop_checker): - """Test min_tokens prevents early stopping.""" - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams(min_tokens=3) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.RUNNING - - -def test_stop_token_ids(stop_checker): - """Test sequence stopping with custom stop token IDs.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == 3 - - -def test_stop_strings(stop_checker): - """Test sequence stopping with stop strings.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.stop_reason == "STOP" - assert "STOP" not in seq.output_text # Default behavior removes stop string - - -def test_include_stop_str_in_output(stop_checker): - """Test keeping stop strings in output.""" - seq = MockSequence(token_ids=[1, 2, 3], - output_text="test output with STOP", - eos_token_id=0) - sampling_params = SamplingParams(stop=["STOP"], - include_stop_str_in_output=True) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=5, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert "STOP" in seq.output_text - - -def test_max_tokens(stop_checker): - """Test sequence stopping at max_tokens.""" - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(max_tokens=2) - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_max_model_len(stop_checker): - """Test sequence stopping at max_model_len.""" - seq = MockSequence(token_ids=list(range(11)), - eos_token_id=0) # 11 tokens, max is 10 - sampling_params = SamplingParams() - - stop_checker.maybe_stop_sequence(seq, - new_char_count=1, - sampling_params=sampling_params) - - assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED - - -def test_reasoning_skip_stops(stop_checker_with_reasoner): - """Test that 
stop tokens and strings are ignored during reasoning.""" - # Set reasoning_active to True to simulate being in reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = True - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.RUNNING - - # But EOS token still stops the sequence - seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) - sampling_params = SamplingParams() - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - -def test_reasoning_end_enables_stops(stop_checker_with_reasoner): - """Test that stop tokens work after reasoning ends.""" - # Set reasoning_active to False to simulate being out of reasoning mode - stop_checker_with_reasoner.reasoner.reasoning_active = False - - # Test with stop token - seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) - sampling_params = SamplingParams(stop_token_ids=[3]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=1, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED - - # Test with stop string - seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") - sampling_params = SamplingParams(stop=["STOP"]) - - stop_checker_with_reasoner.maybe_stop_sequence( - seq, new_char_count=4, sampling_params=sampling_params) - assert seq.status == SequenceStatus.FINISHED_STOPPED From 2fc24e94f964368491a994641fb2921ed74cb4d4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 19:40:44 -0700 Subject: [PATCH 390/584] [V0 Deprecation] Remove V0 Tracing & Metrics tests (#25115) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .buildkite/test-pipeline.yaml | 4 +- tests/metrics/test_metrics.py | 268 ---------------------- tests/tracing/__init__.py | 0 tests/tracing/test_tracing.py | 237 ------------------- tests/{metrics => v1/tracing}/__init__.py | 0 5 files changed, 1 insertion(+), 508 deletions(-) delete mode 100644 tests/metrics/test_metrics.py delete mode 100644 tests/tracing/__init__.py delete mode 100644 tests/tracing/test_tracing.py rename tests/{metrics => v1/tracing}/__init__.py (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 08c10180fc224..0bce02b90a7cd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -217,16 +217,14 @@ steps: num_gpus: 2 source_file_dependencies: - vllm/ - - tests/metrics - tests/v1/tracing commands: - - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-api>=1.26.0' \ 'opentelemetry-exporter-otlp>=1.26.0' \ 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s tracing + - pytest -v -s v1/tracing ##### fast check tests ##### ##### 1 GPU test ##### diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py deleted file mode 100644 index dbd9c518e0200..0000000000000 --- a/tests/metrics/test_metrics.py +++ /dev/null @@ -1,268 +0,0 @@ 
-# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import ray -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import EngineArgs, LLMEngine -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import RayPrometheusStatLogger -from vllm.sampling_params import SamplingParams -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_prompt_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - prompt_token_counts = [ - len(tokenizer.encode(p)) for p in example_prompts - ] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. - assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) - - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() - - assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_generation_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. 
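The assertions in these deleted tests query prometheus_client's default registry directly; stripped of vLLM, the pattern looks like the following sketch (the metric name and label values here are made up for illustration):

    from prometheus_client import REGISTRY, Counter

    tokens = Counter("demo_generation_tokens", "Generated tokens",
                     ["model_name"])
    tokens.labels(model_name="distilgpt2").inc(42)
    # prometheus_client exports Counter samples with a `_total` suffix.
    assert REGISTRY.get_sample_value(
        "demo_generation_tokens_total",
        {"model_name": "distilgpt2"}) == 42
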
- vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize( - "served_model_name", - [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) -def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: list[str]) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metrics_tag_content = stat_logger.labels["model_name"] - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - if served_model_name is None or served_model_name == []: - assert metrics_tag_content == model, ( - f"Metrics tag model_name is wrong! expect: {model!r}\n" - f"actual: {metrics_tag_content!r}") - else: - assert metrics_tag_content == served_model_name[0], ( - f"Metrics tag model_name is wrong! expect: " - f"{served_model_name[0]!r}\n" - f"actual: {metrics_tag_content!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -@pytest.mark.asyncio -async def test_async_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - """ - Regression test ensuring async engine generates metrics - when disable_log_stats=False - (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678) - """ - engine_args = AsyncEngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - async_engine = AsyncLLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - results = async_engine.generate( - prompt, - SamplingParams(max_tokens=max_tokens), - f"request-id-{i}", - ) - # Exhaust the async iterator to make the async engine work - async for _ in results: - pass - - assert_metrics(model, async_engine.engine, disable_log_stats, - len(example_prompts)) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -def test_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - engine = LLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - assert_metrics(model, engine, disable_log_stats, len(example_prompts)) - - -def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, - num_requests: int) -> None: - if disable_log_stats: - with pytest.raises(AttributeError): - _ = engine.stat_loggers - else: - assert (engine.stat_loggers - is not None), "engine.stat_loggers should be set" - # Ensure the count bucket of request-level histogram metrics matches - # 
the number of requests as a simple sanity check to ensure metrics are - # generated - labels = {'model_name': model} - request_histogram_metrics = [ - "vllm:e2e_request_latency_seconds", - "vllm:request_prompt_tokens", - "vllm:request_generation_tokens", - "vllm:request_params_n", - "vllm:request_params_max_tokens", - ] - for metric_name in request_histogram_metrics: - metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", - labels) - assert ( - metric_value == num_requests), "Metrics should be collected" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_engine_log_metrics_ray( - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is quite weak - it only checks that we can use - # RayPrometheusStatLogger without exceptions. - # Checking whether the metrics are actually emitted is unfortunately - # non-trivial. - - # We have to run in a Ray task for Ray metrics to be emitted correctly - @ray.remote(num_gpus=1) - def _inner(): - - class _RayPrometheusStatLogger(RayPrometheusStatLogger): - - def __init__(self, *args, **kwargs): - self._i = 0 - super().__init__(*args, **kwargs) - - def log(self, *args, **kwargs): - self._i += 1 - return super().log(*args, **kwargs) - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - ) - engine = LLMEngine.from_engine_args(engine_args) - logger = _RayPrometheusStatLogger( - local_interval=0.5, - labels=dict(model_name=engine.model_config.served_model_name), - vllm_config=engine.vllm_config) - engine.add_logger("ray", logger) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - assert logger._i > 0, ".log must be called at least once" - - ray.get(_inner.remote()) diff --git a/tests/tracing/__init__.py b/tests/tracing/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py deleted file mode 100644 index 4dbae7c15de3a..0000000000000 --- a/tests/tracing/test_tracing.py +++ /dev/null @@ -1,237 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa -# type: ignore -from __future__ import annotations - -import threading -from collections.abc import Iterable -from concurrent import futures -from typing import Callable, Generator, Literal - -import grpc -import pytest -from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ( - ExportTraceServiceResponse) -from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( - TraceServiceServicer, add_TraceServiceServicer_to_server) -from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue -from opentelemetry.sdk.environment_variables import ( - OTEL_EXPORTER_OTLP_TRACES_INSECURE) - -from vllm import LLM, SamplingParams -from vllm.tracing import SpanAttributes - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" - -FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', - 'array_value'] - - -def decode_value(value: AnyValue): - field_decoders: dict[FieldName, Callable] = { - "bool_value": (lambda v: v.bool_value), - "string_value": (lambda v: v.string_value), - "int_value": (lambda v: v.int_value), - "double_value": (lambda v: v.double_value), - "array_value": - (lambda v: [decode_value(item) for item in v.array_value.values]), - } - for field, decoder in field_decoders.items(): - if value.HasField(field): - return decoder(value) - raise ValueError(f"Couldn't decode value: {value}") - - -def decode_attributes(attributes: Iterable[KeyValue]): - return {kv.key: decode_value(kv.value) for kv in attributes} - - -class FakeTraceService(TraceServiceServicer): - - def __init__(self): - self.request = None - self.evt = threading.Event() - - def Export(self, request, context): - self.request = request - self.evt.set() - return ExportTraceServiceResponse() - - -@pytest.fixture -def trace_service() -> Generator[FakeTraceService, None, None]: - """Fixture to set up a fake gRPC trace service""" - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) - service = FakeTraceService() - add_TraceServiceServicer_to_server(service, server) - server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS) - server.start() - - yield service - - server.stop(None) - - -def test_traces( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - 
metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None - - -def test_traces_with_detailed_steps( - monkeypatch: pytest.MonkeyPatch, - trace_service: FakeTraceService, -): - with monkeypatch.context() as m: - m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - - sampling_params = SamplingParams( - temperature=0.01, - top_p=0.1, - max_tokens=256, - ) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces=["all"], - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") - - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS - ) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE - ) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD - ) == pytest.approx(metrics.model_forward_time / 1000) - assert 
metrics.model_execute_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/metrics/__init__.py b/tests/v1/tracing/__init__.py similarity index 100% rename from tests/metrics/__init__.py rename to tests/v1/tracing/__init__.py From 6c036615dc8ee8c27588491287cb49f2c2e2476a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 19:41:55 -0700 Subject: [PATCH 391/584] [V0 Deprecation] Remove misc V0 tests (#25118) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/model_executor/test_logits_processor.py | 98 ------------------- tests/test_cache_block_hashing.py | 92 ----------------- 2 files changed, 190 deletions(-) delete mode 100644 tests/model_executor/test_logits_processor.py delete mode 100644 tests/test_cache_block_hashing.py diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py deleted file mode 100644 index 532ebba038d38..0000000000000 --- a/tests/model_executor/test_logits_processor.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import patch - -import pytest -import torch - -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available - - -class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size, scale=scale) - self.fake_logits = fake_logits.clone() - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) - logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) - return input_tensor, fake_logits, logits_processor - - -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_logits_processors(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, logits_processor = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - seq_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - - assert torch.isinf(logits_processor_output[:, 0]).all() - - fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py deleted file mode 100644 index 1dba0fd0fb3d3..0000000000000 --- a/tests/test_cache_block_hashing.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test hashing of cache blocks. - -Run `pytest tests/test_cache_block_hashing.py`. -""" -from typing import Optional - -import pytest - -from vllm.inputs import token_inputs -from vllm.lora.request import LoRARequest -from vllm.sequence import Sequence -from vllm.transformers_utils.tokenizer import get_tokenizer - -# Make two prefixes with different first blocks. -prefix_start = [("You are an expert"), ("You are a")] -prefix_common = ( - " school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on this, fulfill " - "the following: ") -prefixes = [start + prefix_common for start in prefix_start] - -# Sample prompts. -sample_prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" -] - - -# Helper function. 
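The hashing test removed below rests on two invariants: block hashes cover the cumulative token prefix, so sequences that share a prefix share hashes up to the first block that differs, and diverge from there on. A toy model of the same invariants, with Python's built-in hash standing in for the real per-block hash:

```python
def block_hashes(tokens: list[int], block_size: int) -> list[int]:
    # Hash the cumulative prefix up to and including each full block.
    return [
        hash(tuple(tokens[:end]))
        for end in range(block_size, len(tokens) + 1, block_size)
    ]

a = block_hashes([1, 2, 3, 4, 5, 6, 7, 8], block_size=4)
b = block_hashes([1, 2, 3, 4, 9, 9, 9, 9], block_size=4)
assert a[0] == b[0]  # identical first blocks hash identically
assert a[1] != b[1]  # hashes diverge at the first differing block
```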
-def flatten_2d(li): - return [lss for ls in li for lss in ls] - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("max_num_seqs", [256]) -@pytest.mark.parametrize("concurrent_lora_int_ids", - [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, - concurrent_lora_int_ids: list[Optional[int]]): - - tokenizer = get_tokenizer("facebook/opt-125m") - - hashes: list[list[list[int]]] = [] - - for prefix in prefixes: - for lora_int_id in concurrent_lora_int_ids: - lora_request = None - - if lora_int_id is not None: - lora_request = LoRARequest( - f"example_lora_{lora_int_id}", - lora_int_id, - f"example/path/to/lora_{lora_int_id}", - ) - - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - for seq_id, prompt in enumerate(prompts): - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, - inputs=token_inputs(prompt_token_ids, - prompt=prompt), - block_size=block_size, - eos_token_id=tokenizer.eos_token_id, - lora_request=lora_request) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - # Check that hashes made with two prefixes with different first blocks are - # different everywhere. - for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): - assert (hash0 != hash1) - - # Check that hashes of different prompts made with the same prefix are the - # same until the hashes that contain the prompt. - for hash_pref in hashes: - same_hashes = [tuple(h[:-1]) for h in hash_pref] - different_hashes = [h[-1] for h in hash_pref] - assert (len(set(same_hashes)) == 1) - assert (len(set(different_hashes)) == len(different_hashes)) From 7fb2a5be2838219d566f949ac41861df822eac10 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 20:18:36 -0700 Subject: [PATCH 392/584] [V0 Deprecation] Skip PP test (#25128) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/distributed/test_pipeline_parallel.py | 114 +++++--------------- 1 file changed, 28 insertions(+), 86 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 9da9672d95970..fcd09844c0951 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -26,23 +26,10 @@ logger = init_logger("test_pipeline_parallel") VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - For PP, we fall back to V0 by default. This means - that the TP baseline runs with V1 while the PP engine - runs with V0. This gives divergent results with dummy - weights. Once we enable V1 by default for PP, we can - remove this. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - class ParallelSetup(NamedTuple): tp_size: int pp_size: int eager_mode: bool - chunked_prefill: bool class PPTestOptions(NamedTuple): @@ -53,23 +40,10 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: parallel_setups: list[ParallelSetup] - # NOTE: the length of distributed_backends and - # vllm_major_versions should be the same, and they - # are first zipped together to iterate over all - # test settings. 
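    # With the V0/V1 version pairing removed, iter_params below yields one
    # case per distributed backend for each parallel setup.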
distributed_backends: list[str] - # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: list[str] runner: RunnerOption test_options: PPTestOptions - def __post_init__(self): - if len(self.distributed_backends) != len(self.vllm_major_versions): - raise ValueError( - f"Length mismatch: distributed_backends " - f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})") - @staticmethod def detailed( *, @@ -83,27 +57,21 @@ class PPTestSettings: parallel_setups=[ ParallelSetup(tp_size=tp_base, pp_size=pp_base, - eager_mode=False, - chunked_prefill=False), + eager_mode=False), ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, - eager_mode=False, - chunked_prefill=True), + eager_mode=False), ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, - eager_mode=True, - chunked_prefill=False), + eager_mode=True), ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, - eager_mode=False, - chunked_prefill=True), + eager_mode=False), ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, - eager_mode=True, - chunked_prefill=False), + eager_mode=True), ], - distributed_backends=["mp", "mp", "ray", "ray"], - vllm_major_versions=["0", "1", "0", "1"], + distributed_backends=["mp", "ray"], runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -118,17 +86,14 @@ class PPTestSettings: multi_node_only: bool = False, load_format: Optional[str] = None, ): - vllm_major_versions = ["1"] if runner == "pooling" else ["0"] return PPTestSettings( parallel_setups=[ ParallelSetup(tp_size=tp_base, pp_size=pp_base, - eager_mode=True, - chunked_prefill=False), + eager_mode=True), ], distributed_backends=["mp"], - vllm_major_versions=vllm_major_versions, runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -138,10 +103,8 @@ class PPTestSettings: opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip(self.distributed_backends, - self.vllm_major_versions): - yield (model_id, parallel_setup, backend, vllm_major_version, - self.runner, opts) + for backend in self.distributed_backends: + yield (model_id, parallel_setup, backend, self.runner, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -269,7 +232,6 @@ def _compare_tp( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available: int, @@ -281,7 +243,6 @@ def _compare_tp( tp_size, pp_size, eager_mode, - chunked_prefill, ) = parallel_setup multi_node_only, load_format = test_options @@ -334,8 +295,6 @@ def _compare_tp( "--max-num-seqs", "8", ] - if chunked_prefill: - common_args.append("--enable-chunked-prefill") if eager_mode: common_args.append("--enforce-eager") if runner != "auto": @@ -353,14 +312,10 @@ def _compare_tp( if max_num_seqs: common_args.extend(["--max-num-seqs", f"{max_num_seqs}"]) - specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill - testing_ray_compiled_graph = False - if distributed_backend == "ray" and (vllm_major_version == "1" - or specific_case): + if distributed_backend == "ray": # For V1, test Ray Compiled Graph for all the tests - # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -368,17 +323,15 @@ 
def _compare_tp( # Temporary. Currently when zeromq + SPMD is used, it does not properly # terminate because of a Ray Compiled Graph issue. common_args.append("--disable-frontend-multiprocessing") - testing_ray_compiled_graph = True elif distributed_backend == "mp": - # Both V0/V1 of multiprocessing executor support PP pp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } else: pp_env = None tp_env = { - "VLLM_USE_V1": vllm_major_version, + "VLLM_USE_V1": "1", } pp_args = [ @@ -404,25 +357,17 @@ def _compare_tp( "mp", ] - try: - compare_two_settings(model_id, - pp_args, - tp_args, - pp_env, - tp_env, - method=method) - except Exception: - if testing_ray_compiled_graph and vllm_major_version == "0": - # Ray Compiled Graph tests are flaky for V0, - # so we don't want to fail the test - logger.exception("Ray Compiled Graph tests failed") - else: - raise + compare_two_settings(model_id, + pp_args, + tp_args, + pp_env, + tp_env, + method=method) @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -433,15 +378,14 @@ def test_tp_language_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, ): + pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -450,8 +394,8 @@ def test_tp_language_generation( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -462,15 +406,14 @@ def test_tp_language_embedding( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, ): + pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, @@ -479,8 +422,8 @@ def test_tp_language_embedding( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "runner", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "runner", + "test_options"), [ params for model_id, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_id) if model_id in TEST_MODELS @@ -491,15 +434,14 @@ def test_tp_multimodal_generation( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, - vllm_major_version: str, runner: RunnerOption, test_options: PPTestOptions, num_gpus_available, ): + pytest.skip("Skipping the test until V1 passes it.") _compare_tp(model_id, parallel_setup, distributed_backend, - vllm_major_version, runner, test_options, num_gpus_available, From 4ac510f4844ae2ab168c2dbac545e3dd28a0a1b9 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 17 Sep 2025 23:19:52 -0400 Subject: [PATCH 393/584] 
[Kernels] Enable DeepGEMM by default (#24462) Signed-off-by: Bill Nell <bnell@redhat.com> --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index eeed7771f0453..72e1d5b0ede81 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -135,7 +135,7 @@ if TYPE_CHECKING: VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False - VLLM_USE_DEEP_GEMM: bool = False + VLLM_USE_DEEP_GEMM: bool = True VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False @@ -1044,7 +1044,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Allow use of DeepGemm kernels for fused moe ops. "VLLM_USE_DEEP_GEMM": - lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))), # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. "VLLM_USE_DEEP_GEMM_E8M0": From 3127274d022b0bc8ff6ba9ceef41a99a6f01ad2d Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Wed, 17 Sep 2025 21:04:21 -0700 Subject: [PATCH 394/584] [MM Encoder] Apply DP ViT for Qwen3-VL model series (#24955) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/qwen3_vl.py | 94 +++++++++++++++++----- vllm/model_executor/models/qwen3_vl_moe.py | 2 + 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 22948aee4936c..2c36dfbce7f67 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -126,20 +126,23 @@ class Qwen3_VisionMLP(nn.Module): bias: bool = False, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.linear_fc1 = ColumnParallelLinear(in_features, hidden_features, bias=bias, quant_config=quant_config, return_bias=False, - prefix=f"{prefix}.linear_fc1") + prefix=f"{prefix}.linear_fc1", + disable_tp=use_data_parallel) self.linear_fc2 = RowParallelLinear(hidden_features, in_features, bias=bias, quant_config=quant_config, return_bias=False, - prefix=f"{prefix}.linear_fc2") + prefix=f"{prefix}.linear_fc2", + disable_tp=use_data_parallel) self.act_fn = act_fn def forward(self, x: torch.Tensor): @@ -158,23 +161,27 @@ class Qwen3_VisionBlock(nn.Module): norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) self.norm1 = norm_layer(dim) self.norm2 = norm_layer(dim) - self.attn = Qwen2_5_VisionAttention(embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") + self.attn = Qwen2_5_VisionAttention( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + 
prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = Qwen3_VisionMLP(dim, mlp_hidden_dim, act_fn=act_fn, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) def forward( self, @@ -205,6 +212,7 @@ class Qwen3_VisionPatchMerger(nn.Module): use_postshuffle_norm: bool = False, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) @@ -222,13 +230,15 @@ class Qwen3_VisionPatchMerger(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, - prefix=f"{prefix}.linear_fc1") + prefix=f"{prefix}.linear_fc1", + disable_tp=use_data_parallel) self.act_fn = nn.GELU() self.linear_fc2 = RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config, - prefix=f"{prefix}.linear_fc2") + prefix=f"{prefix}.linear_fc2", + disable_tp=use_data_parallel) def forward(self, x: torch.Tensor) -> torch.Tensor: if self.use_postshuffle_norm: @@ -250,6 +260,7 @@ class Qwen3_VisionTransformer(nn.Module): norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = vision_config.hidden_size @@ -260,6 +271,12 @@ class Qwen3_VisionTransformer(nn.Module): self.spatial_merge_unit = self.spatial_merge_size**2 self.temporal_patch_size = vision_config.temporal_patch_size self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes + self.use_data_parallel = use_data_parallel + + # NOTE: This is used for creating empty tensor for all_gather for + # DP ViT. Here out_hidden_size is enlarged due to deepstack + self.out_hidden_size = (vision_config.out_hidden_size * + (1 + len(self.deepstack_visual_indexes))) self.patch_embed = Qwen3_VisionPatchEmbed( patch_size=self.patch_size, @@ -283,7 +300,8 @@ class Qwen3_VisionTransformer(nn.Module): act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(vision_config.depth) ]) @@ -294,6 +312,7 @@ class Qwen3_VisionTransformer(nn.Module): spatial_merge_size=self.spatial_merge_size, quant_config=quant_config, prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, ) self.deepstack_merger_list = nn.ModuleList([ @@ -304,7 +323,8 @@ class Qwen3_VisionTransformer(nn.Module): use_postshuffle_norm=True, norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.deepstack_merger_list.{layer_idx}") + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(len(self.deepstack_visual_indexes)) ]) @@ -325,7 +345,14 @@ class Qwen3_VisionTransformer(nn.Module): def rot_pos_emb(self, grid_thw): pos_ids = [] - for t, h, w in grid_thw: + # Support both Tensor and list inputs for DP path + if isinstance(grid_thw, list): + grid_list = grid_thw + max_grid_size = max(max(h, w) for _, h, w in grid_list) + else: + grid_list = grid_thw.tolist() + max_grid_size = int(grid_thw[:, 1:].max().item()) + for t, h, w in grid_list: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) hpos_ids = hpos_ids.reshape( h // self.spatial_merge_size, @@ -348,7 +375,6 @@ class Qwen3_VisionTransformer(nn.Module): pos_ids.append( torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) 
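            # Each appended entry is an (h*w, 2) grid of merged (h, w)
            # positions, tiled across the t temporal frames by repeat(t, 1).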
pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -453,10 +479,18 @@ class Qwen3_VisionTransformer(nn.Module): hidden_states = hidden_states + pos_embeds rotary_pos_emb = self.rot_pos_emb(grid_thw) + if isinstance(grid_thw, list): + grid_thw_tensor = torch.tensor(grid_thw, + device=hidden_states.device, + dtype=torch.int32) + else: + grid_thw_tensor = grid_thw + cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], + grid_thw_tensor[:, 0]).cumsum( dim=0, - dtype=grid_thw.dtype + dtype=grid_thw_tensor.dtype if torch.jit.is_tracing() else torch.int32, ) cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) @@ -984,6 +1018,9 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, "up_proj", ], } + + supports_encoder_tp_data = True + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -1009,12 +1046,14 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.visual = Qwen3_VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config, @@ -1177,7 +1216,15 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + if self.use_data_parallel: + from vllm.multimodal.utils import ( + run_dp_sharded_mrope_vision_model) + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync @@ -1199,7 +1246,16 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, else: pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + if self.use_data_parallel: + from vllm.multimodal.utils import ( + run_dp_sharded_mrope_vision_model) + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw) # Split concatenated embeddings for each video item. 
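        # (one embedding row per merged patch, so each item's size is its
        # grid's t * h * w divided by the spatial merge unit)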
# Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index a800e94ab1e50..d25bc71dcb59b 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -315,12 +315,14 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.visual = Qwen3_VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config, From 32baf1d03685ead1f5946f867e4ca16007bd10b5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 05:05:18 +0100 Subject: [PATCH 395/584] [Docs] Clean up the contributing README (#25099) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/README.md | 177 +++++++++--------- .../installation/python_env_setup.inc.md | 2 +- mkdocs.yaml | 1 + 3 files changed, 95 insertions(+), 85 deletions(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 5a2a70d57e85f..b0a95b3b3d3a5 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -26,113 +26,123 @@ See <gh-file:LICENSE>. ## Developing ---8<-- "docs/getting_started/installation/python_env_setup.inc.md" - -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. -Check out the [building from source][build-from-source] documentation for details. - -For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. - -### Building the docs with MkDocs - -#### Introduction to MkDocs - -[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file. - -#### Install MkDocs and Plugins - -Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: - -```bash -uv pip install -r requirements/docs.txt -``` - -!!! note - Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+) - -#### Verify Installation - -Confirm that MkDocs is correctly installed: - -```bash -mkdocs --version -``` - -Example output: - -```console -mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10) -``` - -#### Clone the `vLLM` repository +The first step of contributing to vLLM is to clone the GitHub repository: ```bash git clone https://github.com/vllm-project/vllm.git cd vllm ``` -#### Start the Development Server +Then, configure your Python virtual environment. -MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. 
Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command: +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + +If you are only developing vLLM's Python code, install vLLM using: ```bash -mkdocs serve +VLLM_USE_PRECOMPILED=1 uv pip install -e . ``` -Example output: +If you are developing vLLM's Python and CUDA/C++ code, install vLLM using: -```console -INFO - Documentation built in 106.83 seconds -INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' -INFO - [22:02:02] Serving on http://127.0.0.1:8000/ +```bash +uv pip install -e . ``` -#### View in Your Browser +For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section. -Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:. - -#### Learn More - -For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/). - -## Testing - -??? console "Commands" - - ```bash - # These commands are only for Nvidia CUDA platforms. - uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto - - # Linting, formatting and static type checking - pre-commit install - - # You can manually run pre-commit with - pre-commit run --all-files --show-diff-on-failure - - # To manually run something from CI that does not run - # locally by default, you can run: - pre-commit run mypy-3.9 --hook-stage manual --all-files - - # Unit tests - pytest tests/ - - # Run tests for a single test file with detailed output - pytest -s -v tests/test_logger.py - ``` +For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. !!! tip - Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. + vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. -!!! note "Install python3-dev if Python.h is missing" +### Linting + +vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as: + +```bash +uv pip install pre-commit +pre-commit install +``` + +vLLM's `pre-commit` hooks will now run automatically every time you commit. + +!!! tip "Tips" + You can manually run the `pre-commit` hooks using: + + ```bash + pre-commit run # runs on staged files + pre-commit run -a # runs on all files (short for --all-files) + ``` + + --- + + Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with: + + ```bash + pre-commit run --hook-stage manual markdownlint + pre-commit run --hook-stage manual mypy-3.9 + ``` + +### Documentation + +MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>. 
+ +Get started with: + +```bash +uv pip install -r requirements/docs.txt +``` + +!!! tip + Ensure that your Python version is compatible with the plugins + (e.g., `mkdocs-awesome-nav` requires Python 3.10+) + +MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. +From the root of the repository, run: + +```bash +mkdocs serve # with API ref (~10 minutes) +API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds) +``` + +Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready! +Open <http://127.0.0.1:8000/> in your browser to see it. + +For additional features and advanced configurations, refer to the: + +- [MkDocs documentation](https://www.mkdocs.org/) +- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use) + +### Testing + +vLLM uses `pytest` to test the codebase. + +```bash +# Install the test dependencies used in CI (CUDA only) +uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto + +# Install some common test dependencies (hardware agnostic) +uv pip install pytest pytest-asyncio + +# Run all tests +pytest tests/ + +# Run tests for a single test file with detailed output +pytest -s -v tests/test_logger.py +``` + +!!! tip "Install python3-dev if Python.h is missing" If any of the above commands fails with `Python.h: No such file or directory`, install `python3-dev` with `sudo apt install python3-dev`. -!!! note +!!! warning "Warnings" Currently, the repository is not fully checked by `mypy`. -!!! note + --- + Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU platform to run unit tests locally, rely on the continuous integration system to run the tests for now. @@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following: The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use `pre-commit` to format your code. See - <https://pre-commit.com/#usage> if `pre-commit` is new to you. +- Pass all linter checks. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md index 423bf9b00d07f..06794f8d3120e 100644 --- a/docs/getting_started/installation/python_env_setup.inc.md +++ b/docs/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. 
After installing `uv`, you can create a new Python environment using the following commands: ```bash uv venv --python 3.12 --seed diff --git a/mkdocs.yaml b/mkdocs.yaml index 507a80c41e8b4..bbd850bdfee34 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -79,6 +79,7 @@ plugins: - "re:vllm\\._.*" # Internal modules - "vllm.third_party" - "vllm.vllm_flash_attn" + - !ENV [API_AUTONAV_EXCLUDE, ""] - mkdocstrings: handlers: python: From b98219670fb1ca2952d449404c2b4921d7cdce73 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Thu, 18 Sep 2025 05:08:41 +0100 Subject: [PATCH 396/584] [Core][MM] Cleanup `MultiModalCache` (#25006) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/multimodal/cache.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 31ae450f4c2ff..297b4c7fa7fbd 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import operator import sys from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence @@ -91,26 +92,15 @@ _V = TypeVar("_V", bound=MultiModalCacheValue) class MultiModalCache: @classmethod - def get_leaf_size( - cls, - leaf: object, - *, - debug: bool = False, - ) -> int: + def get_leaf_size(cls, leaf: object) -> int: if isinstance(leaf, MultiModalProcessorCacheItem): return cls.get_leaf_size(leaf.item) if isinstance(leaf, MultiModalProcessorCacheItemMetadata): return leaf.item_size # These are not subclasses of dict - if isinstance(leaf, MultiModalKwargsItems): - return cls.get_item_size(leaf.data) # type: ignore - if isinstance(leaf, MultiModalKwargsItem): - return cls.get_item_size(leaf.data) # type: ignore - if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data) # type: ignore - - if isinstance(leaf, MultiModalFieldElem): + if isinstance(leaf, (MultiModalKwargs, MultiModalKwargsItems, + MultiModalKwargsItem, MultiModalFieldElem)): return cls.get_item_size(leaf.data) # type: ignore # sys.getsizeof doesn't work for tensors @@ -126,11 +116,8 @@ class MultiModalCache: *, debug: bool = False, ) -> int: - size = json_reduce_leaves( - lambda a, b: a + b, - json_map_leaves(lambda x: cls.get_leaf_size(x, debug=debug), - value), - ) + size = json_reduce_leaves(operator.add, + json_map_leaves(cls.get_leaf_size, value)) if debug: leaf_count = json_count_leaves(value) From 027d37df389b00ed2e7d874113f869267533a2ab Mon Sep 17 00:00:00 2001 From: toncao <130689535+toncao@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:08:50 +0700 Subject: [PATCH 397/584] [Bugfix][Qwen3-Next] add prefixes to shared_expert in qwen3-next and mlp in qwen2moe to successfully load ignored params in quantized models (#24960) Signed-off-by: toncao <cpatonn@gmail.com> Co-authored-by: toncao <cpatonn@gmail.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/models/qwen2_moe.py | 48 ++++++++++++------------ vllm/model_executor/models/qwen3_next.py | 1 + 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5e6dea67c9404..6c6276a930453 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -72,17 +72,20 @@ class Qwen2MoeMLP(nn.Module): hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool 
= True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, quant_config=quant_config, - reduce_results=reduce_results) + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -123,7 +126,8 @@ class Qwen2MoeSparseMoeBlock(nn.Module): self.gate = ReplicatedLinear(config.hidden_size, config.num_experts, bias=False, - quant_config=None) + quant_config=None, + prefix=f"{prefix}.gate") if config.shared_expert_intermediate_size > 0: self.shared_expert = Qwen2MoeMLP( hidden_size=config.hidden_size, @@ -132,6 +136,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module): quant_config=quant_config, reduce_results=self.experts.must_reduce_shared_expert_outputs( ), + prefix=f"{prefix}.shared_expert", ) else: self.shared_expert = None @@ -203,21 +208,19 @@ class Qwen2MoeAttention(nn.Module): self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=True, - quant_config=quant_config, - ) + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - ) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") self.rotary_emb = get_rope( self.head_dim, @@ -296,12 +299,11 @@ class Qwen2MoeDecoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.mlp") else: - self.mlp = Qwen2MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - ) + self.mlp = Qwen2MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index fe63e93032352..ca9f4d402dac2 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -138,6 +138,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): quant_config=quant_config, reduce_results=self.experts.must_reduce_shared_expert_outputs( ), + prefix=f"{prefix}.shared_expert", ) else: self.shared_expert = None From dc2979c58574e7a49d17b50c5770010039145aac Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Thu, 18 Sep 2025 00:10:21 -0400 Subject: [PATCH 398/584] [Kernels] Overlap shared experts with combine instead of dispatch (#24254) Signed-off-by: Bill Nell <bnell@redhat.com> --- .../fused_moe/deepep_ht_prepare_finalize.py | 50 +++++++++- .../fused_moe/deepep_ll_prepare_finalize.py | 55 +++++++++-- .../layers/fused_moe/modular_kernel.py | 95 
+++++++++++++++---- .../layers/fused_moe/pplx_prepare_finalize.py | 39 +++++++- 4 files changed, 203 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 5d6b9c87a6b76..f390f0a25875e 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -240,7 +240,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): quant_config) return receiver() - def finalize( + def _finalize( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -248,7 +248,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + do_async: bool, + ) -> Optional[Callable]: assert self.handle is not None @@ -271,7 +272,46 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_weights=None, config=self._get_combine_config(), previous_event=None, - async_finish=False, + async_finish=do_async, allocate_on_comm_stream=False) - # Respect inplace outputs. - output.copy_(combined_x, non_blocking=True) + + if do_async: + + def _receiver(): + event.current_stream_wait() + # Respect inplace outputs. + output.copy_(combined_x, non_blocking=True) + + return lambda: _receiver() + else: + # Respect inplace outputs. + output.copy_(combined_x, non_blocking=True) + return None + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> Callable: + receiver = self._finalize(output, fused_expert_output, topk_weights, + topk_ids, apply_router_weight_on_input, + weight_and_reduce_impl, True) + assert receiver is not None + return receiver + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + self._finalize(output, fused_expert_output, topk_weights, topk_ids, + apply_router_weight_on_input, weight_and_reduce_impl, + False) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 01df7770463d0..101fc8798c427 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -12,8 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input, normalize_batched_scales_shape) from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled, - dbo_maybe_run_recv_hook, - dbo_register_recv_hook, dbo_yield) + dbo_maybe_run_recv_hook) # DeepEP kernels quantize dispatch inputs in 128 element chunks. 
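The `finalize`/`finalize_async` split introduced above exists so the modular kernel can overlap the shared experts with the combine. A sketch of the calling pattern, not runnable on its own, with the names taken from `modular_kernel.py` later in this patch:

```python
# Start the combine but do not wait for remote results yet.
recv_hook = prepare_finalize.finalize_async(
    output, fused_out, topk_weights, topk_ids,
    apply_router_weight_on_input, weight_and_reduce_impl)

# Shared experts run on the local tokens while the combine is in flight.
shared_output = shared_experts(a1)

# `output` is only valid once the hook has been invoked.
recv_hook()
```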
DEEPEP_QUANT_BLOCK_SIZE = 128 @@ -198,7 +197,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): hook() return receiver() - def finalize( + def _finalize( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -206,13 +205,14 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + do_async: bool, + ) -> Optional[Callable]: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") a2a_idx = dbo_current_ubatch_id() - do_recv_hook = dbo_enabled() + do_recv_hook = dbo_enabled() or do_async handle = self.handles[a2a_idx] assert handle is not None @@ -232,6 +232,45 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): zero_copy=False, return_recv_hook=do_recv_hook, out=output) - if recv_hook is not None: - dbo_register_recv_hook(recv_hook) - dbo_yield() + + return recv_hook + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> Callable: + recv_hook = self._finalize( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + do_async=True, + ) + assert recv_hook is not None + return recv_hook + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + self._finalize( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + do_async=False, + ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 58cd0294c8c44..729f8e39cf0f7 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -209,7 +209,8 @@ class FusedMoEPrepareAndFinalize(ABC): def supports_async(self) -> bool: """ - Indicates whether or not this class implements prepare_async. + Indicates whether or not this class implements prepare_async and + finalize_async. """ return False @@ -275,6 +276,42 @@ class FusedMoEPrepareAndFinalize(ABC): """ raise NotImplementedError + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> Callable: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output but do not wait for results from other workers. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + - weight_and_reduce_impl: An optional TopKWeightAndReduce + implementation. + + Returns a callback that when invoked waits for results from other + workers and has the same return signature as `finalize`, e.g. 
+ + receiver = obj.finalize_async(output, ...) + ... output not valid yet ... + receiver() + ... output valid here ... + + is equivalent to: + + obj.finalize(output, ...) + """ + raise NotImplementedError + @property @abstractmethod def activation_format(self) -> FusedMoEActivationFormat: @@ -814,23 +851,20 @@ class FusedMoEModularKernel(torch.nn.Module): """ a1 = hidden_states - output = a1 if inplace else torch.zeros_like(a1) + if inplace and self.shared_experts is None: + output = a1 + else: + output = torch.zeros_like(a1) local_num_experts = w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts - shared_output: torch.Tensor - if not self.prepare_finalize.supports_async(): # We shouldn't be running an a2a kernel that doesn't # support async prepare/finalize assert not dbo_enabled() - # Run shared experts serially with dispatch. - if self.shared_experts is not None: - shared_output = self.shared_experts(a1) - (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, _expert_topk_weights) = self.prepare_finalize.prepare( a1, @@ -854,9 +888,6 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.quant_config, ) - if self.shared_experts is not None: - shared_output = self.shared_experts(a1) - # If DBO is being used, register the hook with the ubatch context # and call it in dbo_maybe_run_recv_hook instead of passing it to # the receiver. @@ -900,16 +931,42 @@ class FusedMoEModularKernel(torch.nn.Module): apply_router_weight_on_input=apply_router_weight_on_input, ) - self.prepare_finalize.finalize( - output, - fused_out, - topk_weights, - topk_ids, - apply_router_weight_on_input, - self.fused_experts.finalize_weight_and_reduce_impl(), - ) + shared_output: Optional[torch.Tensor] = None + + if not self.prepare_finalize.supports_async(): + assert not dbo_enabled() + + self.prepare_finalize.finalize( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + else: + recv_hook = self.prepare_finalize.finalize_async( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + + assert recv_hook is not None + dbo_register_recv_hook(recv_hook) + dbo_yield() + if not dbo_enabled(): + recv_hook() if self.shared_experts is None: return output else: + assert shared_output is not None return shared_output, output diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 32d12476dd01a..ddddd2a3b7a2e 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -272,7 +272,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): hook() return receiver() - def finalize( + def finalize_async( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -280,7 +280,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + ) -> Callable: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") @@ -303,8 +303,39 @@ class 
PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): if apply_router_weight_on_input: topk_weights = torch.ones_like(topk_weights) + topk_ids_u32 = topk_ids.view(dtype=torch.uint32) + self.a2a.combine(out_tokens=output, - indices=topk_ids.view(dtype=torch.uint32), + indices=topk_ids_u32, weights=topk_weights, expert_y=fused_expert_output, - bound_m=bound_m) + bound_m=bound_m, + do_send=True, + do_recv=False) + + return lambda: self.a2a.combine(out_tokens=output, + indices=topk_ids_u32, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m, + do_send=False, + do_recv=True) + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + receiver = self.finalize_async( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + ) + receiver() From 52bc9d5b3edbf8804758d46cde28024d6c362e42 Mon Sep 17 00:00:00 2001 From: YiwenC <54658925+666even666@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:11:46 -0700 Subject: [PATCH 399/584] [Model] enable data parallel for InternVL vision encoder (#23909) Signed-off-by: Yiwen Chen <yiwen66@berkeley.edu> Signed-off-by: YiwenC <54658925+666even666@users.noreply.github.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- docs/configuration/optimization.md | 1 + vllm/model_executor/models/intern_vit.py | 107 ++++++++++++++++------- vllm/model_executor/models/internvl.py | 5 +- 3 files changed, 80 insertions(+), 33 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 5807d787cf531..5564d8a81d937 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -175,6 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: - GLM-4.5V GLM-4.1V (<gh-pr:23168>) +- InternVL (<gh-pr:23909>) - Kimi-VL (<gh-pr:23817>) - Llama4 (<gh-pr:18368>) - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 8e9ab9649bd44..118cce810a1f2 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -25,9 +25,11 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.multimodal.utils import run_dp_sharded_vision_model NORM2FN = { 'rms_norm': RMSNorm, @@ -137,6 +139,7 @@ class InternParallelAttention(nn.Module): *, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -150,8 +153,10 @@ class InternParallelAttention(nn.Module): f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' f' {self.num_heads}).') - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) + self.tp_rank = (0 if use_data_parallel else + get_tensor_model_parallel_rank()) # Additional dummy heads are used to enable TP for common GPU 
counts. self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim @@ -159,14 +164,23 @@ class InternParallelAttention(nn.Module): self.tp_size) self.scale = self.head_dim**-0.5 - self.qkv = QKVParallelLinear( - self.embed_dim, - self.head_dim, - num_dummy_heads + self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) + if use_data_parallel: + self.qkv = ReplicatedLinear( + self.embed_dim, + 3 * self.head_dim * self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) + else: + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + num_dummy_heads + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) self.qk_normalization = config.qk_normalization @@ -178,12 +192,20 @@ class InternParallelAttention(nn.Module): eps=config.layer_norm_eps, var_hidden_size=self.embed_dim) - self.proj = RowParallelLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + if use_data_parallel: + self.proj = ReplicatedLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + else: + self.proj = RowParallelLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -287,21 +309,26 @@ class InternMLP(nn.Module): config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - self.fc2 = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc2") + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + cls_fc2 = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.fc2 = cls_fc2(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -320,6 +347,7 @@ class InternVisionEncoderLayer(nn.Module): *, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -330,11 +358,13 @@ class InternVisionEncoderLayer(nn.Module): self.attn = self._init_attn(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = InternMLP(config, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -352,16 +382,20 @@ class InternVisionEncoderLayer(nn.Module): *, num_dummy_heads: int, prefix: str = "", + use_data_parallel: bool = False, ): # fallback to sdpa attention if tp unavailable - tp_size = get_tensor_model_parallel_world_size() + # tp_size = get_tensor_model_parallel_world_size() + 
tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) num_heads = config.num_attention_heads if (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, num_dummy_heads=num_dummy_heads, - prefix=prefix) + prefix=prefix, + use_data_parallel=use_data_parallel) return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) @@ -388,6 +422,7 @@ class InternVisionEncoder(nn.Module): num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() @@ -402,7 +437,8 @@ class InternVisionEncoder(nn.Module): InternVisionEncoderLayer(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=f"{prefix}.layers.{layer_idx}") + prefix=f"{prefix}.layers.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(num_hidden_layers) ]) @@ -429,10 +465,12 @@ class InternVisionModel(nn.Module): num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.embeddings = InternVisionEmbeddings(config) self.encoder = InternVisionEncoder( @@ -441,6 +479,7 @@ class InternVisionModel(nn.Module): num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, prefix=f"{prefix}.encoder", + use_data_parallel=use_data_parallel, ) def get_input_embeddings(self): @@ -464,7 +503,11 @@ class InternVisionModel(nn.Module): raise ValueError( f'wrong pixel_values size: {pixel_values.shape}') - encoder_outputs = self.encoder(inputs_embeds=hidden_states) + if self.use_data_parallel: + encoder_outputs = run_dp_sharded_vision_model( + hidden_states, self.encoder) + else: + encoder_outputs = self.encoder(inputs_embeds=hidden_states) return encoder_outputs diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 9565628b198e2..6a5c565b52e85 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1035,6 +1035,8 @@ class InternVLMultiModalProcessor( class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): @@ -1053,6 +1055,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self._patch_quant_config(config, quant_config) image_size = config.force_image_size or config.vision_config.image_size @@ -1120,7 +1123,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, prefix=prefix, - ) + use_data_parallel=self.use_data_parallel) else: return InternVisionPatchModel(config.vision_config) From bec060fd99e371b1adc53f65636061f702fa8e61 Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Wed, 17 Sep 2025 23:25:07 -0500 Subject: [PATCH 400/584] Mark prompt logprobs as incompatible with prompt embeds at API level (#25077) Signed-off-by: Andrew Sansom <andrew@protopia.ai> --- .../test_completion_with_prompt_embeds.py | 17 +++++++++++++++++ vllm/engine/llm_engine.py | 11 +++++++---- vllm/entrypoints/openai/serving_completion.py | 5 +++++ 3 files 
changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index dbfb1b024f7c2..7b58f851a4d21 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -228,3 +228,20 @@ async def test_completions_with_logprobs_and_prompt_embeds( assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1 assert len(logprobs.tokens) == 5 + + +@pytest.mark.asyncio +async def test_prompt_logprobs_raises_error( + client_with_prompt_embeds: openai.AsyncOpenAI): + with pytest.raises(BadRequestError, match="not compatible"): + encoded_embeds = create_dummy_embeds() + await client_with_prompt_embeds.completions.create( + model=MODEL_NAME, + prompt="", + max_tokens=5, + temperature=0.0, + extra_body={ + "prompt_embeds": encoded_embeds, + "prompt_logprobs": True + }, + ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c35bd20371d0a..34b5dcb587503 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -671,10 +671,13 @@ class LLMEngine: arrival_time = time.time() if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - seq_len = prompt["prompt_embeds"].shape[0] - prompt["prompt_token_ids"] = [0] * seq_len + and prompt.get("prompt_embeds", None) is not None): + if not prompt.get("prompt_token_ids", None): + seq_len = prompt["prompt_embeds"].shape[0] + prompt["prompt_token_ids"] = [0] * seq_len + if params.prompt_logprobs is not None: + raise ValueError( + "prompt_logprobs is not compatible with prompt embeds.") processed_inputs = self.input_preprocessor.preprocess( prompt, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 044f08f32b0d3..0c61c48da0bc8 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -112,6 +112,11 @@ class OpenAIServingCompletion(OpenAIServing): return self.create_error_response( "Echo is unsupported with prompt embeds.") + if (request.prompt_logprobs is not None + and request.prompt_embeds is not None): + return self.create_error_response( + "prompt_logprobs is not compatible with prompt embeds.") + request_id = ( f"cmpl-" f"{self._base_request_id(raw_request, request.request_id)}") From 3bc18127ff1c644257abcf84a1a56fab8c0d3f0c Mon Sep 17 00:00:00 2001 From: Chaojun Zhang <chaojun.zhang@intel.com> Date: Thu, 18 Sep 2025 12:30:10 +0800 Subject: [PATCH 401/584] [XPU] Whisper model support on XPU Platform (#25123) Signed-off-by: chzhang <chaojun.zhang@intel.com> --- vllm/attention/layer.py | 4 ++-- vllm/v1/worker/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 22dc6dcbc8d62..15c0ce33e9659 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module): backend = _Backend.FLASH_ATTN use_upstream_fa = True - if current_platform.is_rocm(): - # currently, only torch_sdpa is supported on rocm + if current_platform.is_rocm() or current_platform.is_xpu(): + # currently, only torch_sdpa is supported on rocm/xpu self.attn_backend = _Backend.TORCH_SDPA else: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index fc831a73a75e3..b76ac633892f3 100644 --- a/vllm/v1/worker/utils.py +++ 
b/vllm/v1/worker/utils.py @@ -282,7 +282,7 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda(): + if current_platform.is_cuda() or current_platform.is_xpu(): # We know that the GPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. From 9d8a2d86d24b8afd849d18ddb4ef51cec1c0471d Mon Sep 17 00:00:00 2001 From: YiwenC <54658925+666even666@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:51:35 -0700 Subject: [PATCH 402/584] [EPLB] Add EPLB support for hunyuan_v1 (#23078) --- vllm/model_executor/layers/fused_moe/layer.py | 4 +- vllm/model_executor/models/hunyuan_v1.py | 135 ++++++++++++++++-- 2 files changed, 123 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ae3b67a2b84e6..da513d75da4da 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1508,8 +1508,8 @@ class FusedMoE(CustomOp): return [ weight.view(self.local_num_experts, -1) for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and not name.startswith("_shared_experts.") + if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size( + []) and not name.startswith("_shared_experts.") ] def set_eplb_state( diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index db054b5c537e8..4110c8a1fd08d 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
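Side note on the get_expert_weights change in the fused_moe hunk above: Tensor.view(num_experts, -1) needs at least one element per expert, so 0-d parameters must be excluded before EPLB collects expert weights, which is exactly what the added weight.shape != torch.Size([]) guard does. A minimal sketch of the failure mode it prevents; the parameter names here are illustrative, not from the patch:

import torch

num_local_experts = 4
named_weights = {
    "w13_weight": torch.randn(num_local_experts, 8, 16),
    "scale": torch.tensor(1.0),  # hypothetical 0-d parameter
}
# Without the guard, the 0-d tensor would raise on .view(4, -1).
views = [
    w.view(num_local_experts, -1)
    for name, w in named_weights.items()
    if w.shape != torch.Size([])
]
assert [v.shape for v in views] == [torch.Size([4, 128])]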
"""Inference-only HunYuan model compatible with HuggingFace weights.""" -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from typing import Any, Optional, Union import regex as re @@ -33,8 +34,8 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul @@ -56,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_layers, maybe_prefix) @@ -355,10 +356,16 @@ class HunYuanSparseMoeBlock(nn.Module): quant_config: Optional[QuantizationConfig] = None, layer_id: int = -1, prefix: str = "", + enable_eplb: bool = False, ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + if self.tp_size > config.num_experts: raise ValueError( f"Tensor parallel size {self.tp_size} is greater than " @@ -379,8 +386,23 @@ class HunYuanSparseMoeBlock(nn.Module): config.moe_intermediate_size, int) else config.moe_intermediate_size[layer_id]) + # Load balancing settings. 
+ vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + self.experts = FusedMoE( - num_experts=config.num_experts, + num_experts=self.n_routed_experts, top_k=top_k, hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -388,6 +410,8 @@ class HunYuanSparseMoeBlock(nn.Module): renormalize=top_k > 1, quant_config=quant_config, prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, ) self.gate = ReplicatedLinear(config.hidden_size, @@ -446,6 +470,7 @@ class HunYuanDecoderLayer(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", layer_id: int = -1, + enable_eplb: bool = False, ) -> None: super().__init__() assert layer_id >= 0 @@ -509,6 +534,7 @@ class HunYuanDecoderLayer(nn.Module): quant_config=quant_config, layer_id=layer_id, prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, ) else: self.mlp = HunYuanMLP( @@ -562,6 +588,9 @@ class HunYuanModel(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + eplb_config = vllm_config.parallel_config.eplb_config + enable_eplb = vllm_config.parallel_config.enable_eplb + self.num_redundant_experts = eplb_config.num_redundant_experts self.config = config self.quant_config = quant_config @@ -588,6 +617,7 @@ class HunYuanModel(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + enable_eplb=enable_eplb, ), prefix=f"{prefix}.layers", ) @@ -674,6 +704,7 @@ class HunYuanModel(nn.Module): ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", num_experts=self.config.num_experts, + num_redundant_experts=self.num_redundant_experts, ) else: return [] @@ -803,25 +834,43 @@ class HunYuanModel(nn.Module): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + # this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name_mapped, self): continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. 
+ weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader( param, loaded_weight, - name, + name_mapped, shard_id=shard_id, expert_id=expert_id, + return_success=True, ) - break + if success: + name = name_mapped + break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue # Remapping the name of FP8 kv-scale. name = maybe_remap_kv_scale_name(name, params_dict) if name is None: @@ -841,7 +890,7 @@ class HunYuanModel(nn.Module): return loaded_params -class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): +class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -883,6 +932,64 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): else: self.lm_head = PPMissingLayer() + # Set MoE hyperparameters + self.expert_weights = [] + self.num_expert_groups = 1 + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, HunYuanDecoderLayer) + if isinstance(layer.mlp, HunYuanSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No HunYuanMoE layer found in model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + self.expert_weights.append(layer.get_expert_weights()) + # Register the expert weights. 
+ layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, HunYuanSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def forward( self, input_ids: torch.Tensor, From 5c65a72bb17b34bc6eb0d7ca43b10938c88dc7e3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Wed, 17 Sep 2025 22:05:25 -0700 Subject: [PATCH 403/584] [V0 Deprecation] Remove more V0 tests (#25117) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .buildkite/test-pipeline.yaml | 6 - .github/CODEOWNERS | 2 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 -- tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 ------ tests/async_engine/test_request_tracker.py | 71 --- tests/basic_correctness/test_preemption.py | 189 ------- tests/detokenizer/conftest.py | 11 - tests/detokenizer/test_stop_checker.py | 83 ---- .../openai/correctness/test_lmeval.py | 10 - tests/samplers/test_logprobs.py | 182 ------- tests/worker/__init__.py | 0 tests/worker/conftest.py | 11 - tests/worker/test_model_input.py | 113 ----- tests/worker/test_model_runner.py | 462 ------------------ tests/worker/test_profile.py | 68 --- tests/worker/test_swap.py | 87 ---- 18 files changed, 1500 deletions(-) delete mode 100644 tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/detokenizer/conftest.py delete mode 100644 tests/detokenizer/test_stop_checker.py delete mode 100644 tests/samplers/test_logprobs.py delete mode 100644 tests/worker/__init__.py delete mode 100644 tests/worker/conftest.py delete mode 100644 tests/worker/test_model_input.py delete mode 100644 tests/worker/test_model_runner.py delete mode 100644 tests/worker/test_profile.py delete mode 100644 tests/worker/test_swap.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0bce02b90a7cd..8dd99bf1a38f6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,22 +46,18 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/utils_ - - tests/worker - tests/standalone_tests/lazy_imports.py - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s utils_ # Utils - - pytest -v -s worker # Worker - pytest -v -s 
transformers_utils # transformers_utils - label: Python-only Installation Test # 10min @@ -82,14 +78,12 @@ steps: - vllm/ - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 771dd2e172586..b8d6db06548d5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao @@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b9..0000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - 
uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a8803291..0000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda7905..0000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await 
tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6f..0000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." - "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. 
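The preemption test further below cross-checks its counter against Prometheus by summing every sample of vllm:num_preemptions straight from the default registry. That pattern in isolation, using standard prometheus_client calls:

from prometheus_client import REGISTRY

def metric_total(name: str) -> float:
    # Sum all samples of one metric family from the default registry.
    for family in REGISTRY.collect():
        if family.name == name:
            return sum(sample.value for sample in family.samples)
    raise KeyError(f"metric {name!r} not registered")

# e.g. metric_total("vllm:num_preemptions") after a run with preemptions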
- return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py deleted file mode 100644 index f2c125355c83c..0000000000000 --- a/tests/detokenizer/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py deleted file mode 100644 index 2ca10c072b342..0000000000000 --- a/tests/detokenizer/test_stop_checker.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.inputs import token_inputs -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, Sequence, SequenceStatus - - -def sequence_with_eos(text: str, eos_token: str, - eos_token_id: int) -> Sequence: - """ - Create a Sequence that ends with an EOS token. 
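The three EOS outcomes the stop-checker test below walks through, condensed into one hypothetical helper that mirrors its assertions (not vLLM code):

def expected_eos_outcome(ignore_eos: bool, include_stop_str_in_output: bool):
    if ignore_eos:
        # Generation keeps running and the EOS token stays in the text.
        return "RUNNING", "text + eos"
    if include_stop_str_in_output:
        return "FINISHED_STOPPED", "text + eos"
    return "FINISHED_STOPPED", "text only"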
- """ - seq = Sequence( - seq_id=0, - inputs=token_inputs([]), - block_size=16, - eos_token_id=eos_token_id, - ) - seq.output_text = text + eos_token - - offset = eos_token_id + 1 - for i in range(offset, len(text) + offset): - seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) - seq.append_token_id(token_id=eos_token_id, - logprobs={eos_token_id: Logprob(0.0)}) - - seq.status = SequenceStatus.RUNNING - - return seq - - -@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ - ("This text ends with EOS token", "</s>", 2), -]) -@pytest.mark.parametrize("ignore_eos", [True, False]) -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.skip_global_cleanup -def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - ignore_eos: bool, include_stop_str_in_output: bool): - """ - Test the behavior of the StopChecker's maybe_stop_sequence method - when an EOS token is encountered. - - This test covers: - - When the EOS token should stop the sequence and be removed from the output - - When the EOS token should stop the sequence and be included in the output - - When the EOS token should be ignored, and the sequence continues - """ - - stop_checker = StopChecker(max_model_len=1024) - - seq = sequence_with_eos( - text=text_wo_eos, - eos_token=eos_token, - eos_token_id=eos_token_id, - ) - new_char_count = len(eos_token) - - # Note that `stop` and `stop_token_ids` are not specified - sampling_params = SamplingParams( - min_tokens=1, - ignore_eos=ignore_eos, - include_stop_str_in_output=include_stop_str_in_output) - - stop_checker.maybe_stop_sequence( - seq=seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - if ignore_eos: - assert seq.status == SequenceStatus.RUNNING - assert seq.output_text == text_wo_eos + eos_token - elif include_stop_str_in_output: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos + eos_token - else: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee97..624acd5ffde73 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 87f40b1005312..0000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -from ..conftest import VllmRunner - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module is V0 only since it uses dtype=float, so - set VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["float"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size -@pytest.mark.parametrize("detokenize", [True, False]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - chunked_prefill_token_size: int, - num_top_logprobs: int, - detokenize: bool, - example_prompts, -): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - max_tokens = 5 - with hf_runner(model, dtype=dtype) as hf_model: - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - - with vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - # Test whether logprobs are included in the results. - for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - assert len(result.outputs[0].logprobs) == max_tokens - for logprobs in result.outputs[0].logprobs: - # If the output token is not included in the top X - # logprob, it can return 1 more data - assert (len(logprobs) == num_top_logprobs - or len(logprobs) == num_top_logprobs + 1) - output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: list[str] = [] - for top_logprobs in result.outputs[0].logprobs: - top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens_lst.append( - top_logprob.decoded_token) - - if detokenize: - output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( - "The output text from the top logprob for each token position " - "should be the same as the output text in the result.") - else: - assert output_text == '' - assert output_string_from_most_likely_tokens_lst == ([None] * - max_tokens) - - # The first prompt logprob is always None - assert result.prompt_logprobs[0] is None - for prompt_logprobs in result.prompt_logprobs[1:]: - # If the prompt token is not included in the top X - # logprob, it can return 1 more data - assert (len(prompt_logprobs) == num_top_logprobs - or len(prompt_logprobs) == num_top_logprobs + 1) - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - # The first prompt logprob is always None, so we compare it from 1:. 
- vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob.logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, top_logprobs in enumerate(vllm_sample_logprobs): - for token_id, sample_logprob in top_logprobs.items(): - logprob = sample_logprob.logprob - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) - if detokenize: - assert isinstance(sample_logprob.decoded_token, str), ( - "The token should be decoded by the time it is returned" - " to the user.") - - # Test if prompt logprobs are correctly set. - for vllm_result in vllm_results: - token_ids = vllm_result.prompt_token_ids - prompt_logprobs = vllm_result.prompt_logprobs - - # The first token doesn't have logprob. - assert prompt_logprobs[0] is None - - for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]): - assert token_id in logprob_dict - - -def test_max_logprobs(): - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, - detokenize: bool, example_prompts): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - max_tokens = 5 - - with vllm_runner( - model, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) - - for i in range(len(results_logprobs_none)): - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py deleted file mode 100644 index 3f202d4dbe948..0000000000000 --- a/tests/worker/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py deleted file mode 100644 index 0f28ef2ba857b..0000000000000 --- a/tests/worker/test_model_input.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses - -import torch - -from vllm.attention import AttentionMetadata, AttentionMetadataBuilder -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.model_executor import SamplingMetadata -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - - -class MockAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - raise NotImplementedError - - @staticmethod - def get_impl_cls(): - raise NotImplementedError - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return AttentionMetadata - - @staticmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: - return AttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - pass - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - pass - - -def test_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. 
- assert isinstance(received_model_input, - ModelInputForGPUWithSamplingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. - assert (received_model_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_model_input.sampling_metadata.seq_groups is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 0be25aa2fc35d..0000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,462 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner - - -def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = ModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -def test_deepseek_mla_attn_backend_module(): - model_runner = _create_model_runner( - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - assert model_runner.attn_backend.__name__ == "TritonMLABackend" - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - expected_input_embeds_len = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) - - 
seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - - # Verify input metadata is correct for prompts. - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - torch.testing.assert_close( - attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - - # Test subquery start locs. - start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - # Test seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device)) - - expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device) - torch.testing.assert_close(attn_metadata.block_tables, expected) - # Cuda graph should not be used for prerill. 
- assert attn_metadata.use_cuda_graph is False - - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - if expected_input_embeds_len == 0: - torch.testing.assert_close(input_tokens, input_positions) - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=model_runner.device, - pin_memory=model_runner.pin_memory) - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - torch.allclose(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - context_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - # Assume each seq group finishes prefill. - for i in range(batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - context_lens.append(context_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len)) - output_embed = None - seq_data.update_num_computed_tokens(context_len) - # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0, output_embed) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - slot_mapping = attn_metadata.slot_mapping - - assert len(slot_mapping) == len(input_tokens) - - expected_bs = model_runner.vllm_config.pad_for_cudagraph( - len(seq_group_metadata_list)) - # Verify input metadata is correct for prompts. 
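The query_start_loc and seq_start_loc tensors these model-runner tests keep reconstructing are inclusive prefix sums over per-sequence lengths; in isolation:

seq_lens = [5, 3, 7]   # illustrative per-sequence token counts
start_loc = [0]
for n in seq_lens:
    start_loc.append(start_loc[-1] + n)
# start_loc == [0, 5, 8, 15]; sequence i owns flattened positions
# start_loc[i]:start_loc[i+1]. For pure decode every length is 1,
# so query_start_loc is simply [0, 1, 2, ...].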
- device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - seq_lens = [context_len + 1 for context_len in context_lens] - # seq_lens are padded to expected_bs - for _ in range(expected_bs - len(seq_lens)): - seq_lens.append(1) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.num_decode_tokens == len(seq_lens) - start_idx = 0 - start_loc = [start_idx] - for _ in context_lens: - # decode has only 1 token for query. - start_idx += 1 - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device)) - - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.tensor(context_lens, dtype=torch.int, device=device)) - assert attn_metadata.max_decode_seq_len == max(seq_lens) - torch.testing.assert_close( - attn_metadata.seq_lens_tensor[:len(seq_lens)], - torch.tensor(seq_lens, dtype=torch.int, device=device)) - - # block table's first index corresponds to each batch, meaning in - # decoding it is each token. - assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim corresponds to each token's block number. - # It is padded up to the maximum number of blocks per batch. - assert attn_metadata.block_tables.shape[1] == ( - model_runner.get_max_block_per_batch()) - assert attn_metadata.use_cuda_graph is True - - assert len(input_tokens) == expected_bs - assert len(input_positions) == expected_bs - if use_prompt_embeds: - expected_input_embeds_length = start_loc[-1] - assert len(input_embeds) == expected_input_embeds_length - assert expected_input_embeds_length <= expected_bs - else: - assert input_embeds is None - - # Verify Sampling - expected_selected_token_indices = [] - for selected_token_start_idx, _ in enumerate(context_lens): - expected_selected_token_indices.append(selected_token_start_idx) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query lens is all 1 for decode.
- query_lens=[1 for _ in range(len(context_lens))], - device=model_runner.device, - pin_memory=model_runner.pin_memory) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output.""" - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - - assert input_tokens is None - assert input_positions is None - assert attn_metadata is None - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - - assert input_tokens is None - assert input_positions is None - assert input_embeds is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.fixture -def distributed_init(): - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", - local_rank=0) - ensure_model_parallel_initialized(1, 1) - - -@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize('use_prompt_embeds', [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, - distributed_init, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=enforce_eager, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=True, - enable_prompt_embeds=True, - ) - - # Add prefill requests. 
- seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - prefill_metadata_list: list[SequenceGroupMetadata] = [] - decode_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - prefill_batch_size = batch_size // 2 - decode_batch_size = batch_size - prefill_batch_size - expected_input_embeds_len = 0 - for i in range(prefill_batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(seq_len), ) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - prefill_metadata_list.append(seq_group_metadata) - - # Add decode requests - for i in range(prefill_batch_size, batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - # This also iterates the expected input_embeds, because the model - # needs both the input and output embeddings passed into together - expected_input_embeds_len += 1 - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len), ) - output_embed = None - assert len(seq_data.prompt_token_ids) == context_len - seq_data.append_token_id(1, 0, output_embed) - seq_data.update_num_computed_tokens(context_len) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - decode_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - - prefill_meta_actual = attn_metadata.prefill_metadata - decode_meta_actual = attn_metadata.decode_metadata - - assert len(attn_metadata.slot_mapping) == len(input_tokens) - assert len(input_positions) == len(input_tokens) - assert attn_metadata.num_prefills == prefill_batch_size - assert attn_metadata.num_decode_tokens == decode_batch_size - assert attn_metadata.num_prefill_tokens == sum(seq_lens) - if expected_input_embeds_len == 0: - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - # Verify attn metadata is consistent. We don't need to test individual - # values here because they are tested above. 
- attn_metadata = model_runner._prepare_model_input_tensors( - seq_group_metadata_list).attn_metadata - - for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), - vars(prefill_meta_actual)): - assert attr_expected[1] == attr_actual[1] - for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), - vars(decode_meta_actual)): - assert attr_expected[1] == attr_actual[1] diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py deleted file mode 100644 index d8767f700b576..0000000000000 --- a/tests/worker/test_profile.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker - - -def test_gpu_memory_profiling(): - # Tests the gpu profiling that happens in order to determine the number of - # KV cache blocks that we can allocate on the GPU. - # This test mocks the maximum available gpu memory so that it can run on - # any gpu setup. - - # Set up engine args to build a worker. - engine_args = EngineArgs(model="facebook/opt-125m", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): - current_usage = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - mock_total_bytes = 10 * 1024**3 - free = mock_total_bytes - current_usage - - return (free, mock_total_bytes) - - from unittest.mock import patch - with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): - # Load the model so we can profile it - worker.init_device() - worker.load_model() - gpu_blocks, _ = worker.determine_num_available_blocks() - - # Peak vram usage by torch should be 0.47 GiB - # Model weights take 0.25 GiB - # No memory should be allocated outside of torch - # 9.0 GiB should be the utilization target - # 8.28 GiB should be available for the KV cache - block_size = CacheEngine.get_cache_block_size( - engine_config.cache_config, engine_config.model_config, - engine_config.parallel_config) - - expected_blocks = (8.28 * 1024**3) // block_size - - # Check within a small tolerance for portability - # Hardware, kernel, or dependency changes could all affect memory - # utilization. - # A 100 block tolerance here should be about 60MB of wiggle room. - assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py deleted file mode 100644 index 6d9f404ac207b..0000000000000 --- a/tests/worker/test_swap.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.worker import Worker - - -def test_swap() -> None: - # Configure the engine. 
- engine_args = EngineArgs(model="distilbert/distilgpt2", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Initialize the worker. - worker.init_device() - worker.load_model() - worker.initialize_cache( - num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - # Randomly initialize the cache. - gpu_cache = worker.cache_engine[0].gpu_cache - cpu_cache = worker.cache_engine[0].cpu_cache - num_layers = len(gpu_cache) - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - gpu_key_cache.random_() - gpu_value_cache.random_() - cpu_key_cache, cpu_value_cache = cpu_cache[i] - cpu_key_cache.random_() - cpu_value_cache.random_() - - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=0.0, atol=0.0) - - # Test swap out. - blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)] - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=[], - blocks_to_swap_in=[], - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=[], - ) - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in blocks_to_swap_out: - assert allclose(gpu_key_cache[src], cpu_key_cache[dst]) - assert allclose(gpu_value_cache[src], cpu_value_cache[dst]) - - # Test swap in. 
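Both halves of this test exercise the same primitive: a swap is just an indexed block copy between the GPU and CPU cache tensors, with the copy direction reversed for swap-in. A rough sketch under toy shapes (not the CacheEngine API):

    import torch

    gpu_key_cache = torch.rand(100, 16)    # 100 blocks x 16 slots (toy sizes)
    cpu_key_cache = torch.zeros(100, 16)
    for src, dst in [(3, 72), (56, 35), (84, 34)]:
        # Swap-out copies GPU block `src` into CPU block `dst`;
        # swap-in performs the same copy with the roles reversed.
        cpu_key_cache[dst].copy_(gpu_key_cache[src])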
- execute_model_req.blocks_to_swap_out = [] - execute_model_req.blocks_to_swap_in = [ - (19, 45), - (67, 23), - (12, 78), - (40, 99), - (1, 71), - ] - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in execute_model_req.blocks_to_swap_in: - assert allclose(gpu_key_cache[dst], cpu_key_cache[src]) - assert allclose(gpu_value_cache[dst], cpu_value_cache[src]) From b7433ca1a47732394b1bdea4099d98389515954b Mon Sep 17 00:00:00 2001 From: Benjamin Chislett <bchislett@nvidia.com> Date: Thu, 18 Sep 2025 01:07:24 -0400 Subject: [PATCH 404/584] [Spec Decode] Efficient padded speculation (#24539) Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> --- tests/v1/spec_decode/test_eagle.py | 179 +++++++++++++++++++- vllm/config/speculative.py | 5 + vllm/v1/spec_decode/eagle.py | 258 +++++++++++++++++++++++++---- vllm/v1/worker/gpu_input_batch.py | 5 +- vllm/v1/worker/gpu_model_runner.py | 164 +++++++++++------- 5 files changed, 507 insertions(+), 104 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index ccab04628a163..e7f6b68fc3f77 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,6 +19,8 @@ from vllm.config.load import LoadConfig from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.platforms import current_platform from vllm.v1.spec_decode.eagle import EagleProposer +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch model_dir = "meta-llama/Llama-3.1-8B-Instruct" eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" @@ -64,6 +66,86 @@ def _create_proposer( device=current_platform.device_type) +def test_prepare_next_token_ids(): + """ + Test for prepare_next_token_ids_cpu and prepare_next_token_ids_padded. + Each will produce a device tensor of next_token_ids, taking as input + either the GPU tensor of sampled_token_ids with -1 for rejected tokens, + or the CPU python list[list[int]] with the rejected tokens removed. 
+ """ + device = torch.device(current_platform.device_type) + + num_requests = 4 + num_speculative_tokens = 4 + batch_spec = BatchSpec( + seq_lens=[num_speculative_tokens + 1] * num_requests, + query_lens=[num_speculative_tokens + 1] * num_requests, + ) + + req_ids = [f"req_{i+1}" for i in range(num_requests)] + mock_input_batch = mock.MagicMock(spec=InputBatch) + mock_input_batch.req_ids = req_ids + mock_input_batch.num_reqs = num_requests + mock_input_batch.vocab_size = 100 + + mock_num_scheduled_tokens = {req_id: 0 for req_id in req_ids} + mock_requests = {} + for req_id in req_ids: + mock_request = mock.MagicMock(spec=CachedRequestState) + # Each request will have a backup next token id of 10, 20, 30, 40 + mock_request.get_token_id.return_value = int(req_id.split("_")[1]) * 10 + mock_request.num_computed_tokens = 0 + mock_requests[req_id] = mock_request + + sampled_token_ids = [ + [0, 1, -1, -1, -1], # 1 accepted, 3 rejected, "1" sampled + [0, 1, 2, 3, 4], # all accepted, "4" sampled + [-1, -1, -1, -1, -1], # sampling skipped, use backup token "30" + [-1, -1, -1, -1, -1] # this request will be discarded + ] + sampled_token_ids_tensor = torch.tensor(sampled_token_ids, + dtype=torch.int32, + device=device) + sampled_token_ids_cpu = [[i for i in seq if i != -1] + for seq in sampled_token_ids] + + expected_next_token_ids_cpu = [1, 4, 30, 40] + expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu, + dtype=torch.int32, + device=device) + + proposer = _create_proposer("eagle", num_speculative_tokens) + + next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu( + sampled_token_ids_cpu, mock_requests, mock_input_batch, + mock_num_scheduled_tokens) + + assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor) + + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + + discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device) + num_discarded_reqs = 1 + + expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0], + dtype=torch.int32, + device=device) + + next_token_ids_from_padded, valid_sampled_tokens_count = \ + proposer.prepare_next_token_ids_padded( + common_attn_metadata, sampled_token_ids_tensor, mock_requests, + mock_input_batch, discarded_req_indices, num_discarded_reqs) + + assert torch.equal(next_token_ids_from_padded, + expected_next_token_ids_tensor) + assert torch.equal(valid_sampled_tokens_count, + expected_valid_sampled_tokens_count) + + def test_prepare_inputs(): """ cu_target_query_lens: [0, a, a + b, a + b + c] @@ -90,10 +172,24 @@ def test_prepare_inputs(): device=device, ) - # Rejected tokens per request: [1, 3, 2] - num_rejected_tokens = torch.tensor([1, 3, 2], - dtype=torch.int32, - device=device) + # If there are `k` sampled tokens, then `k-1` tokens are draft tokens + # from the previous iteration, and the last token is the bonus token sampled + # from the base model. 
+ num_draft_tokens = [3, 6, 4] # one less than query_lens + # num rejected tokens is [1, 3, 2] + ACCEPT_TOKEN = 0 + BONUS_TOKEN = 1 + REJECT_TOKEN = -1 + sampled_token_ids = [ + [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN], + [ + ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, + REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN + ], + [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN] + ] + sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN] + for seq in sampled_token_ids] # Expected calculations: # query_len_per_req = [4, 7, 5] @@ -125,7 +221,7 @@ def test_prepare_inputs(): proposer = _create_proposer("eagle", 1) updated_metadata, token_indices = proposer.prepare_inputs( - common_attn_metadata, num_rejected_tokens.cpu()) + common_attn_metadata, sampled_token_ids, num_draft_tokens) assert torch.equal(updated_metadata.query_start_loc, expected_cu_num_tokens) @@ -133,6 +229,77 @@ def test_prepare_inputs(): assert torch.equal(token_indices, expected_token_indices) +def test_prepare_inputs_padded(): + """ + Input scenario is 3 requests with num_speculative_tokens == 2 and: + - Request 1: query_len = 3, rejected = 1 + - Request 2: query_len = 3, rejected = 0 + - Request 3: query_len = 3, rejected = 2 + + Expected outputs: + token_indices: [0, 1, 2, + 3, 4, 5, + 6, 7, 8] + Reason: Deferred computation should not disturb the original indices. + + token_indices_to_sample: [1, 5, 6] + Reason: After accounting for rejections, these are the valid token positions + from the original indices to sample from. + """ + + device = torch.device(current_platform.device_type) + + expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8], + dtype=torch.int32, + device=device) + expected_token_indices_to_sample = torch.tensor([1, 5, 6], + dtype=torch.int32, + device=device) + + num_speculative_tokens = 2 + batch_spec = BatchSpec( + seq_lens=[3, 3, 3], + query_lens=[3, 3, 3], + ) + + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + + # Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9] + expected_query_start_loc = torch.tensor([0, 3, 6, 9], + dtype=torch.int32, + device=device) + spec_decode_metadata = SpecDecodeMetadata.make_dummy( + draft_token_ids=[[0] * num_speculative_tokens] * 3, + device=device, + ) + + # num_rejected_tokens = [1, 0, 2] + # num_draft_tokens = [2, 2, 2] + # valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens + valid_sampled_tokens_count = torch.tensor([2, 3, 1], + dtype=torch.int32, + device=device) + + proposer = _create_proposer("eagle", num_speculative_tokens) + + output_metadata, token_indices, token_indices_to_sample = \ + proposer.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count) + + assert output_metadata.max_query_len == 3 + assert torch.equal(output_metadata.query_start_loc, + expected_query_start_loc) + assert torch.equal(token_indices, expected_token_indices) + assert torch.equal(token_indices_to_sample, + expected_token_indices_to_sample) + + @pytest.mark.parametrize("method", ["eagle", "eagle3"]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -373,6 +540,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=None, common_attn_metadata=common_attn_metadata, sampling_metadata=sampling_metadata) @@ -526,6 
+694,7 @@ def test_propose_tree(spec_token_tree): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=None, common_attn_metadata=common_attn_metadata, sampling_metadata=sampling_metadata) assert result.shape == (batch_size, num_speculative_tokens) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index b2d50e3852337..fca8c28e5c61e 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -83,6 +83,11 @@ class SpeculativeConfig: disable_by_batch_size: Optional[int] = None """Disable speculative decoding for new incoming requests when the number of enqueued requests is larger than this value, if provided.""" + disable_padded_drafter_batch: bool = False + """Disable input padding for speculative decoding. If set to True, + speculative input batches can contain sequences of different lengths, + which may only be supported by certain attention backends. This currently + only affects the EAGLE method of speculation.""" # Ngram proposer configuration prompt_lookup_max: Optional[int] = None diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5154b29405b6e..2a178ddf48777 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -27,6 +27,9 @@ from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.ubatching import dbo_current_ubatch_id logger = init_logger(__name__) @@ -94,20 +97,26 @@ class EagleProposer: dtype=self.dtype, device=device) + # We need +1 here because the arange is used to set query_start_loc, + # which has one more element than batch_size. max_batch_size = vllm_config.scheduler_config.max_num_seqs - self.arange = torch.arange( - # We need +1 here because the arange is used to set query_start_loc, - # which has one more element than batch_size. - max_batch_size + 1, - device=device, - dtype=torch.int32, - ) + max_num_slots_for_arange = max(max_batch_size + 1, self.max_num_tokens) + self.arange = torch.arange(max_num_slots_for_arange, + device=device, + dtype=torch.int32) self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device) + self.backup_next_token_ids = CpuGpuBuffer( + max_batch_size, + dtype=torch.int32, + pin_memory=is_pin_memory_available(), + device=device, + with_numpy=True) + # Determine allowed attention backends once during initialization. self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] 
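The CpuGpuBuffer allocated a few lines up follows the usual pinned-host staging pattern: fill a pinned NumPy view on the CPU, then issue a non-blocking copy to the device mirror. A minimal sketch of the same idea, as a hypothetical helper rather than vLLM's actual class:

    import torch

    class PinnedStagingBuffer:
        def __init__(self, size: int, dtype: torch.dtype,
                     device: torch.device):
            # Pinned host memory lets the H2D copy below overlap with
            # other GPU work instead of blocking.
            self.cpu = torch.zeros(size, dtype=dtype, pin_memory=True)
            self.np = self.cpu.numpy()  # writable host-side view
            self.gpu = torch.zeros(size, dtype=dtype, device=device)

        def copy_to_gpu(self, n: int) -> torch.Tensor:
            self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
            return self.gpu[:n]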
if current_platform.is_rocm(): @@ -156,13 +165,16 @@ class EagleProposer: target_hidden_states: torch.Tensor, # [batch_size] next_token_ids: torch.Tensor, + last_token_indices: Optional[torch.Tensor], common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, mm_embeds: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] - last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 + + if last_token_indices is None: + last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 if self.method == "eagle3": assert isinstance(self.model, Eagle3LlamaForCausalLM) @@ -228,6 +240,12 @@ class EagleProposer: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) + + # Early exit if there is only one draft token to be generated. + if self.num_speculative_tokens == 1: + draft_token_ids = logits.argmax(dim=-1) + return draft_token_ids.view(-1, 1) + positions = target_positions[last_token_indices] hidden_states = hidden_states[last_token_indices] @@ -245,15 +263,12 @@ class EagleProposer: draft_token_ids = logits.argmax(dim=-1) - # Early exit if there is only one draft token to be generated. - if self.num_speculative_tokens == 1: - # [batch_size, 1] - return draft_token_ids.view(-1, 1) - - # TODO: Currently, MTP module released by deepseek only has - # one layer. Adapt this code to support multiple layers once - # there's a multi-layer MTP module. - assert isinstance(attn_metadata, self.allowed_attn_types) + if not isinstance(attn_metadata, self.allowed_attn_types): + raise ValueError( + f"Unsupported attention metadata type for speculative " + "decoding with num_speculative_tokens > 1: " + f"{type(attn_metadata)}. Supported types are: " + f"{self.allowed_attn_types}") # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] @@ -263,10 +278,13 @@ class EagleProposer: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size - attn_metadata.num_actual_tokens = batch_size - attn_metadata.max_query_len = 1 - attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for _ in range(self.num_speculative_tokens - 1): + + common_attn_metadata.num_actual_tokens = batch_size + common_attn_metadata.max_query_len = 1 + common_attn_metadata.query_start_loc = self.arange[:batch_size + 1] + common_attn_metadata.query_start_loc_cpu = torch.from_numpy( + self.token_arange_np[:batch_size + 1]).clone() + for token_index in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -286,27 +304,38 @@ class EagleProposer: positions) # Increment the sequence lengths. - attn_metadata.max_seq_len += 1 - attn_metadata.seq_lens += 1 - # Consider max model length. - attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, - self.max_model_len) + common_attn_metadata.seq_lens += 1 + common_attn_metadata.seq_lens_cpu += 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. - attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) + common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, + 1) + + common_attn_metadata.num_computed_tokens_cpu = \ + common_attn_metadata.seq_lens_cpu - 1 # Compute the slot mapping. 
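The lines that follow implement standard paged-KV slot indexing: a token at position p lives in logical block p // block_size at offset p % block_size, and the block table maps the logical block to a physical block id. A toy worked example with assumed values:

    block_size = 16
    position = 21
    block_table_row = [4, 9, 2]  # physical block ids for one request (toy)
    block_id = block_table_row[position // block_size]   # logical block 1 -> 9
    slot = block_id * block_size + position % block_size  # 9 * 16 + 5
    assert slot == 149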
block_numbers = clamped_positions // self.block_size - block_ids = attn_metadata.block_table.gather( + block_ids = common_attn_metadata.block_table_tensor.gather( dim=1, index=block_numbers.view(-1, 1)) block_ids = block_ids.view(-1) - attn_metadata.slot_mapping = (block_ids * self.block_size + - clamped_positions % self.block_size) + common_attn_metadata.slot_mapping = ( + block_ids * self.block_size + + clamped_positions % self.block_size) # Mask out the slot mappings that exceed the max model length. # Otherwise, the KV cache will be inadvertently updated with the # padding tokens. - attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len, - PADDING_SLOT_ID) + common_attn_metadata.slot_mapping.masked_fill_( + exceeds_max_model_len, PADDING_SLOT_ID) + + # Rebuild attention metadata + attn_metadata_builder = \ + self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + attn_metadata = attn_metadata_builder\ + .build_for_drafting(common_attn_metadata=common_attn_metadata, + draft_index=token_index + 1) + for layer_name in self.attn_layer_names: + per_layer_attn_metadata[layer_name] = attn_metadata # copy inputs to buffer for cudagraph self.input_ids[:batch_size] = input_ids @@ -347,6 +376,158 @@ class EagleProposer: draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids + def prepare_next_token_ids_cpu( + self, sampled_token_ids: list[list[int]], + requests: dict[str, + CachedRequestState], gpu_input_batch: InputBatch, + num_scheduled_tokens: dict[str, int]) -> torch.Tensor: + """ + This function is used to prepare the inputs for speculative decoding. + It calculates the next token ids for each request based on the sampled + token ids from the CPU. If a request has no sampled token ids (e.g., + during the initial decoding steps), it falls back to using the request + state to get the next token id. + """ + req_ids = gpu_input_batch.req_ids + next_token_ids: list[int] = [] + for i, token_ids in enumerate(sampled_token_ids): + if token_ids: + # Common case. + next_token_id = token_ids[-1] + else: + # Partial prefill (rare case). + # Get the next token id from the request state. + req_id = req_ids[i] + req_state = requests[req_id] + seq_len = (req_state.num_computed_tokens + + num_scheduled_tokens[req_id]) + next_token_id = req_state.get_token_id(seq_len) + next_token_ids.append(next_token_id) + next_token_ids = torch.tensor(next_token_ids, + dtype=torch.int32, + device=self.input_ids.device) + return next_token_ids + + def prepare_next_token_ids_padded(self, + common_attn_metadata: CommonAttentionMetadata, + sampled_token_ids: torch.Tensor, + requests: dict[str, CachedRequestState], + gpu_input_batch: InputBatch, + discard_request_indices: torch.Tensor, + num_discarded_requests: int) -> \ + tuple[torch.Tensor, torch.Tensor]: + """ + This function is used to prepare the inputs for speculative decoding. + It calculates the next token ids and the number of valid sampled tokens + for each request, considering the "discarded" requests whose next token + is not sampled and comes from `request.get_token_id()` instead. + It also accounts for the rejected tokens in `sampled_token_ids`. + This function must use device functions to operate on the inputs, and + should not introduce any blocking CPU-GPU synchronization. 
+ """ + # TODO(Ben): Combine this into a custom fused kernel + + # Precompute get_token_id for when there is no valid next token + num_reqs = gpu_input_batch.num_reqs + self.backup_next_token_ids.np[:num_reqs] = np.array([ + requests[gpu_input_batch.req_ids[i]].get_token_id( + common_attn_metadata.seq_lens_cpu[i].item()) + for i in range(num_reqs) + ]) + self.backup_next_token_ids.copy_to_gpu(num_reqs) + + # Mask out the sampled tokens indices that should not be sampled. + discard_sampled_tokens_req_indices = \ + discard_request_indices[:num_discarded_requests] + + valid_sampled_token_ids_gpu = sampled_token_ids.clone() + valid_sampled_token_ids_gpu.index_fill_( + 0, discard_sampled_tokens_req_indices, -1) + + # Generate a mask for all valid tokens within those requests + max_gen_len = sampled_token_ids.shape[-1] + if max_gen_len == 1: + valid_mask = torch.ones_like(valid_sampled_token_ids_gpu, + dtype=torch.bool) + else: + valid_mask = ( + (valid_sampled_token_ids_gpu != -1) & + (valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size)) + + # Count the number of valid tokens in each request + valid_sampled_tokens_count = valid_mask.sum(dim=1) + + # Get the rightmost valid index per row + last_valid_indices = valid_sampled_tokens_count - 1 + last_valid_indices_safe = torch.clamp(last_valid_indices, min=0) + + # Get last valid token from each row + # (assume undefined state where there is no valid token) + selected_tokens = torch.gather( + valid_sampled_token_ids_gpu, 1, + last_valid_indices_safe.unsqueeze(1)).squeeze(1) + + # Use last token if valid, pre-computed backup if not + batch_size = valid_sampled_token_ids_gpu.shape[0] + next_token_ids = torch.where( + last_valid_indices != -1, selected_tokens, + self.backup_next_token_ids.gpu[:batch_size]) + + return next_token_ids, valid_sampled_tokens_count + + def prepare_inputs_padded(self, + common_attn_metadata: CommonAttentionMetadata, + spec_decode_metadata: SpecDecodeMetadata, + valid_sampled_tokens_count: torch.Tensor) -> \ + tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]: + """ + This function is used to prepare the inputs for speculative decoding + It updates the common_attn_metadata for speculative decoding, + but does not consider the rejected tokens. Instead, all tokens + are included as inputs to the speculator, with the rejected tokens + used as padding and filtered out later by `token_indices_to_sample`. + No blocking CPU operations should be introduced in this function. + """ + num_draft_tokens_gpu = torch.cat([ + spec_decode_metadata.cu_num_draft_tokens[0:1], + spec_decode_metadata.cu_num_draft_tokens[1:] - + spec_decode_metadata.cu_num_draft_tokens[:-1] + ]) + + num_rejected_tokens_gpu = torch.where( + num_draft_tokens_gpu > 0, + num_draft_tokens_gpu + 1 - valid_sampled_tokens_count, + torch.zeros_like(num_draft_tokens_gpu)) + + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + + new_query_len_per_req = (query_start_loc_cpu[1:] - + query_start_loc_cpu[:-1]) + + total_num_tokens = query_start_loc_cpu[-1].item() + token_indices = self.arange[:total_num_tokens] + + spec_common_attn_metadata = CommonAttentionMetadata( + query_start_loc=common_attn_metadata.query_start_loc, + seq_lens=common_attn_metadata.seq_lens, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens_cpu=common_attn_metadata.seq_lens_cpu, + num_computed_tokens_cpu=common_attn_metadata. 
+ num_computed_tokens_cpu, + num_reqs=common_attn_metadata.num_reqs, + num_actual_tokens=total_num_tokens, + max_query_len=new_query_len_per_req.max().item(), + max_seq_len=common_attn_metadata.seq_lens_cpu.max().item(), + block_table_tensor=common_attn_metadata.block_table_tensor, + slot_mapping=common_attn_metadata.slot_mapping[token_indices], + causal=True, + ) + + token_indices_to_sample = common_attn_metadata.query_start_loc[1:] - 1 \ + - num_rejected_tokens_gpu + + return spec_common_attn_metadata, token_indices, token_indices_to_sample + def propose_tree( self, batch_size: int, @@ -520,11 +701,11 @@ class EagleProposer: def prepare_inputs( self, common_attn_metadata: CommonAttentionMetadata, - # [batch_size] - num_rejected_tokens: torch.Tensor + sampled_token_ids: list[list[int]], + num_draft_tokens: list[int], ) -> tuple[CommonAttentionMetadata, torch.Tensor]: """ - This function is used to prepare the inputs for the spec decode. + This function is used to prepare the inputs for speculative decoding. It updates to the common_attn_metadata to account for the rejected tokens (and newly sampled tokens). It also returns the token indices of the tokens that should be fed to the speculator. @@ -545,6 +726,13 @@ class EagleProposer: # q1, q1 + 1, ..., q1 + q2 - n2 - 1, # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + num_rejected_tokens = [ + n + 1 - len(sampled_token_ids[i]) if n > 0 else 0 + for i, n in enumerate(num_draft_tokens) + ] + num_rejected_tokens = torch.tensor(num_rejected_tokens, + dtype=torch.int32) + device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu new_seq_lens_cpu = common_attn_metadata.seq_lens_cpu \ diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 339b9937b73f4..6717622efb801 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -64,7 +64,10 @@ class CachedRequestState: def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: return self.prompt_token_ids[idx] - return self.output_token_ids[idx - self.num_prompt_tokens] + elif idx - self.num_prompt_tokens < len(self.output_token_ids): + return self.output_token_ids[idx - self.num_prompt_tokens] + else: + return -1 class InputBatch: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f256dc160a6b5..e8ad9c2fca07c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -344,6 +344,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.hidden_size, dtype=self.dtype, numpy=False) + self.discard_request_indices = self._make_buffer(self.max_num_reqs, + dtype=torch.int64) + self.num_discarded_requests = 0 + self.num_draft_tokens = self._make_buffer(self.max_num_reqs, dtype=torch.int32) self.num_accepted_tokens = self._make_buffer(self.max_num_reqs, @@ -974,6 +978,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): seq_lens = self.seq_lens.gpu[:num_reqs] max_seq_len = self.seq_lens.np[:num_reqs].max().item() + num_tokens = [ + self.requests[r].num_tokens for r in self.input_batch.req_ids + ] + num_tokens_np = np.array(num_tokens, dtype=np.int32) + + # Record the index of requests that should not be sampled, + # so that we could clear the sampled tokens before returning + discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np + discard_request_indices = np.nonzero(discard_requests_mask)[0] + self.num_discarded_requests = 
len(discard_request_indices) + self.discard_request_indices.np[:self.num_discarded_requests] = ( + discard_request_indices) + + self.discard_request_indices.copy_to_gpu(self.num_discarded_requests) + # Copy the tensors to the GPU. self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens) @@ -1973,23 +1992,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if envs.VLLM_COMPUTE_NANS_IN_LOGITS: num_nans_in_logits = self._get_nans_in_logits(logits) - # TODO(woosuk): The following loop can be slow since it iterates over - # the requests one by one. Optimize. - discard_sampled_tokens_req_indices = [] - for i, req_id in enumerate(self.input_batch.req_ids): - req_state = self.requests[req_id] - seq_len = (req_state.num_computed_tokens + - scheduler_output.num_scheduled_tokens[req_id]) - if seq_len < req_state.num_tokens: - # Ignore the sampled token for partial prefills. - # Rewind the generator state as if the token was not sampled. - # This relies on cuda-specific torch-internal impl details - generator = self.input_batch.generators.get(i) - if generator is not None: - generator.set_offset(generator.get_offset() - 4) - # Record the index of the request that should not be sampled, - # so that we could clear the sampled tokens before returning. - discard_sampled_tokens_req_indices.append(i) + discard_sampled_tokens_req_indices = \ + self.discard_request_indices.np[:self.num_discarded_requests] + for i in discard_sampled_tokens_req_indices: + gen = self.input_batch.generators.get(int(i)) + if gen is not None: + gen.set_offset(gen.get_offset() - 4) # Copy some objects so they don't get modified after returning. # This is important when using async scheduling. @@ -2026,10 +2034,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) # Mask out the sampled tokens that should not be sampled. for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] - invalid_req_indices = list(discard_sampled_tokens_req_indices) + invalid_req_indices = discard_sampled_tokens_req_indices.tolist() invalid_req_indices_set = set(invalid_req_indices) assert sampled_token_ids.shape[-1] == 1 @@ -2229,6 +2237,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) + def propose_draft_token_ids(sampled_token_ids): + assert spec_decode_common_attn_metadata is not None + with record_function_or_nullcontext("Draft"): + self._draft_token_ids = self.propose_draft_token_ids( + scheduler_output, + sampled_token_ids, + self.input_batch.sampling_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + spec_decode_metadata, + spec_decode_common_attn_metadata, + ) + + use_padded_batch_for_eagle = self.speculative_config and \ + self.speculative_config.use_eagle() and \ + not self.speculative_config.disable_padded_drafter_batch + if use_padded_batch_for_eagle: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. 
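Schematically, the two schedules differ only in where the draft pass sits relative to bookkeeping; the function names below are illustrative stand-ins, not the runner's API:

    def run_step(use_padded_eagle, draft, bookkeep, gpu_tokens, cpu_tokens):
        if use_padded_eagle:
            draft(gpu_tokens)   # EAGLE consumes the GPU tensor directly,
            bookkeep()          # so drafting overlaps CPU bookkeeping.
        else:
            bookkeep()          # materialize the CPU token lists first;
            draft(cpu_tokens)   # ngram/medusa draft from those lists.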
+ propose_draft_token_ids(sampler_output.sampled_token_ids) + with record_function_or_nullcontext("Bookkeep"): ( num_nans_in_logits, @@ -2242,19 +2272,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits, hidden_states, num_scheduled_tokens) - if self.speculative_config: - assert spec_decode_common_attn_metadata is not None - with record_function_or_nullcontext("Draft"): - self._draft_token_ids = self.propose_draft_token_ids( - scheduler_output, - valid_sampled_token_ids, - self.input_batch.sampling_metadata, - hidden_states, - sample_hidden_states, - aux_hidden_states, - spec_decode_metadata, - spec_decode_common_attn_metadata, - ) + if self.speculative_config and not use_padded_batch_for_eagle: + # ngram and other speculative decoding methods use the sampled + # tokens on the CPU, so they are run after bookkeeping. + propose_draft_token_ids(valid_sampled_token_ids) with record_function_or_nullcontext("EPLB"): self.eplb_step() @@ -2294,7 +2315,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: list[list[int]], + sampled_token_ids: Union[torch.Tensor, list[list[int]]], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, @@ -2304,11 +2325,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> Union[list[list[int]], torch.Tensor]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": + assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, NgramProposer) draft_token_ids = self.propose_ngram_draft_token_ids( sampled_token_ids) elif self.speculative_config.method == "medusa": + assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) + if sample_hidden_states.shape[0] == len(sampled_token_ids): # The input to the target model does not include draft tokens. hidden_states = sample_hidden_states @@ -2329,27 +2353,37 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) elif self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - # TODO(woosuk): Refactor the loop. - req_ids = self.input_batch.req_ids - next_token_ids: list[int] = [] - for i, token_ids in enumerate(sampled_token_ids): - if token_ids: - # Common case. - next_token_id = token_ids[-1] - else: - # Partial prefill (rare case). - # Get the next token id from the request state. - req_id = req_ids[i] - req_state = self.requests[req_id] - seq_len = (req_state.num_computed_tokens + - scheduler_output.num_scheduled_tokens[req_id]) - next_token_id = req_state.get_token_id(seq_len) - next_token_ids.append(next_token_id) - next_token_ids = torch.tensor(next_token_ids, - dtype=torch.int32, - device=self.device) + + if self.speculative_config.disable_padded_drafter_batch: + # When padded-batch is disabled, the sampled_token_ids should be + # the cpu-side list[list[int]] of valid sampled tokens for each + # request, with invalid requests having empty lists. + assert isinstance(sampled_token_ids, list), \ + "sampled_token_ids should be a python list when " \ + "padded-batch is disabled."
+ next_token_ids = self.drafter.prepare_next_token_ids_cpu( + sampled_token_ids, self.requests, self.input_batch, + scheduler_output.num_scheduled_tokens) + else: + # When using padded-batch, the sampled_token_ids should be + # the gpu tensor of sampled tokens for each request, of shape + # (num_reqs, num_spec_tokens + 1) with rejected tokens having + # value -1. + assert isinstance(sampled_token_ids, torch.Tensor), \ + "sampled_token_ids should be a torch.Tensor when " \ + "padded-batch is enabled." + next_token_ids, valid_sampled_tokens_count = \ + self.drafter.prepare_next_token_ids_padded( + common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_indices.gpu, + self.num_discarded_requests + ) if spec_decode_metadata is None: + token_indices_to_sample = None # input_ids can be None for multimodal models. target_token_ids = self.input_ids.gpu[:num_scheduled_tokens] # TODO(woosuk): Support M-RoPE. @@ -2361,17 +2395,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: target_hidden_states = hidden_states[:num_scheduled_tokens] else: - # TODO(woosuk): Refactor this. - num_draft_tokens = spec_decode_metadata.num_draft_tokens - num_rejected_tokens = [ - n + 1 - len(sampled_token_ids[i]) if n > 0 else 0 - for i, n in enumerate(num_draft_tokens) - ] - num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens, - dtype=torch.int32) - common_attn_metadata, token_indices =\ - self.drafter.prepare_inputs( - common_attn_metadata, num_rejected_tokens_cpu) + if self.speculative_config.disable_padded_drafter_batch: + token_indices_to_sample = None + common_attn_metadata, token_indices =\ + self.drafter.prepare_inputs( + common_attn_metadata, + sampled_token_ids, + spec_decode_metadata.num_draft_tokens) + else: + common_attn_metadata, token_indices, \ + token_indices_to_sample =\ + self.drafter.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count) target_token_ids = self.input_ids.gpu[token_indices] # TODO(woosuk): Support M-RoPE.
@@ -2391,6 +2428,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=token_indices_to_sample, sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, mm_embeds=mm_embeds, From a904ea78eaf7fc3f9b136a1ba6f6f66fb5658496 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Wed, 17 Sep 2025 22:30:02 -0700 Subject: [PATCH 405/584] [benchmark] add peak throughput metrics and plot (#23867) Signed-off-by: simon-mo <simon.mo@hey.com> --- vllm/benchmarks/lib/endpoint_request_func.py | 5 + vllm/benchmarks/serve.py | 198 ++++++++++++------- 2 files changed, 134 insertions(+), 69 deletions(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index e640630476630..066b8fe834380 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -89,6 +89,7 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" + start_time: float = 0.0 async def async_request_openai_completions( @@ -140,6 +141,7 @@ async def async_request_openai_completions( generated_text = "" st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, json=payload, @@ -272,6 +274,7 @@ async def async_request_openai_chat_completions( generated_text = "" ttft = 0.0 st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, json=payload, @@ -396,6 +399,7 @@ async def async_request_openai_audio( generated_text = "" ttft = 0.0 st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, @@ -475,6 +479,7 @@ async def async_request_openai_embeddings( output = RequestFuncOutput() st = time.perf_counter() + output.start_time = st try: async with session.post( url=api_url, diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 1aeef0fd5bd85..d8784340eba15 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -18,9 +18,11 @@ On the client side, run: import argparse import asyncio import gc +import importlib.util import json import os import random +import shutil import time import warnings from collections.abc import AsyncGenerator, Iterable @@ -46,6 +48,9 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 +TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None) + and (shutil.which("gnuplot") is not None)) + class TaskType(Enum): GENERATION = "generation" @@ -80,18 +85,23 @@ class BenchmarkMetrics: median_e2el_ms: float std_e2el_ms: float percentiles_e2el_ms: list[tuple[float, float]] + # Max output tokens per second and concurrent requests at that peak + max_output_tokens_per_s: float + max_concurrent_requests: int + @dataclass class EmbedBenchmarkMetrics: completed: int total_input: int request_throughput: float - total_token_throughput :float + total_token_throughput: float mean_e2el_ms: float std_e2el_ms: float median_e2el_ms: float percentiles_e2el_ms: float + def _get_current_request_rate( ramp_up_strategy: Optional[Literal["linear", "exponential"]], ramp_up_start_rps: Optional[int], @@ -150,8 +160,8 @@ async def get_request( assert burstiness > 0, ( f"A positive burstiness factor is expected, but given {burstiness}.") 
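For reference on the burstiness guard above: inter-arrival delays are drawn from a Gamma distribution whose shape parameter is the burstiness factor, so burstiness == 1.0 reduces to a Poisson process. A minimal sketch of that sampling, assuming the standard shape/scale parameterization:

    import numpy as np

    def sample_delay(request_rate: float, burstiness: float) -> float:
        # The scale is chosen so the mean delay stays 1 / request_rate for
        # any burstiness; burstiness < 1 yields burstier arrivals.
        theta = 1.0 / (request_rate * burstiness)
        return float(np.random.gamma(shape=burstiness, scale=theta))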
# Convert to list to get length for ramp-up calculations - if isinstance(input_requests, Iterable) and not isinstance( - input_requests, list): + if isinstance(input_requests, + Iterable) and not isinstance(input_requests, list): input_requests = list(input_requests) total_requests = len(input_requests) @@ -161,12 +171,9 @@ async def get_request( request_rates = [] delay_ts = [] for request_index, request in enumerate(input_requests): - current_request_rate = _get_current_request_rate(ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate) + current_request_rate = _get_current_request_rate( + ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps, + request_index, total_requests, request_rate) request_rates.append(current_request_rate) if current_request_rate == float("inf"): delay_ts.append(0) @@ -206,10 +213,8 @@ async def get_request( def calculate_metrics_for_embeddings( - outputs: list[RequestFuncOutput], - dur_s: float, - selected_percentiles: list[float] -) -> EmbedBenchmarkMetrics: + outputs: list[RequestFuncOutput], dur_s: float, + selected_percentiles: list[float]) -> EmbedBenchmarkMetrics: """Calculate the metrics for the embedding requests. Args: @@ -242,10 +247,8 @@ def calculate_metrics_for_embeddings( mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles - ], + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], ) return metrics @@ -336,6 +339,67 @@ def calculate_metrics( "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.", stacklevel=2) + + # Calculate max output tokens per second metric + max_output_tokens_per_s = 0.0 + max_concurrent_requests = 0 + + # Find the time range across all successful requests + successful_outputs = [output for output in outputs if output.success] + if successful_outputs: + min_start_time = min(output.start_time + for output in successful_outputs) + max_end_time = max(output.start_time + output.latency + for output in successful_outputs) + + # Create second buckets (ceiling to ensure we capture all time) + duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1 + tokens_per_second = np.zeros(duration_seconds) + concurrent_requests_per_second = np.zeros(duration_seconds) + + for i, output in enumerate(successful_outputs): + # Calculate token generation timestamp using + # start_time, ttft, and itl + token_times = [output.start_time + output.ttft] + current_time = token_times[0] + for itl_value in output.itl: + current_time += itl_value + token_times.append(current_time) + + # Add tokens to second buckets + for token_time in token_times: + second_bucket = int(token_time - min_start_time) + if 0 <= second_bucket < duration_seconds: + tokens_per_second[second_bucket] += 1 + + # Track concurrent requests for each second this request was active + request_start_second = int(output.start_time - min_start_time) + request_end_second = int((output.start_time + output.latency) - + min_start_time) + + for second in range(request_start_second, request_end_second + 1): + concurrent_requests_per_second[second] += 1 + + # Find the maximum tokens per second and corresponding + # concurrent requests + if len(tokens_per_second) > 0: + max_output_tokens_per_s = float(np.max(tokens_per_second)) + max_concurrent_requests = int( + 
np.max(concurrent_requests_per_second)) + + if TERM_PLOTLIB_AVAILABLE: + import termplotlib as tpl + fig = tpl.figure() + fig.plot(np.arange(len(tokens_per_second)), + tokens_per_second, + title="Output tokens per second") + fig.plot(np.arange(len(concurrent_requests_per_second)), + concurrent_requests_per_second, + title="Concurrent requests per second") + fig.show() + else: + print("tip: install termplotlib and gnuplot to plot the metrics") + metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -365,6 +429,8 @@ def calculate_metrics( median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], + max_output_tokens_per_s=max_output_tokens_per_s, + max_concurrent_requests=max_concurrent_requests, ) return metrics, actual_output_lens @@ -396,11 +462,8 @@ async def benchmark( ramp_up_end_rps: Optional[int] = None, ready_check_timeout_sec: int = 600, ): - task_type = ( - TaskType.EMBEDDING - if api_url.endswith("/v1/embeddings") - else TaskType.GENERATION - ) + task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else + TaskType.GENERATION) if endpoint_type in ASYNC_REQUEST_FUNCS: if task_type == TaskType.EMBEDDING: request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"] @@ -435,14 +498,10 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert ( - test_mm_content is None - or isinstance(test_mm_content, dict) - or ( - isinstance(test_mm_content, list) - and all(isinstance(item, dict) for item in test_mm_content) - ) - ), "multi_modal_data must be a dict or list[dict]" + assert (test_mm_content is None or isinstance(test_mm_content, dict) + or (isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content)) + ), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -488,13 +547,13 @@ async def benchmark( ignore_eos=ignore_eos, extra_headers=extra_headers, extra_body=extra_body) - profile_output = await request_func( - request_func_input=profile_input, session=session) + profile_output = await request_func(request_func_input=profile_input, + session=session) if profile_output.success: print("Profiler started") - distribution = ("Poisson process" if burstiness == 1.0 - else "Gamma distribution") + distribution = ("Poisson process" + if burstiness == 1.0 else "Gamma distribution") if ramp_up_strategy is not None: print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") @@ -562,18 +621,20 @@ async def benchmark( req_lora_module = next(lora_modules) req_model_id, req_model_name = req_lora_module, req_lora_module - request_func_input = RequestFuncInput(model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_headers=extra_headers, - extra_body=extra_body, - request_id=request_id,) + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + request_id=request_id, + ) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, @@ -615,19 +676,21 @@ async def benchmark( benchmark_duration)) print("{:<40} {:<10}".format("Total 
input tokens:", metrics.total_input)) if isinstance(metrics, BenchmarkMetrics): - print("{:<40} {:<10}".format( - "Total generated tokens:", metrics.total_output)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) if isinstance(metrics, BenchmarkMetrics): - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", + metrics.max_output_tokens_per_s)) + print("{:<40} {:<10.2f}".format("Peak concurrent requests:", + metrics.max_concurrent_requests)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) @@ -648,6 +711,8 @@ async def benchmark( "itls": [output.itl for output in outputs], "generated_texts": [output.generated_text for output in outputs], "errors": [output.error for output in outputs], + "max_output_tokens_per_s": metrics.max_output_tokens_per_s, + "max_concurrent_requests": metrics.max_concurrent_requests, } else: result = { @@ -697,8 +762,8 @@ async def benchmark( if task_type == TaskType.GENERATION: process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric( - "tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -714,8 +779,8 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, ) - profile_output = await request_func( - request_func_input=profile_input, session=session) + profile_output = await request_func(request_func_input=profile_input, + session=session) if profile_output.success: print("Profiler stopped") @@ -851,7 +916,8 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -982,7 +1048,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Specify the prefix of request id.", ) - sampling_group = parser.add_argument_group("sampling parameters") sampling_group.add_argument( "--top-p", @@ -1047,8 +1112,7 @@ def add_cli_args(parser: argparse.ArgumentParser): help="The ramp-up strategy. This would be used to " "ramp up the request rate from initial RPS to final " "RPS rate (specified by --ramp-up-start-rps and " - "--ramp-up-end-rps.) over the duration of the benchmark." - ) + "--ramp-up-end-rps.) over the duration of the benchmark.") parser.add_argument( "--ramp-up-start-rps", type=int, @@ -1087,13 +1151,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError( "When using ramp-up, do not specify --request-rate. " "The request rate will be controlled by ramp-up parameters. " - "Please remove the --request-rate argument." 
- ) + "Please remove the --request-rate argument.") if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: raise ValueError( "When using --ramp-up-strategy, both --ramp-up-start-rps and " - "--ramp-up-end-rps must be specified" - ) + "--ramp-up-end-rps must be specified") if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: raise ValueError("Ramp-up start and end RPS must be non-negative") if args.ramp_up_start_rps > args.ramp_up_end_rps: @@ -1127,8 +1189,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: headers[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( - "Invalid header format. Please use KEY=VALUE format." - ) + "Invalid header format. Please use KEY=VALUE format.") tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, @@ -1215,8 +1276,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: result_json[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) + "Invalid metadata format. Please use KEY=VALUE format.") # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate From e111d5b0ae9359e2a829771105e739d36505fa69 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Wed, 17 Sep 2025 22:30:26 -0700 Subject: [PATCH 406/584] [CLI] Use streaming in CLI chat and completion commands (#23769) Signed-off-by: simon-mo <simon.mo@hey.com> --- vllm/entrypoints/cli/openai.py | 71 +++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 7c01de94a3436..1929d6a7f77af 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -45,6 +45,28 @@ def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]: return model_name, openai_client +def _print_chat_stream(stream) -> str: + output = "" + for chunk in stream: + delta = chunk.choices[0].delta + if delta.content: + output += delta.content + print(delta.content, end="", flush=True) + print() + return output + + +def _print_completion_stream(stream) -> str: + output = "" + for chunk in stream: + text = chunk.choices[0].text + if text is not None: + output += text + print(text, end="", flush=True) + print() + return output + + def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None: conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: @@ -58,14 +80,11 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None: break conversation.append({"role": "user", "content": input_message}) - chat_completion = client.chat.completions.create(model=model_name, - messages=conversation) - - response_message = chat_completion.choices[0].message - output = response_message.content - - conversation.append(response_message) # type: ignore - print(output) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) def _add_query_options( @@ -108,9 +127,11 @@ class ChatCommand(CLISubcommand): if args.quick: conversation.append({"role": "user", "content": args.quick}) - chat_completion = client.chat.completions.create( - model=model_name, messages=conversation) - print(chat_completion.choices[0].message.content) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = 
_print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) return print("Please enter a message for the chat model:") @@ -121,14 +142,11 @@ class ChatCommand(CLISubcommand): break conversation.append({"role": "user", "content": input_message}) - chat_completion = client.chat.completions.create( - model=model_name, messages=conversation) - - response_message = chat_completion.choices[0].message - output = response_message.content - - conversation.append(response_message) # type: ignore - print(output) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: @@ -168,9 +186,10 @@ class CompleteCommand(CLISubcommand): model_name, client = _interactive_cli(args) if args.quick: - completion = client.completions.create(model=model_name, - prompt=args.quick) - print(completion.choices[0].text) + stream = client.completions.create(model=model_name, + prompt=args.quick, + stream=True) + _print_completion_stream(stream) return print("Please enter prompt to complete:") @@ -179,10 +198,10 @@ class CompleteCommand(CLISubcommand): input_prompt = input("> ") except EOFError: break - completion = client.completions.create(model=model_name, - prompt=input_prompt) - output = completion.choices[0].text - print(output) + stream = client.completions.create(model=model_name, + prompt=input_prompt, + stream=True) + _print_completion_stream(stream) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From 81b16a2bc922e837267db7216a274c4d89a2cc0c Mon Sep 17 00:00:00 2001 From: Lumina <starry.qvq@gmail.com> Date: Thu, 18 Sep 2025 13:53:55 +0800 Subject: [PATCH 407/584] [Kernel] Better inf handling for grouped topk cu (#24886) Signed-off-by: lumina37 <starry.qvq@gmail.com> --- csrc/moe/grouped_topk_kernels.cu | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index accbb09858fac..b5321f748e6be 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -21,6 +21,7 @@ #include <torch/all.h> #include <cuda_fp16.h> #include <cuda_bf16.h> +#include <cuda/std/limits> #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> namespace cg = cooperative_groups; @@ -28,7 +29,6 @@ namespace cg = cooperative_groups; namespace vllm { namespace moe { -constexpr float kNegInfinity = INFINITY * -1; constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr int32_t WARP_SIZE = 32; constexpr int32_t BLOCK_SIZE = 512; @@ -411,14 +411,21 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { return __bfloat162float(val); } +template <typename T> +__device__ inline T neg_inf() { + // cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16] + // so we need to cast from fp32 + return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity()); +} + template <typename T> __device__ void topk_with_k2(T* output, T const* input, cg::thread_block_tile<32> const& tile, int32_t const lane_id, int const num_experts_per_group) { // Get the top2 per thread - T largest = -INFINITY; - T second_largest = -INFINITY; + T largest = neg_inf<T>(); + T second_largest = neg_inf<T>(); if (num_experts_per_group > WARP_SIZE) { for (int i = lane_id; i < 
num_experts_per_group; i += WARP_SIZE) { @@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel( warp_id * topk; s_topk_idx += warp_id * topk; - T value = kNegInfinity; - T topk_group_value = kNegInfinity; + T value = neg_inf<T>(); + T topk_group_value = neg_inf<T>(); int32_t num_equalto_topkth_group; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) @@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { // calculate group_idx int32_t target_num_min = WARP_SIZE - n_group + topk_group; - if (lane_id < n_group && - (isfinite(cuda_cast<float, T>( - group_scores[lane_id])))) // The check is necessary to avoid - // abnormal input - { + // The check is necessary to avoid abnormal input + if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) { value = group_scores[lane_id]; } @@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel( __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater<T>()); if (value == topk_group_value) { - value = kNegInfinity; + value = neg_inf<T>(); } pre_count_equal_to_top_value = count_equal_to_top_value; - count_equal_to_top_value = __popc(__ballot_sync( - FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity)))); + count_equal_to_top_value = + __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>()))); } num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; } @@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel( warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t, /* is_stable */ true> - queue((int32_t)topk, -INFINITY); + queue((int32_t)topk, neg_inf<T>()); int count_equalto_topkth_group = 0; - bool if_proceed_next_topk = - (topk_group_value != cuda_cast<T, float>(kNegInfinity)); + bool if_proceed_next_topk = topk_group_value != neg_inf<T>(); if (case_id < num_tokens && if_proceed_next_topk) { for (int i_group = 0; i_group < n_group; i_group++) { if ((group_scores[i_group] > topk_group_value) || @@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel( for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) { T candidates = - (i < num_experts_per_group) && isfinite(cuda_cast<float, T>( - scores_with_bias[offset + i])) + (i < num_experts_per_group) && + cuda::std::isfinite(scores_with_bias[offset + i]) ? 
scores_with_bias[offset + i] - : cuda_cast<T, float>(kNegInfinity); + : neg_inf<T>(); queue.add(candidates, offset + i); } if (group_scores[i_group] == topk_group_value) { @@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel( if (i < topk) { s_topk_value[i] = value; } - topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>()); + topk_sum += + cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>()); } } From 349e0e34627950db1cc4be0df9a0bc616e210589 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 07:23:29 +0100 Subject: [PATCH 408/584] [Docs] Fix API Reference (#25140) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- mkdocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index bbd850bdfee34..6f2be65a18af8 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -79,7 +79,7 @@ plugins: - "re:vllm\\._.*" # Internal modules - "vllm.third_party" - "vllm.vllm_flash_attn" - - !ENV [API_AUTONAV_EXCLUDE, ""] + - !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default - mkdocstrings: handlers: python: From f4cd80f94404787859ba72dcddb5e818d8f0c9e7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 07:29:05 +0100 Subject: [PATCH 409/584] Retrieve `sliding_window` from text config in Gemma3 MM (#25085) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gemma3_mm.py | 3 ++- vllm/model_executor/models/gemma3n_mm.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e652ba2f1c7fe..bee9fbd2c084a 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -688,7 +688,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) global_attn_masks.append(global_attn_mask) - if (sliding_window := self.config.sliding_window) is not None: + sliding_window = self.config.text_config.sliding_window + if sliding_window is not None: # Create a local causal mask with sliding window (1024). 
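            # Explanatory note, not part of the upstream diff: the tril/triu
            # pair below keeps, for each query position i, only key positions
            # j with i - sliding_window < j <= i, i.e. a causal band of width
            # sliding_window carved out of the global causal mask.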
local_attn_mask = torch.ones_like(global_attn_mask) local_attn_mask = torch.tril(local_attn_mask, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 663d4da7cec23..8d3079aee0dfb 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -461,9 +461,6 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, self.multimodal_config = multimodal_config self.vocab_size = config.text_config.vocab_size - self.sliding_window = getattr(config.text_config, - "interleaved_sliding_window", None) - self.vision_tower = AutoModel.from_config(config=config.vision_config) self.audio_tower = AutoModel.from_config(config=config.audio_config) self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, From 350c94deb30747f84536ee34d91c6fca564667ce Mon Sep 17 00:00:00 2001 From: "rongfu.leng" <rongfu.leng@daocloud.io> Date: Thu, 18 Sep 2025 15:47:43 +0800 Subject: [PATCH 410/584] [Bugfix] when use s3 model cannot use default load_format (#24435) Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/config/__init__.py | 12 ++++++++++++ vllm/engine/arg_utils.py | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 64be2f38c6a31..631618d427d42 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3029,6 +3029,18 @@ class VllmConfig: SequenceClassificationConfig) SequenceClassificationConfig.verify_and_update_config(self) + if hasattr(self.model_config, "model_weights") and is_runai_obj_uri( + self.model_config.model_weights): + if self.load_config.load_format == "auto": + logger.info("Detected Run:ai model config. " + "Overriding `load_format` to 'runai_streamer'") + self.load_config.load_format = "runai_streamer" + elif self.load_config.load_format != "runai_streamer": + raise ValueError(f"To load a model from S3, 'load_format' " + f"must be 'runai_streamer', " + f"but got '{self.load_config.load_format}'. " + f"Model: {self.model_config.model}") + def __str__(self): return ( f"model={self.model_config.model!r}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4831cb5348c77..e2a1ec68e6f53 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -959,7 +959,6 @@ class EngineArgs: if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3 and self.model in MODELS_ON_S3 and self.load_format == "auto"): self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" - self.load_format = "runai_streamer" if self.disable_mm_preprocessor_cache: logger.warning( From ef7eefe17a7dc212ddb8a8aabd7760218a10e25e Mon Sep 17 00:00:00 2001 From: Tao He <linzhu.ht@alibaba-inc.com> Date: Thu, 18 Sep 2025 16:16:04 +0800 Subject: [PATCH 411/584] [Qwen] Add fp8 checkpoint support for qwen3-next. 
(#25079) Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com> --- vllm/model_executor/models/qwen3_next.py | 35 ++++++++++---------- vllm/model_executor/models/qwen3_next_mtp.py | 8 +++-- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ca9f4d402dac2..eb060cb90f44c 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.layernorm import ( GemmaRMSNorm as Qwen3NextRMSNorm) # yapf: enable from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -254,12 +253,20 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # projection of the input hidden states self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2 self.projection_size_ba = self.num_v_heads * 2 - self.in_proj = MergedColumnParallelLinear( + self.in_proj_qkvz = ColumnParallelLinear( input_size=self.hidden_size, - output_sizes=[self.projection_size_qkvz, self.projection_size_ba], + output_size=self.projection_size_qkvz, bias=False, quant_config=quant_config, - prefix=f"{prefix}.in_proj", + prefix=f"{prefix}.in_proj_qkvz", + ) + # ba_proj doesn't support blockwise fp8 quantization. + self.in_proj_ba = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.projection_size_ba, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_ba", ) query_key_settings = (self.key_dim, 0, False) @@ -420,19 +427,14 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens num_accepted_tokens = attn_metadata.num_accepted_tokens - - # 1. Set up dimensions for reshapes later - projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens]) if spec_token_masks is not None: spec_token_masks = spec_token_masks[:num_actual_tokens] - projected_states_qkvz, projected_states_ba = torch.split( - projected_states, - [ - self.projection_size_qkvz // self.tp_size, - self.projection_size_ba // self.tp_size - ], - dim=-1, - ) + + # 1. 
Set up dimensions for reshapes later + projected_states_qkvz, _ = self.in_proj_qkvz( + hidden_states[:num_actual_tokens]) + projected_states_ba, _ = self.in_proj_ba( + hidden_states[:num_actual_tokens]) query, key, value, z, b, a = self.fix_query_key_value_ordering( projected_states_qkvz, projected_states_ba) query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'), @@ -976,8 +978,6 @@ class Qwen3NextModel(nn.Module): ("qkv_proj", "v_proj", "v"), ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), - ("in_proj", "in_proj_qkvz", 0), - ("in_proj", "in_proj_ba", 1), ] params_dict = dict(self.named_parameters()) @@ -1055,7 +1055,6 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, "v_proj", ], "gate_up_proj": ["gate_proj", "up_proj"], - "in_proj": ["in_proj_qkvz", "in_proj_ba"], } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 190a1750e673a..c755eeb9b4eaa 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -63,7 +63,9 @@ class Qwen3NextMultiTokenPredictor(nn.Module): self.config.hidden_size, gather_output=True, bias=False, - return_bias=False) + return_bias=False, + quant_config=quant_config, + prefix=f'{prefix}.fc') self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( @@ -72,7 +74,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module): model_config=model_config, cache_config=cache_config, quant_config=quant_config, - prefix=f'{prefix}.layers.{self.mtp_start_layer_idx + idx}', + prefix=f'{prefix}.layers.{idx}', ) for idx in range(self.num_mtp_layers)) self.make_empty_intermediate_tensors = ( @@ -233,7 +235,7 @@ class Qwen3NextMTP(nn.Module, SupportsPP): self.config = config self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config, prefix=maybe_prefix( - prefix, "model")) + prefix, "mtp")) self.unpadded_vocab_size = config.vocab_size self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, From aa3f105c591a506523804e12800adcca80480bd8 Mon Sep 17 00:00:00 2001 From: Gerard Finol <gerardfinol@gmail.com> Date: Thu, 18 Sep 2025 11:02:14 +0200 Subject: [PATCH 412/584] Add 'path' option to ImagePrompt data_format (#25081) Signed-off-by: Gerard Finol <gerard.finol@urv.cat> --- .../prithvi_io_processor_plugin/prithvi_io_processor/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py index d480aef704c61..d4c6628211fb2 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py @@ -22,7 +22,7 @@ class DataModuleConfig(TypedDict): class ImagePrompt(BaseModel): - data_format: Literal["b64_json", "bytes", "url"] + data_format: Literal["b64_json", "bytes", "url", "path"] """ This is the data type for the input image """ From 05b044e698bb3c151871d94b64fabd87188de9ef Mon Sep 17 00:00:00 2001 From: Punitvara <punitvara@gmail.com> Date: Thu, 18 Sep 2025 14:35:16 +0530 Subject: [PATCH 413/584] [Doc] Fix cross-reference warnings (#25058) Signed-off-by: Punit Vara <punitvara@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/benchmarks/datasets.py | 3 ++- 
.../device_communicators/shm_object_storage.py | 8 ++++---- .../model_executor/layers/mamba/ops/causal_conv1d.py | 12 +++++++----- vllm/model_executor/models/mistral3.py | 2 +- vllm/multimodal/profiling.py | 2 +- vllm/v1/core/kv_cache_manager.py | 5 +++-- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1831539a6adbe..1cab40802c392 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -171,7 +171,8 @@ class BenchmarkDataset(ABC): If `None`, LoRA is not used. Returns: - A new [LoRARequest][] (or `None` if not applicable). + A new [`LoRARequest`][vllm.lora.request.LoRARequest] + (or `None` if not applicable). """ if max_loras is None or lora_path is None: return None diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 3fac104bda1e8..352e7525d4c84 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -30,7 +30,7 @@ class SingleWriterShmRingBuffer: - Maintains metadata for each allocated buffer chunk in the writer process - Supports custom "is_free_fn" functions to determine when buffers can be reused - - Each buffer chunk contains: [4-byte id][4-byte size][actual_data] + - Each buffer chunk contains: `[4-byte id][4-byte size][actual_data]` Key Concepts: - monotonic_id_start/end: Track the range of active buffer IDs @@ -99,7 +99,7 @@ class SingleWriterShmRingBuffer: - Writer handles garbage collection (free_buf) based on reader feedback Memory Layout per Buffer Chunk: - [4-byte monotonic_id][4-byte chunk_size][actual_data...] + `[4-byte monotonic_id][4-byte chunk_size][actual_data...]` ^metadata_start ^data_start The monotonic_id ensures data integrity - readers can verify they're @@ -185,7 +185,7 @@ class SingleWriterShmRingBuffer: ''' Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory. Memory layout: - [4-byte monotonic_id][4-byte size][buffer data...] + `[4-byte monotonic_id][4-byte size][buffer data...]` ''' assert self.is_writer, "Only the writer can allocate buffers." 
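        # Illustrative sketch, not part of the upstream diff: the MD_SIZE
        # header preceding each payload follows the layout documented above,
        # e.g. it could be assembled as struct.pack("<ii", monotonic_id, size)
        # for two 4-byte fields (the little-endian packing is an assumption).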
        assert size > 0, "Size must be greater than 0"
@@ -413,7 +413,7 @@ class SingleWriterShmObjectStorage:
       allocation

     Memory Layout per Object:
-    [4-byte reference_count][metadata_size][serialized_object_data]
+    `[4-byte reference_count][metadata_size][serialized_object_data]`

     Thread Safety:
     - Writer operations (put, clear) are single-threaded by design
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 7e3ea561fd293..2a88fa661da01 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -927,11 +927,13 @@ def causal_conv1d_update(
     validate_data=False,
 ):
     """
-    x: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim)
-        [shape=2: single token prediction]
-        [shape=3: single or multiple tokens prediction]
-        [shape=2 with num_tokens: continuous batching, where num_tokens is the
-        total tokens of all sequences in that batch]
+    x: Input tensor which can take the following shapes:
+
+        - `[batch, dim]` - single token prediction
+        - `[batch, dim, seqlen]` - single or multiple tokens prediction
+        - `[num_tokens, dim]` - continuous batching, where num_tokens is
+          the total tokens of all sequences in that batch
+
     conv_state: (..., dim, state_len), where state_len >= width - 1
     weight: (dim, width)
     bias: (dim,)
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 09479012a03ad..d15776a39362d 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -583,7 +583,7 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
             inputs_embeds: Optional tensor of input embeddings.

         Info:
-            [Mistral3ImagePixelInputs][]
+            [`Mistral3ImagePixelInputs`][vllm.model_executor.models.mistral3.Mistral3ImagePixelInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index bad6c0c3d9db2..fbbc55d3524ca 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -301,7 +301,7 @@ class MultiModalProfiler(Generic[_I]):
         Returns the maximum length of the multimodal (image placeholders+text)
         tokens, including any break/text tokens in-between image embeddings.

-        <im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>
+        `<im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>`

         Returns 9, even when the number of image embeddings is 6.

         This is important to take into account when profiling and
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 3a0fbb5e5c41e..401327f727a4a 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -24,8 +24,9 @@ class KVCacheBlocks:
     """
     blocks: tuple[list[KVCacheBlock], ...]
     """
-    blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
-    We don't use block of tokens as the outer dimension because it assumes all
+    `blocks[i][j]` refers to the i-th kv_cache_group
+    and the j-th block of tokens. We don't use block of
+    tokens as the outer dimension because it assumes all
From 29283e89762a3d572c504e5ea317351696b553a6 Mon Sep 17 00:00:00 2001 From: Aaron Pham <contact@aarnphm.xyz> Date: Thu, 18 Sep 2025 05:20:27 -0400 Subject: [PATCH 414/584] [Chore] Cleanup guided namespace, move to structured outputs config (#22772) Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../scripts/hardware_ci/run-amd-test.sh | 6 - .buildkite/test-pipeline.yaml | 3 +- .github/mergify.yml | 2 +- .../benchmark_serving_structured_output.py | 16 +- docs/api/README.md | 2 +- docs/features/reasoning_outputs.md | 10 +- docs/features/structured_outputs.md | 36 +-- docs/features/tool_calling.md | 11 +- docs/serving/openai_compatible_server.md | 4 +- .../offline_inference/structured_outputs.py | 54 ++--- ...t_completion_client_with_tools_required.py | 2 +- .../structured_outputs/structured_outputs.py | 8 +- tests/entrypoints/conftest.py | 2 +- tests/entrypoints/llm/test_lazy_outlines.py | 82 ------- tests/entrypoints/openai/test_chat.py | 123 +++++------ tests/entrypoints/openai/test_completion.py | 79 ++++--- .../test_completion_with_function_calling.py | 4 +- .../entrypoints/openai/test_openai_schema.py | 8 +- .../openai/test_prompt_validation.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 4 - .../openai/test_transcription_validation.py | 2 +- .../openai/test_translation_validation.py | 2 +- tests/test_sampling_params.py | 84 ------- tests/tool_use/test_tool_choice_required.py | 11 +- tests/v1/core/test_scheduler.py | 6 +- tests/v1/engine/test_llm_engine.py | 4 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 135 ++++++------ .../openai/test_chat_completion.py | 14 +- .../v1/entrypoints/openai/test_completion.py | 14 +- vllm/config/__init__.py | 35 +-- vllm/engine/arg_utils.py | 95 ++++---- vllm/engine/async_llm_engine.py | 7 +- vllm/engine/llm_engine.py | 18 +- vllm/engine/protocol.py | 7 +- vllm/entrypoints/llm.py | 27 ++- vllm/entrypoints/openai/api_server.py | 10 +- vllm/entrypoints/openai/protocol.py | 206 ++++++------------ vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/model_executor/models/config.py | 6 +- vllm/sampling_params.py | 62 ++---- vllm/transformers_utils/tokenizers/mistral.py | 5 +- vllm/v1/engine/async_llm.py | 3 - vllm/v1/engine/processor.py | 57 +++-- vllm/v1/request.py | 2 +- vllm/v1/structured_output/__init__.py | 13 +- vllm/v1/structured_output/backend_guidance.py | 4 +- .../backend_lm_format_enforcer.py | 22 +- vllm/v1/structured_output/backend_outlines.py | 32 +-- vllm/v1/structured_output/backend_xgrammar.py | 38 ++-- vllm/v1/structured_output/request.py | 2 +- 51 files changed, 579 insertions(+), 806 deletions(-) delete mode 100644 tests/entrypoints/llm/test_lazy_outlines.py delete mode 100644 tests/test_sampling_params.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index c395011a24485..7f90181048d0f 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_prompt_validation.py "} fi -#Obsolete currently -##ignore certain Entrypoints/llm tests -#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then -# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} -#fi - # 
--ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8dd99bf1a38f6..66dfc990805f2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -108,8 +108,7 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests diff --git a/.github/mergify.yml b/.github/mergify.yml index f2dd2e06214ae..94198b1251e09 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -171,7 +171,7 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 4aae755eb4e44..73b4aa5a87e07 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -696,11 +696,11 @@ def evaluate(ret, args): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -780,18 +780,18 @@ def main(args: argparse.Namespace): ) if args.dataset == "grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = f"{args.structured_output_ratio}guided" + result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" diff --git a/docs/api/README.md b/docs/api/README.md index 57142e8f5625d..148211756480c 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes. 
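For example, pinning a specific backend at server start might look like the following sketch (the flag is described in the next sentence; the model name and backend value are illustrative):

```bash
vllm serve Qwen/Qwen2.5-3B-Instruct \
  --structured-outputs-config.backend xgrammar
```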
- [vllm.config.LoRAConfig][] - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] -- [vllm.config.DecodingConfig][] +- [vllm.config.StructuredOutputsConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index d518e7f0cff43..85681669dfb22 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| -| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | -| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | -| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | -| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ | +| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | +| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | !!! note IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 0d6294a5fdd79..1f955c6e30d6c 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla The following parameters are supported, which must be added as extra parameters: -- `guided_choice`: the output will be exactly one of the choices. -- `guided_regex`: the output will follow the regex pattern. -- `guided_json`: the output will follow the JSON schema. -- `guided_grammar`: the output will follow the context free grammar. +- `choice`: the output will be exactly one of the choices. +- `regex`: the output will follow the regex pattern. +- `json`: the output will follow the JSON schema. +- `grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page. Structured outputs are supported by default in the OpenAI-Compatible Server. 
You may choose to specify the backend to use by setting the -`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, +`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`, which will try to choose an appropriate backend based on the details of the request. You may also choose a specific backend, along with some options. A full set of options is available in the `vllm serve --help` text. -Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: +Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one: ??? code @@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"guided_choice": ["positive", "negative"]}, + extra_body={"structured_outputs": {"choice": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` -The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: ??? code @@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the `guided_json` parameter in two different ways: +For this we can use the `json` parameter in two different ways: - Using directly a [JSON Schema](https://json-schema.org/) - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). -The next example shows how to use the `guided_json` parameter with a Pydantic model: +The next example shows how to use the `response_format` parameter with a Pydantic model: ??? code @@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo JSON schema and how the fields should be populated. This can improve the results notably in most cases. -Finally we have the `guided_grammar` option, which is probably the most +Finally we have the `grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar. 
As an example, we can use to define a specific format of simplified SQL queries: @@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries: "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", } ], - extra_body={"guided_grammar": simplified_sql_grammar}, + extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}}, ) print(completion.choices[0].message.content) ``` @@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online ## Offline Inference Offline inference allows for the same types of structured outputs. -To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. -The main available options inside `GuidedDecodingParams` are: +To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`. +The main available options inside `StructuredOutputsParams` are: - `json` - `regex` @@ -309,12 +309,12 @@ shown below: ```python from vllm import LLM, SamplingParams - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") - guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) - sampling_params = SamplingParams(guided_decoding=guided_decoding_params) + structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"]) + sampling_params = SamplingParams(structured_outputs=structured_outputs_params) outputs = llm.generate( prompts="Classify this sentiment: vLLM is wonderful!", sampling_params=sampling_params, diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index a8c0db0a7ac13..2a48596571d1d 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -71,7 +71,7 @@ This example demonstrates: * Making a request with `tool_choice="auto"` * Handling the structured response and executing the corresponding function -You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. +You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the caller's responsibility to: @@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci ## Named Function Calling -vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is -enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a +vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a high-quality one. 
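As a minimal sketch of the behaviour described here (not part of the diff; the server URL, model name, and tool schema are illustrative), a required tool call through the OpenAI client could look like:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# With tool_choice="required", the response is guaranteed to contain one or
# more tool calls whose arguments conform to the schema above.
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="required",
)
print(response.choices[0].message.tool_calls)
```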
-vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. -For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. +vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 181a874efa3cb..bc52d02a50bd2 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -133,7 +133,7 @@ completion = client.chat.completions.create( {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], extra_body={ - "guided_choice": ["positive", "negative"] + "structured_outputs": {"choice": ["positive", "negative"]} } ) ``` @@ -374,7 +374,7 @@ The following extra parameters are supported: ```python --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" ``` - + [](){ #translations-api } ### Translations API diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 88d87beb4874d..6b6099f71b120 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,11 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -This file demonstrates the example usage of guided decoding -to generate structured outputs using vLLM. 
It shows how to apply -different guided decoding techniques such as Choice, Regex, JSON schema, -and Grammar to produce structured and formatted results -based on specific prompts. +This file demonstrates the example usage of structured outputs +in vLLM. It shows how to apply different constraints such as choice, +regex, json schema, and grammar to produce structured and formatted +results based on specific prompts. """ from enum import Enum @@ -13,19 +12,23 @@ from enum import Enum from pydantic import BaseModel from vllm import LLM, SamplingParams -from vllm.sampling_params import GuidedDecodingParams +from vllm.sampling_params import StructuredOutputsParams MAX_TOKENS = 50 -# Guided decoding by Choice (list of possible options) -guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) -sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) +# Structured outputs by Choice (list of possible options) +structured_outputs_params_choice = StructuredOutputsParams( + choice=["Positive", "Negative"] +) +sampling_params_choice = SamplingParams( + structured_outputs=structured_outputs_params_choice +) prompt_choice = "Classify this sentiment: vLLM is wonderful!" -# Guided decoding by Regex -guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") +# Structured outputs by Regex +structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n") sampling_params_regex = SamplingParams( - guided_decoding=guided_decoding_params_regex, + structured_outputs=structured_outputs_params_regex, stop=["\n"], max_tokens=MAX_TOKENS, ) @@ -36,7 +39,7 @@ prompt_regex = ( ) -# Guided decoding by JSON using Pydantic schema +# Structured outputs by JSON using Pydantic schema class CarType(str, Enum): sedan = "sedan" suv = "SUV" @@ -51,17 +54,16 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() -guided_decoding_params_json = GuidedDecodingParams(json=json_schema) +structured_outputs_params_json = StructuredOutputsParams(json=json_schema) sampling_params_json = SamplingParams( - guided_decoding=guided_decoding_params_json, - max_tokens=MAX_TOKENS, + structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS ) prompt_json = ( - "Generate a JSON with the brand, model and car_type of" + "Generate a JSON with the brand, model and car_type of " "the most iconic car from the 90's" ) -# Guided decoding by Grammar +# Structured outputs by Grammar simplified_sql_grammar = """ root ::= select_statement select_statement ::= "SELECT " column " from " table " where " condition @@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 " condition ::= column "= " number number ::= "1 " | "2 " """ -guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) +structured_outputs_params_grammar = StructuredOutputsParams( + grammar=simplified_sql_grammar +) sampling_params_grammar = SamplingParams( - guided_decoding=guided_decoding_params_grammar, + structured_outputs=structured_outputs_params_grammar, max_tokens=MAX_TOKENS, ) prompt_grammar = ( - "Generate an SQL query to show the 'username' and 'email'from the 'users' table." + "Generate an SQL query to show the 'username' and 'email' from the 'users' table." 
) @@ -93,16 +97,16 @@ def main(): llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) choice_output = generate_output(prompt_choice, sampling_params_choice, llm) - format_output("Guided decoding by Choice", choice_output) + format_output("Structured outputs by Choice", choice_output) regex_output = generate_output(prompt_regex, sampling_params_regex, llm) - format_output("Guided decoding by Regex", regex_output) + format_output("Structured outputs by Regex", regex_output) json_output = generate_output(prompt_json, sampling_params_json, llm) - format_output("Guided decoding by JSON", json_output) + format_output("Structured outputs by JSON", json_output) grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm) - format_output("Guided decoding by Grammar", grammar_output) + format_output("Structured outputs by Grammar", grammar_output) if __name__ == "__main__": diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 7eb8668213eef..6ff65b18f6674 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -6,7 +6,7 @@ without any specific flags: ```bash VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ - --guided-decoding-backend outlines + --structured-outputs-config.backend outlines ``` This example demonstrates how to generate chat completions diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py index 2a8f4637260c2..3ea6c73e90e8f 100644 --- a/examples/online_serving/structured_outputs/structured_outputs.py +++ b/examples/online_serving/structured_outputs/structured_outputs.py @@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { "content": "Classify this sentiment: vLLM is wonderful!", } ], - "extra_body": {"guided_choice": ["positive", "negative"]}, + "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}}, }, "regex": { "messages": [ @@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { } ], "extra_body": { - "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n", + "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"}, }, }, "json": { @@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { } ], "extra_body": { - "guided_grammar": """ + "structured_outputs": { + "grammar": """ root ::= select_statement select_statement ::= "SELECT " column " from " table " where " condition @@ -135,6 +136,7 @@ condition ::= column "= " number number ::= "1 " | "2 " """, + } }, }, "structural_tag": { diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index c23eeee271869..da75806ccf4de 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -184,7 +184,7 @@ def sample_enum_json_schema(): @pytest.fixture -def sample_guided_choice(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py deleted file mode 100644 index ac0b7e134c55a..0000000000000 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - 
-import sys -from contextlib import nullcontext - -from vllm_test_utils import BlameResult, blame - -from vllm import LLM, SamplingParams -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.sampling_params import GuidedDecodingParams - - -def run_normal(): - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM without guided decoding as a baseline. - llm = LLM(model="distilbert/distilgpt2", - enforce_eager=True, - gpu_memory_utilization=0.3) - outputs = llm.generate(prompts, sampling_params) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - # Destroy the LLM object and free up the GPU memory. - del llm - cleanup_dist_env_and_memory() - - -def run_xgrammar(sample_regex): - # Create an LLM with guided decoding enabled. - llm = LLM(model="distilbert/distilgpt2", - enforce_eager=True, - guided_decoding_backend="xgrammar", - gpu_memory_utilization=0.3) - prompt = f"Give an example IPv4 address with this regex: {sample_regex}" - guided_decoding = GuidedDecodingParams(regex=sample_regex) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=guided_decoding) - outputs = llm.generate( - prompts=[prompt] * 2, - sampling_params=sampling_params, - use_tqdm=True, - ) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ - # make sure outlines is not imported - module_name = "outlines" - # In CI, we only check finally if the module is imported. - # If it is indeed imported, we can rerun the test with `use_blame=True`, - # which will trace every function call to find the first import location, - # and help find the root cause. - # We don't run it in CI by default because it is slow. - use_blame = False - context = blame( - lambda: module_name in sys.modules) if use_blame else nullcontext() - with context as result: - run_normal() - run_xgrammar(sample_regex) - if use_blame: - assert isinstance(result, BlameResult) - print(f"the first import location is:\n{result.trace_stack}") - assert module_name not in sys.modules, ( - f"Module {module_name} is imported. 
To see the first" - f" import location, run the test with `use_blame=True`.") diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d5924b7b3ae34..a827f94cfbfe5 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import json from typing import Optional @@ -480,10 +480,11 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_choice_chat(client: openai.AsyncOpenAI, - sample_guided_choice, is_v1_server: bool): +async def test_structured_outputs_choice_chat( + client: openai.AsyncOpenAI, sample_structured_outputs_choices, + is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -498,9 +499,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice1 = chat_completion.choices[0].message.content - assert choice1 in sample_guided_choice + assert choice1 in sample_structured_outputs_choices messages.append({"role": "assistant", "content": choice1}) messages.append({ @@ -512,17 +514,19 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice2 = chat_completion.choices[0].message.content - assert choice2 in sample_guided_choice + assert choice2 in sample_structured_outputs_choices assert choice1 != choice2 @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): +async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, + sample_json_schema, + is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", @@ -538,7 +542,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json1 = json.loads(message.content) @@ -555,7 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -565,10 +569,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, - is_v1_server: bool): 
+async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, + sample_regex, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", @@ -583,7 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -594,7 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -602,7 +606,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, @pytest.mark.asyncio -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): +async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -614,17 +618,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): }] with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) + _ = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body=dict( + structured_outputs={"regex": { + 1: "Python", + 2: "C++" + }})) @pytest.mark.asyncio -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - sample_guided_choice): +async def test_structured_outputs_choice_chat_logprobs( + client: openai.AsyncOpenAI, sample_structured_outputs_choices): messages = [{ "role": "system", @@ -641,7 +647,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ -663,10 +670,23 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, }, { "role": "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" + "content": ("Give an example JSON for an employee " + "profile using the specified tool.") }] + tools = [{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema + } + }] + tool_choice = { + "type": "function", + "function": { + "name": "dummy_function_name" + } + } # non-streaming @@ -674,20 +694,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { 
- "name": "dummy_function_name" - } - }, + tools=tools, + tool_choice=tool_choice, ) message = chat_completion.choices[0].message assert len(message.content) == 0 @@ -705,25 +713,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, # streaming - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }, - stream=True) + stream = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tools=tools, + tool_choice=tool_choice, + stream=True) output = [] finish_reason_count = 0 diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 3650b15792575..0347513befe32 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import json import os from typing import Optional @@ -23,8 +23,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # technically these adapters use a different base model, # but we're not testing generation quality here -GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] - @pytest.fixture(scope="module") def default_server_args(zephyr_lora_files): @@ -595,12 +593,13 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, is_v1_server: bool): +async def test_structured_outputs_json_completion( + client: openai.AsyncOpenAI, + sample_json_schema, + is_v1_server: bool, +): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("structured outputs is only supported in v1 engine") completion = await client.completions.create( model=MODEL_NAME, @@ -609,8 +608,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, n=3, temperature=1.0, max_tokens=500, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(structured_outputs=dict(json=sample_json_schema))) assert completion.id is not None assert len(completion.choices) == 3 @@ -620,12 +618,13 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex, is_v1_server: bool): +async def test_structured_outputs_regex_completion( + client: openai.AsyncOpenAI, + sample_regex, + is_v1_server: bool, +): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("structured outputs is only supported in v1 engine") completion = await client.completions.create( model=MODEL_NAME, @@ -633,8 +632,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, n=3, temperature=1.0, max_tokens=20, - 
extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(structured_outputs=dict(regex=sample_regex))) assert completion.id is not None assert len(completion.choices) == 3 @@ -644,13 +642,13 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice, - is_v1_server: bool): +async def test_structured_outputs_choice_completion( + client: openai.AsyncOpenAI, + sample_structured_outputs_choices, + is_v1_server: bool, +): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("structured outputs is only supported in v1 engine") completion = await client.completions.create( model=MODEL_NAME, @@ -658,20 +656,21 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI, n=2, temperature=1.0, max_tokens=10, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(structured_outputs=dict( + choice=sample_structured_outputs_choices))) assert completion.id is not None assert len(completion.choices) == 2 for i in range(2): - assert completion.choices[i].text in sample_guided_choice + assert completion.choices[i].text in sample_structured_outputs_choices @pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements, is_v1_server: bool): +async def test_structured_outputs_grammar(client: openai.AsyncOpenAI, + sample_sql_statements, + is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided grammar is only supported in v1 engine") + pytest.skip("grammar is only supported in v1 engine") completion = await client.completions.create( model=MODEL_NAME, @@ -679,7 +678,8 @@ async def test_guided_grammar(client: openai.AsyncOpenAI, "table_1 where it is equals to 1"), temperature=1.0, max_tokens=500, - extra_body=dict(guided_grammar=sample_sql_statements)) + extra_body=dict( + structured_outputs=dict(grammar=sample_sql_statements), )) content = completion.choices[0].text @@ -730,27 +730,26 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex, - is_v1_server: bool): +async def test_structured_outputs_type_error(client: openai.AsyncOpenAI, + sample_json_schema, sample_regex, + is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("structured outputs is only supported in v1 engine") with pytest.raises(openai.BadRequestError): _ = await client.completions.create( model=MODEL_NAME, prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(structured_outputs=dict(json=42))) with pytest.raises(openai.BadRequestError): _ = await client.completions.create( model=MODEL_NAME, prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=sample_regex, - guided_json=sample_json_schema)) + extra_body=dict(structured_outputs=dict( + regex=sample_regex, + json=sample_json_schema, + ))) @pytest.mark.asyncio diff --git 
a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 4ef5d4e8a699a..3649cefa9bf42 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -142,7 +142,7 @@ def server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", @@ -225,7 +225,7 @@ def k2_server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 11ed1c4a9ee4b..73f79ac28d110 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy): if "custom" in tool_call: return False - # Sometimes guided_grammar is generated to be empty + # Sometimes structured_outputs.grammar is generated to be empty # Causing a server error in EBNF grammar parsing # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 - guided_grammar = case.body.get("guided_grammar") + structured_outputs = case.body.get("structured_outputs", {}) + grammar = structured_outputs.get("grammar") if isinstance( + structured_outputs, dict) else None - if guided_grammar == '': + if grammar == '': # Allow None (will be handled as no grammar) # But skip empty strings return False diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index bfa3f983cd87e..bb4c633e5e502 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -3,7 +3,7 @@ import io -# imports for guided decoding tests +# imports for structured outputs tests import openai import pybase64 import pytest diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index de26fce854f5b..8e68699e5904a 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): "role": "user", "content": "what is 1+1?" }], - guided_decoding_backend="outlines", ) with suppress(Exception): @@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): "role": "user", "content": "what is 1+1?" }], - guided_decoding_backend="outlines", ) with suppress(Exception): @@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): "role": "user", "content": "what is 1+1?" }], - guided_decoding_backend="outlines", ) with suppress(Exception): @@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config(): "role": "user", "content": "what is 1+1?" 
}], - guided_decoding_backend="outlines", ) with suppress(Exception): diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 6a3cdfdfc8081..23c99da97ad3a 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import io import json diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index f43b7a253d28d..eb7879927b9b6 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io -# imports for guided decoding tests +# imports for structured outputs tests import json import httpx diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py deleted file mode 100644 index 7330f61e67689..0000000000000 --- a/tests/test_sampling_params.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for the SamplingParams class. -""" - -import pytest - -from vllm import SamplingParams -from vllm.config import ModelConfig -from vllm.entrypoints.openai.protocol import ChatCompletionRequest - -MODEL_NAME = "Qwen/Qwen1.5-7B" - - -def test_max_tokens_none(): - """max_tokens=None should be allowed""" - SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) - - -@pytest.fixture(scope="module") -def model_config(): - return ModelConfig( - MODEL_NAME, - seed=0, - dtype="float16", - ) - - -@pytest.fixture(scope="module") -def default_max_tokens(): - return 4096 - - -def test_sampling_params_from_request_with_no_guided_decoding_backend( - model_config, default_max_tokens): - # guided_decoding_backend is not present at request level - request = ChatCompletionRequest.model_validate({ - 'messages': [{ - 'role': 'user', - 'content': 'Hello' - }], - 'model': - MODEL_NAME, - 'response_format': { - 'type': 'json_object', - }, - }) - - sampling_params = request.to_sampling_params( - default_max_tokens, - model_config.logits_processor_pattern, - ) - # we do not expect any backend to be present and the default - # guided_decoding_backend at engine level will be used. 
- assert sampling_params.guided_decoding.backend is None - - -@pytest.mark.parametrize("request_level_guided_decoding_backend,expected", - [("xgrammar", "xgrammar"), ("guidance", "guidance"), - ("outlines", "outlines")]) -def test_sampling_params_from_request_with_guided_decoding_backend( - request_level_guided_decoding_backend: str, expected: str, - model_config, default_max_tokens): - - request = ChatCompletionRequest.model_validate({ - 'messages': [{ - 'role': 'user', - 'content': 'Hello' - }], - 'model': - MODEL_NAME, - 'response_format': { - 'type': 'json_object', - }, - 'guided_decoding_backend': - request_level_guided_decoding_backend, - }) - - sampling_params = request.to_sampling_params( - default_max_tokens, - model_config.logits_processor_pattern, - ) - # backend correctly identified in resulting sampling_params - assert sampling_params.guided_decoding.backend == expected diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index e0ed221a93e12..130e9547bdccb 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -68,7 +68,7 @@ EXAMPLE_TOOLS = [ def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output, should_match: bool): self = MagicMock(tool_choice="required", tools=tools) - schema = ChatCompletionRequest._get_guided_json_from_tool(self) + schema = ChatCompletionRequest._get_json_schema_from_tool(self) assert isinstance(schema, dict) # use build_regex_from_schema used in JSONLogitsProcessor to create Guide @@ -218,7 +218,7 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS] } }, {}], False), ]) -def test_guided_json(sample_output, should_match): +def test_structured_outputs_json(sample_output, should_match): _compile_and_check(tools=TypeAdapter( list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS), sample_output=sample_output, @@ -273,8 +273,9 @@ def update_parameters_empty_dict( @pytest.mark.parametrize( "update_parameters", [update_parameters_none, update_parameters_empty_dict]) -def test_guided_json_without_parameters(sample_output, should_match, - update_parameters): +def test_structured_outputs_json_without_parameters(sample_output, + should_match, + update_parameters): updated_tools = [deepcopy(EXAMPLE_TOOLS[0])] tools = TypeAdapter( list[ChatCompletionToolsParam]).validate_python(updated_tools) @@ -334,4 +335,4 @@ def test_streaming_output_valid(output, empty_params, delta_len): combined_messages += message.tool_calls[0].function.arguments combined_messages += "}]" assert json.loads(combined_messages) == output - assert json.dumps(json.loads(combined_messages)) == output_json \ No newline at end of file + assert json.dumps(json.loads(combined_messages)) == output_json diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 572d6c9c889f6..f6fc1e6d37d14 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) from vllm.multimodal.inputs import (MultiModalFeatureSpec, MultiModalKwargsItem, PlaceholderRange) -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -1796,11 
+1796,11 @@ def test_schedule_skip_tokenizer_init(): def test_schedule_skip_tokenizer_init_structured_output_request(): scheduler = create_scheduler(skip_tokenizer_init=True) - guided_params = GuidedDecodingParams(regex="[0-9]+") + structured_outputs_params = StructuredOutputsParams(regex="[0-9]+") sampling_params = SamplingParams( ignore_eos=False, max_tokens=16, - guided_decoding=guided_params, + structured_outputs=structured_outputs_params, ) request = Request( request_id="0", diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 2848420c22085..7529c3780ec25 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Optional import pytest from vllm import LLM -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector if TYPE_CHECKING: @@ -97,7 +97,7 @@ def _get_test_sampling_params( top_p=0.95, n=n, seed=seed, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( regex="[0-9]+") if structured_outputs else None, ) for n in n_list ], n_list diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index ffe0612124660..46b953fe37433 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -151,7 +151,7 @@ def sample_definition_json_schema(): @pytest.fixture -def sample_guided_choice(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index ad62914195b44..4b0f3b2d9967e 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -15,12 +15,13 @@ import torch from pydantic import BaseModel from tests.reasoning.utils import run_reasoning_extraction +from vllm.config import StructuredOutputsConfig from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams if TYPE_CHECKING: from vllm.config import TokenizerMode @@ -90,7 +91,7 @@ def _load_json(s: str, backend: str) -> str: @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, speculative_config", + "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, @@ -99,8 +100,8 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_guided_choice: str, - guided_decoding_backend: str, + sample_structured_outputs_choices: str, + backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], @@ -115,16 +116,15 @@ def test_structured_output( enforce_eager = bool(not current_platform.is_tpu()) # Use a single LLM instance for several scenarios to # speed up the test suite. 
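# --- Illustrative sketch, not part of this patch: the offline API after the
# rename, mirroring the test changes below. structured_outputs_config may be
# passed as a plain dict (coerced to StructuredOutputsConfig by the LLM
# entrypoint); the model choice and regex here are placeholder assumptions.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
          max_model_len=1024,
          # Previously: guided_decoding_backend="xgrammar",
          #             guided_decoding_disable_any_whitespace=True
          structured_outputs_config=dict(backend="xgrammar",
                                         disable_any_whitespace=True))
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=32,
    # Previously: guided_decoding=GuidedDecodingParams(regex=...)
    structured_outputs=StructuredOutputsParams(
        regex=r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"),
)
outputs = llm.generate(["An example IPv4 address is: "], sampling_params)
print(outputs[0].outputs[0].text)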
- llm = LLM( - model=model_name, - enforce_eager=enforce_eager, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), - seed=120, - tokenizer_mode=tokenizer_mode, - speculative_config=speculative_config) + llm = LLM(model=model_name, + enforce_eager=enforce_eager, + max_model_len=1024, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}), + seed=120, + tokenizer_mode=tokenizer_mode, + speculative_config=speculative_config) # # Test 1: Generate JSON output based on a provided schema @@ -132,7 +132,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) + structured_outputs=StructuredOutputsParams(json=sample_json_schema)) prompt = ("Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. Schema: " @@ -152,7 +152,7 @@ def test_structured_output( generated_text = output.outputs[0].text assert generated_text is not None - if guided_decoding_backend != 'lm-format-enforcer': + if backend != 'lm-format-enforcer': assert "\n" not in generated_text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) @@ -161,12 +161,12 @@ def test_structured_output( # # Test 2: Generate JSON object without a schema # - if guided_decoding_backend != "outlines": + if backend != "outlines": sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, n=2, - guided_decoding=GuidedDecodingParams(json_object=True)) + structured_outputs=StructuredOutputsParams(json_object=True)) outputs = llm.generate(prompts=( "Generate a JSON object with curly braces for a person with " @@ -195,8 +195,9 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) - if guided_decoding_backend.startswith("xgrammar"): + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) + if backend.startswith("xgrammar"): with pytest.raises(ValueError, match="The provided JSON schema contains features " "not supported by xgrammar."): @@ -230,7 +231,7 @@ def test_structured_output( parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) - if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]: + if backend not in ["outlines", "lm-format-enforcer"]: # # Test 4: Generate SQL statement using EBNF grammar # @@ -238,7 +239,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_ebnf)) outputs = llm.generate( ("Generate a sql statement that selects col_1 from " "table_1 where it is equal to 1. Make the response as short as " @@ -271,7 +273,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_lark)) outputs = llm.generate( ("Generate a sql statement that selects col_1 from " "table_1 where it is equal to 1. 
Make the response as short as " @@ -309,7 +312,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar="not a grammar")) + structured_outputs=StructuredOutputsParams( + grammar="not a grammar")) with pytest.raises(ValueError, match="Failed to convert the grammar "): llm.generate( ("Generate a sql statement that selects col_1 from " @@ -325,7 +329,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) + structured_outputs=StructuredOutputsParams(regex=sample_regex)) prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. " f"Make the response as short as possible.") @@ -352,7 +356,8 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(choice=sample_guided_choice)) + structured_outputs=StructuredOutputsParams( + choice=sample_structured_outputs_choices)) outputs = llm.generate( ("The best language for type-safe systems programming is " @@ -368,7 +373,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_guided_choice + assert generated_text in sample_structured_outputs_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # @@ -378,7 +383,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( ("Generate a JSON with the brand, model and car_type of the most " @@ -422,7 +427,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( ("Generate a description of a frog using 50 characters. " @@ -444,7 +449,7 @@ def test_structured_output( output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=json_schema) - if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]: + if backend not in ["outlines", "lm-format-enforcer"]: # # Test 11: Generate structured output using structural_tag format # @@ -470,7 +475,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( structural_tag=json.dumps(structural_tag_config))) prompt = """ @@ -547,7 +552,7 @@ Make the response as short as possible. @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", "deepseek_r1", NGRAM_SPEC_CONFIG), @@ -556,7 +561,7 @@ Make the response as short as possible. 
) def test_structured_output_with_reasoning_matrices( monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, + backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, @@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=True, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}, + reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, - reasoning_parser=reasoning_parser, speculative_config=speculative_config, ) tokenizer = llm.get_tokenizer() @@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices( sampling_params = SamplingParams( temperature=0.1, max_tokens=8192, - guided_decoding=GuidedDecodingParams(json=reasoning_schema), + structured_outputs=StructuredOutputsParams(json=reasoning_schema), ) outputs = llm.generate( [reasoning_prompt], @@ -640,13 +646,14 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - guided_decoding_backend="auto", + structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) prompts = ( "Give an example JSON object for a grade " @@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, - guided_decoding_backend="guidance", - guided_decoding_disable_any_whitespace=True, - guided_decoding_disable_additional_properties=True) + structured_outputs_config=dict( + backend="guidance", + disable_any_whitespace=True, + disable_additional_properties=True)) schema = { 'type': 'object', @@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): "<|im_end|>\n<|im_start|>assistant\n") def generate_with_backend(backend): - guided_params = GuidedDecodingParams( + structured_outputs_params = StructuredOutputsParams( json=schema, backend=backend, disable_any_whitespace=True, disable_additional_properties=True) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + structured_outputs=structured_outputs_params) outputs = llm.generate(prompt, sampling_params=sampling_params) assert outputs is not None @@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): assert "a6" not in generated -@pytest.mark.parametrize("guided_decoding_backend", - ["guidance", "xgrammar", "outlines"]) -def test_structured_output_batched_with_non_guided_requests( +@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) +def test_structured_output_batched_with_non_structured_outputs_requests( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], - guided_decoding_backend: str, + backend: str, ): monkeypatch.setenv("VLLM_USE_V1", "1") @@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests( model="meta-llama/Meta-Llama-3.1-8B-Instruct", enforce_eager=enforce_eager, max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - 
guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), + structured_outputs_config=StructuredOutputsConfig( + backend=backend, + disable_any_whitespace=backend in {"xgrammar", "guidance"}, + ), ) - guided_prompt = ( + structured_outputs_prompt = ( "Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. Schema: " f"{sample_json_schema}") - non_guided_prompt = "The diameter of the Earth in kilometers is " + non_structured_outputs_prompt = "The diameter of the Earth in kilometers is " - prompts = [guided_prompt, non_guided_prompt] + prompts = [structured_outputs_prompt, non_structured_outputs_prompt] sampling_params = [ - SamplingParams( - temperature=1.0, - max_tokens=400, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)), + SamplingParams(temperature=1.0, + max_tokens=400, + structured_outputs=StructuredOutputsParams( + json=sample_json_schema)), # No max tokens, temp=0 to assert on contents SamplingParams( seed=42, @@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests( print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}") if index == 0: - # First prompt is guided, expect valid JSON + # First prompt is structured outputs, expect valid JSON assert "\n" not in generated_text output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) else: - # Second prompt is not guided, expect valid output + # Second prompt is not structured outputs, expect valid output # Cannot assert on exact output, but we can expect it to be factual assert "12,742" in generated_text - # non-guided requests should not return a valid JSON here + # non-structured outputs requests should not return a valid JSON here with pytest.raises(ValueError): output_json = json.loads(generated_text) diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py index dffb32846c05e..9aa285aa9b18d 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, "role": "user", "content": prompt, }], - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): "content": prompt, }], extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): "role": "user", "content": prompt, }], - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3114d7639f045..9090beb4bbd2a 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: 
str): model=model_name, prompt=prompt, extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 631618d427d42..9a1c5f0b0d453 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2277,34 +2277,34 @@ def get_served_model_name(model: str, return served_model_name -GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines", - "lm-format-enforcer"] +StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", + "lm-format-enforcer"] @config @dataclass -class DecodingConfig: - """Dataclass which contains the decoding strategy of the engine.""" +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" - backend: GuidedDecodingBackend = "auto" - """Which engine will be used for guided decoding (JSON schema / regex etc) - by default. With "auto", we will make opinionated choices based on request - contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during guided - decoding. This is only supported for xgrammar and guidance backends.""" + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" disable_additional_properties: bool = False """If `True`, the `guidance` backend will not use `additionalProperties` in the JSON schema. This is only supported for the `guidance` backend and is used to better align its behaviour with `outlines` and `xgrammar`.""" - reasoning_backend: str = "" + reasoning_parser: str = "" """Select the reasoning parser depending on the model that you're using. 
This is used to parse the reasoning content into OpenAI API format.""" @@ -2451,8 +2451,9 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: DecodingConfig = field(default_factory=DecodingConfig) - """Decoding configuration.""" + structured_outputs_config: StructuredOutputsConfig = field( + default_factory=StructuredOutputsConfig) + """Structured outputs configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None @@ -2543,8 +2544,8 @@ class VllmConfig: vllm_factors.append(self.speculative_config.compute_hash()) else: vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) + if self.structured_outputs_config: + vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") if self.observability_config: @@ -3063,7 +3064,7 @@ class VllmConfig: f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.decoding_config!r}, " + f"structured_outputs_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e2a1ec68e6f53..fb5beab77b270 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -22,17 +22,16 @@ from typing_extensions import TypeIs, deprecated import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, - ConfigType, ConvertOption, DecodingConfig, - DetailedTraceModules, Device, DeviceConfig, - DistributedExecutorBackend, EPLBConfig, - GuidedDecodingBackend, HfOverrides, KVEventsConfig, + ConfigType, ConvertOption, DetailedTraceModules, + Device, DeviceConfig, DistributedExecutorBackend, + EPLBConfig, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, ModelDType, ModelImpl, ObservabilityConfig, ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs) + SpeculativeConfig, StructuredOutputsConfig, + TaskOption, TokenizerMode, VllmConfig, get_attr_docs) from vllm.config.multimodal import MMCacheType, MultiModalConfig from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.utils import get_field @@ -418,12 +417,15 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) - guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend - guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback - guided_decoding_disable_any_whitespace: bool = \ - DecodingConfig.disable_any_whitespace - guided_decoding_disable_additional_properties: bool = \ - DecodingConfig.disable_additional_properties + structured_outputs_config: StructuredOutputsConfig = get_field( + VllmConfig, "structured_outputs_config") + reasoning_parser: str = StructuredOutputsConfig.reasoning_parser + # Deprecated guided decoding fields + guided_decoding_backend: Optional[str] = None + guided_decoding_disable_fallback: Optional[bool] = None + 
guided_decoding_disable_any_whitespace: Optional[bool] = None + guided_decoding_disable_additional_properties: Optional[bool] = None + logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -462,7 +464,6 @@ class EngineArgs: additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") - reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location @@ -618,28 +619,29 @@ class EngineArgs: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) - # Guided decoding arguments - guided_decoding_kwargs = get_kwargs(DecodingConfig) - guided_decoding_group = parser.add_argument_group( - title="DecodingConfig", - description=DecodingConfig.__doc__, + # Structured outputs arguments + structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) + structured_outputs_group = parser.add_argument_group( + title="StructuredOutputsConfig", + description=StructuredOutputsConfig.__doc__, ) - guided_decoding_group.add_argument("--guided-decoding-backend", - **guided_decoding_kwargs["backend"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-fallback", - **guided_decoding_kwargs["disable_fallback"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-any-whitespace", - **guided_decoding_kwargs["disable_any_whitespace"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-additional-properties", - **guided_decoding_kwargs["disable_additional_properties"]) - guided_decoding_group.add_argument( + structured_outputs_group.add_argument( "--reasoning-parser", # This choice is a special case because it's not static choices=list(ReasoningParserManager.reasoning_parsers), - **guided_decoding_kwargs["reasoning_backend"]) + **structured_outputs_kwargs["reasoning_parser"]) + # Deprecated guided decoding arguments + for arg, type in [ + ("--guided-decoding-backend", str), + ("--guided-decoding-disable-fallback", bool), + ("--guided-decoding-disable-any-whitespace", bool), + ("--guided-decoding-disable-additional-properties", bool), + ]: + structured_outputs_group.add_argument( + arg, + type=type, + help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."), + deprecated=True) # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) @@ -934,6 +936,8 @@ class EngineArgs: **vllm_kwargs["compilation_config"]) vllm_group.add_argument("--additional-config", **vllm_kwargs["additional_config"]) + vllm_group.add_argument('--structured-outputs-config', + **vllm_kwargs["structured_outputs_config"]) # Other arguments parser.add_argument('--disable-log-stats', @@ -1421,14 +1425,25 @@ class EngineArgs: load_config = self.create_load_config() - decoding_config = DecodingConfig( - backend=self.guided_decoding_backend, - disable_fallback=self.guided_decoding_disable_fallback, - disable_any_whitespace=self.guided_decoding_disable_any_whitespace, - disable_additional_properties=\ - self.guided_decoding_disable_additional_properties, - reasoning_backend=self.reasoning_parser - ) + # Pass reasoning_parser into StructuredOutputsConfig + if self.reasoning_parser: + self.structured_outputs_config.reasoning_parser = \ + self.reasoning_parser + + # Forward the deprecated CLI args to the StructuredOutputsConfig + so_config = self.structured_outputs_config + if self.guided_decoding_backend is not None: + so_config.guided_decoding_backend = \ + self.guided_decoding_backend + if 
self.guided_decoding_disable_fallback is not None: + so_config.guided_decoding_disable_fallback = \ + self.guided_decoding_disable_fallback + if self.guided_decoding_disable_any_whitespace is not None: + so_config.guided_decoding_disable_any_whitespace = \ + self.guided_decoding_disable_any_whitespace + if self.guided_decoding_disable_additional_properties is not None: + so_config.guided_decoding_disable_additional_properties = \ + self.guided_decoding_disable_additional_properties observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( @@ -1446,7 +1461,7 @@ class EngineArgs: lora_config=lora_config, speculative_config=speculative_config, load_config=load_config, - decoding_config=decoding_config, + structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1ae82c9f6f6f9..6793041abc502 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,9 +10,8 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig, +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -955,10 +954,6 @@ class AsyncLLMEngine(EngineClient): """Get the parallel configuration of the vLLM engine.""" return self.engine.get_parallel_config() - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - async def get_scheduler_config(self) -> SchedulerConfig: """Get the scheduling configuration of the vLLM engine.""" return self.engine.get_scheduler_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 34b5dcb587503..708f3bbeeff15 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -16,9 +16,8 @@ import torch from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig, +from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats @@ -213,8 +212,7 @@ class LLMEngine: self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) + self.structured_outputs_config = vllm_config.structured_outputs_config self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -364,10 +362,9 @@ class LLMEngine: self.observability_config.otlp_traces_endpoint) # Initialize reasoning parser if reasoning backend is set. 
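# --- Illustrative sketch, not part of this patch: the reasoning parser now
# lives on StructuredOutputsConfig (field reasoning_parser, formerly
# reasoning_backend on DecodingConfig), so engine code reads it from
# vllm_config.structured_outputs_config as in the hunk below. The backend and
# parser names here are examples only.
from vllm.config import StructuredOutputsConfig

so_config = StructuredOutputsConfig(backend="xgrammar",
                                    reasoning_parser="deepseek_r1")
# Passed through, e.g.: LLM(model=..., structured_outputs_config=so_config)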
- if self.decoding_config.reasoning_backend and \ - self.tokenizer: + if self.structured_outputs_config.reasoning_parser and self.tokenizer: reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.decoding_config.reasoning_backend) + self.structured_outputs_config.reasoning_parser) self.reasoner: ReasoningParser = reasoner_class( self.tokenizer.get_lora_tokenizer()) @@ -381,7 +378,8 @@ class LLMEngine: self.seq_counter, stop_checker=StopChecker( self.scheduler_config.max_model_len, - self.reasoner if self.decoding_config.reasoning_backend + self.reasoner + if self.structured_outputs_config.reasoning_parser and self.tokenizer else None, ), )) @@ -772,10 +770,6 @@ class LLMEngine: """Gets the parallel configuration.""" return self.parallel_config - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config - def get_scheduler_config(self) -> SchedulerConfig: """Gets the scheduler configuration.""" return self.scheduler_config diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 808d2d0ce3d28..c345f17e6614f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -248,11 +248,6 @@ class EngineClient(ABC): """Get the model configuration of the vLLM engine.""" ... - @abstractmethod - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - ... 
- @abstractmethod async def get_input_preprocessor(self) -> InputPreprocessor: """Get the input processor of the vLLM engine.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f2264292fa660..63e9478612bb1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -15,8 +15,8 @@ import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) +from vllm.config import (CompilationConfig, ModelDType, + StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) from vllm.engine.llm_engine import LLMEngine @@ -192,6 +192,8 @@ class LLM: hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, + structured_outputs_config: Optional[Union[dict[ + str, Any], StructuredOutputsConfig]] = None, kv_cache_memory_bytes: Optional[int] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, @@ -236,14 +238,30 @@ class LLM: compilation_config_instance = CompilationConfig( level=compilation_config) elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) + **{ + k: v + for k, v in compilation_config.items() + if is_init_field(CompilationConfig, k) + }) else: compilation_config_instance = compilation_config else: compilation_config_instance = CompilationConfig() + if structured_outputs_config is not None: + if isinstance(structured_outputs_config, dict): + structured_outputs_instance = StructuredOutputsConfig( + **{ + k: v + for k, v in structured_outputs_config.items() + if is_init_field(StructuredOutputsConfig, k) + }) + else: + structured_outputs_instance = structured_outputs_config + else: + structured_outputs_instance = StructuredOutputsConfig() + engine_args = EngineArgs( model=model, runner=runner, @@ -271,6 +289,7 @@ class LLM: hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, + structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, **kwargs, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c07e95e9370a0..93ea846f26f6c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1678,7 +1678,7 @@ async def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, tool_server=tool_server, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1697,7 +1697,7 @@ async def init_app_state( exclude_tools_when_tool_choice_none=args. 
exclude_tools_when_tool_choice_none, tool_parser=args.tool_call_parser, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1800,10 +1800,10 @@ def validate_api_server_args(args): f"(chose from {{ {','.join(valid_tool_parses)} }})") valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() - if args.reasoning_parser \ - and args.reasoning_parser not in valid_reasoning_parses: + if ((reasoning_parser := args.structured_outputs_config.reasoning_parser) + and reasoning_parser not in valid_reasoning_parses): raise KeyError( - f"invalid reasoning parser: {args.reasoning_parser} " + f"invalid reasoning parser: {reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2505e493625d8..cff4a45fdc43e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -54,8 +54,8 @@ from vllm.entrypoints.score_utils import (ScoreContentPartParam, from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.pooling_params import PoolingParams -from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, - RequestOutputKind, SamplingParams) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams, StructuredOutputsParams) from vllm.utils import random_uuid, resolve_obj_by_qualname logger = init_logger(__name__) @@ -373,11 +373,12 @@ class ResponsesRequest(OpenAIBaseModel): stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output - guided_decoding = None + structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format - if response_format.type == "json_schema": - guided_decoding = GuidedDecodingParams.from_optional( + if (response_format.type == "json_schema" + and response_format.schema_ is not None): + structured_outputs = StructuredOutputsParams( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -392,7 +393,7 @@ class ResponsesRequest(OpenAIBaseModel): stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, ) def is_include_output_logprobs(self) -> bool: @@ -547,42 +548,9 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - structural_tag: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the 
structural tag schema."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. If set, must be either " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -701,31 +669,33 @@ class ChatCompletionRequest(OpenAIBaseModel): if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - guided_json_object = None - if self.response_format is not None: - if self.response_format.type == "json_object": - guided_json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema - assert json_schema is not None - self.guided_json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format - assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat) - s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structural_tag = json.dumps(s_tag_obj) + response_format = self.response_format + json_schema_from_tool = self._get_json_schema_from_tool() + if response_format is not None or json_schema_from_tool is not None: + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() - guided_decoding = GuidedDecodingParams.from_optional( - json=self._get_guided_json_from_tool() or self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - structural_tag=self.structural_tag, - ) + # Set structured output params for response format + if response_format is not None: + if response_format.type == "json_object": + self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema + assert json_schema is not None + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structured_outputs.structural_tag = json.dumps( + s_tag_obj) + + # Set structured output params for tool calling + if json_schema_from_tool is not None: + self.structured_outputs.json = json_schema_from_tool extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -757,15 +727,14 @@ class ChatCompletionRequest(OpenAIBaseModel): truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, - bad_words= self.bad_words, + bad_words=self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) - def _get_guided_json_from_tool( - self) -> Optional[Union[str, dict, BaseModel]]: + def _get_json_schema_from_tool(self) -> 
Optional[Union[str, dict]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: return None @@ -875,28 +844,31 @@ class ChatCompletionRequest(OpenAIBaseModel): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): + def check_structured_outputs_count(cls, data): if isinstance(data, ValueError): raise data - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None - ]) - # you can only use one kind of guided decoding - if guide_count > 1: + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) + # you can only use one kind of constraints for structured outputs + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") - # you can only either use guided decoding or tools, not both - if guide_count > 1 and data.get("tool_choice", "none") not in ( + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( "none", "auto", "required", ): raise ValueError( - "You can only either use guided decoding or tools, not both.") + "You can only either use constraints for structured outputs " + "or tools, not both.") return data @model_validator(mode="before") @@ -1049,37 +1021,9 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." ), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, - description="If specified, the output will follow the JSON schema.", - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be one of " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -1210,20 +1154,10 @@ class CompletionRequest(OpenAIBaseModel): echo_without_generation = self.echo and self.max_tokens == 0 - guided_json_object = None - if (self.response_format is not None + if (self.structured_outputs is not None + and self.response_format is not None and self.response_format.type == "json_object"): - guided_json_object = True - - guided_decoding = GuidedDecodingParams.from_optional( - json=self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - ) + self.structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1255,7 +1189,7 @@ class CompletionRequest(OpenAIBaseModel): truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, @@ -1263,16 +1197,18 @@ class CompletionRequest(OpenAIBaseModel): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None - ]) - if guide_count > 1: + def check_structured_outputs_count(cls, data): + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") return data @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index cd85baa9ba661..16564214e353a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -993,7 +993,7 @@ class OpenAIServingChat(OpenAIServing): # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing - # only happens if we are NOT using guided decoding + # only happens if we are NOT using structured outputs auto_tools_called = False if tool_parser: auto_tools_called = len( diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 687af7a189cea..ce3d23763ed64 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -262,9 +262,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - decoding_config = vllm_config.decoding_config - if decoding_config.reasoning_backend == "": - 
decoding_config.reasoning_backend = "openai_gptoss" + structured_outputs_config = vllm_config.structured_outputs_config + if structured_outputs_config.reasoning_parser == "": + structured_outputs_config.reasoning_parser = "openai_gptoss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fe93e906064e4..0a01cb0260ae5 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy -from dataclasses import dataclass +from dataclasses import field from enum import Enum, IntEnum from functools import cached_property from typing import Annotated, Any, Optional, Union import msgspec -from pydantic import BaseModel +from pydantic.dataclasses import dataclass from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -28,60 +28,35 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass -class GuidedDecodingParams: - """One of these fields will be used to build a logit processor.""" +class StructuredOutputsParams: + # One of these fields will be used to build a logit processor. json: Optional[Union[str, dict]] = None regex: Optional[str] = None choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None - """These are other options that can be set""" - backend: Optional[str] = None - backend_was_auto: bool = False + # These are other options that can be set. disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None - @staticmethod - def from_optional( - json: Optional[Union[dict, BaseModel, str]] = None, - regex: Optional[str] = None, - choice: Optional[list[str]] = None, - grammar: Optional[str] = None, - json_object: Optional[bool] = None, - backend: Optional[str] = None, - whitespace_pattern: Optional[str] = None, - structural_tag: Optional[str] = None, - ) -> Optional["GuidedDecodingParams"]: - if all(arg is None for arg in (json, regex, choice, grammar, - json_object, structural_tag)): - return None - # Extract json schemas from pydantic models - if isinstance(json, (BaseModel, type(BaseModel))): - json = json.model_json_schema() - return GuidedDecodingParams( - json=json, - regex=regex, - choice=choice, - grammar=grammar, - json_object=json_object, - backend=backend, - whitespace_pattern=whitespace_pattern, - structural_tag=structural_tag, - ) + _backend: Optional[str] = field(default=None, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" + _backend_was_auto: bool = field(default=False, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" def __post_init__(self): """Validate that some fields are mutually exclusive.""" - guide_count = sum([ + count = sum([ self.json is not None, self.regex is not None, self.choice is not None, self.grammar is not None, self.json_object is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding but multiple are " - f"specified: {self.__dict__}") + "You can only use one kind of structured outputs constraint " + f"but multiple are specified: {self.__dict__}") class RequestOutputKind(Enum): @@ -196,9 +171,8 @@ class SamplingParams( 
_all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors - guided_decoding: Optional[GuidedDecodingParams] = None - """If provided, the engine will construct a guided decoding logits - processor from these parameters.""" + structured_outputs: Optional[StructuredOutputsParams] = None + """Parameters for configuring structured outputs.""" logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -246,7 +220,7 @@ class SamplingParams( msgspec.Meta( ge=-1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, - guided_decoding: Optional[GuidedDecodingParams] = None, + structured_outputs: Optional[StructuredOutputsParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, allowed_token_ids: Optional[list[int]] = None, extra_args: Optional[dict[str, Any]] = None, @@ -288,7 +262,7 @@ class SamplingParams( logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=logit_bias, allowed_token_ids=allowed_token_ids, extra_args=extra_args, @@ -559,7 +533,7 @@ class SamplingParams( "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"guided_decoding={self.guided_decoding}, " + f"structured_outputs={self.structured_outputs}, " f"extra_args={self.extra_args})") diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 5b07327cf2b81..d8a8d19391cd0 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -274,7 +274,7 @@ class MistralTokenizer(TokenizerBase): return tokenizer_file # the following attributes are set to fit vLLM's design and are used - # by the guided structured output backends. + # by the structured output backends. @property def all_special_tokens_extended(self) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens @@ -463,9 +463,6 @@ class MistralTokenizer(TokenizerBase): return decoded - # WARN: Outlines logits processors can overwrite this method. - # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer - # for more. 
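Stepping back from the diff for a moment: the renamed dataclass above keeps the one-constraint-at-a-time rule in `__post_init__`, while backend selection moves to the private `_backend` field owned by the processor. A hedged usage sketch of the public surface — field names are taken verbatim from the diff, but the call site is illustrative:

```python
# Minimal sketch: exactly one of json/regex/choice/grammar/json_object may
# be set on StructuredOutputsParams; setting more than one raises
# ValueError in __post_init__. SamplingParams carries it as
# `structured_outputs`, replacing the removed `guided_decoding` field.
from vllm.sampling_params import SamplingParams, StructuredOutputsParams

schema = {"type": "object", "properties": {"city": {"type": "string"}}}
params = SamplingParams(
    max_tokens=64,
    structured_outputs=StructuredOutputsParams(json=schema),
)
```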
def decode(self, ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f17c269e4709e..73165c7e4c0ad 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -588,9 +588,6 @@ class AsyncLLM(EngineClient): async def get_model_config(self) -> ModelConfig: return self.model_config - async def get_decoding_config(self): - raise ValueError("Not Supported on V1 yet.") - async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8d9f2ba1ec825..71f539583a1be 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -45,7 +45,7 @@ class Processor: self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config - self.decoding_config = vllm_config.decoding_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer self.generation_config_fields = ( @@ -219,58 +219,57 @@ class Processor: "[lora_path]` to use the LoRA tokenizer.") def _validate_structured_output(self, params: SamplingParams) -> None: - if not params.guided_decoding or not self.decoding_config: + if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.guided_decoding: + if self.model_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.decoding_config.backend - if params.guided_decoding.backend: - # Request-level backend selection is not supported in V1. + backend = self.structured_outputs_config.backend + if _backend := params.structured_outputs._backend: + # Request-level backend selection is not supported. # The values may differ if `params` is reused and was set # to a specific backend based on `auto` behavior in a previous # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. - if (params.guided_decoding.backend != engine_level_backend - and not (engine_level_backend == "auto" - and params.guided_decoding.backend_was_auto)): + # using the `_backend_was_auto` field set in the params. + if (backend != _backend + and not (backend == "auto" + and params.structured_outputs._backend_was_auto)): raise ValueError( - "Request-level structured output backend selection is no " - "longer supported. The request specified " - f"'{params.guided_decoding.backend}', but vLLM was " - f"initialised with '{engine_level_backend}'. This error " - "can be resolved by removing backend selection from the " - "request.") + "Request-level structured output backend selection is not " + f"supported. The request specified '{_backend}', but vLLM " + f"was initialised with '{backend}'. 
This error can be " + "resolved by removing '_backend' from the request.") else: - params.guided_decoding.backend = engine_level_backend + params.structured_outputs._backend = backend # Request content validation - if (isinstance(params.guided_decoding.choice, list) - and not params.guided_decoding.choice): + if (isinstance(params.structured_outputs.choice, list) + and not params.structured_outputs.choice): # It is invalid for choice to be an empty list - raise ValueError(f"Choice '{params.guided_decoding.choice}' " - "cannot be an empty list") + raise ValueError( + f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 + ) - if engine_level_backend.startswith("xgrammar"): + if backend.startswith("xgrammar"): # xgrammar with no fallback validate_xgrammar_grammar(params) - elif engine_level_backend.startswith("guidance"): + elif backend.startswith("guidance"): # TODO: ideally we would have the LLTokenizer here as Lark syntax # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. validate_guidance_grammar(params, tokenizer=None) - elif engine_level_backend == "outlines": + elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) - elif engine_level_backend == "lm-format-enforcer": + elif backend == "lm-format-enforcer": # lm format enforcer backend validate_structured_output_request_lm_format_enforcer(params) else: - # NOTE: engine_level_backend must be "auto" here, because we have + # NOTE: backend must be "auto" here, because we have # checked supported_backends above. # In this mode, we set opinionated defaults based on what we think # will satisfy the most use cases without having to worry about @@ -278,15 +277,15 @@ class Processor: # other setting where a specific backend was specified. try: validate_xgrammar_grammar(params) - params.guided_decoding.backend = "xgrammar" + params.structured_outputs._backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) - params.guided_decoding.backend = "guidance" + params.structured_outputs._backend = "guidance" # Remember that this backend was set automatically - params.guided_decoding.backend_was_auto = True + params.structured_outputs._backend_was_auto = True def _maybe_build_mm_uuids( self, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 4e3e581235cce..145af788d2372 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -67,7 +67,7 @@ class Request: # Generative models. 
assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.guided_decoding is not None: + if sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM self.use_structured_output = True diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 1ab29dfecd9e4..13c33d3edf141 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -61,11 +61,11 @@ class StructuredOutputManager: self.executor = ThreadPoolExecutor(max_workers=max_workers) self.tokenizer = init_tokenizer_from_configs( model_config=self.vllm_config.model_config) - reasoning_backend = \ - self.vllm_config.decoding_config.reasoning_backend - if reasoning_backend: + reasoning_parser = \ + self.vllm_config.structured_outputs_config.reasoning_parser + if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) + reasoning_parser) self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: @@ -74,15 +74,16 @@ class StructuredOutputManager: if TYPE_CHECKING: assert request.sampling_params is not None and \ - request.sampling_params.guided_decoding is not None + request.sampling_params.structured_outputs is not None # Initialize the backend the first time it is needed. # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). + # _backend is set in Processor._validate_structured_output if self.backend is None: assert request.sampling_params is not None - backend = request.sampling_params.guided_decoding.backend + backend = request.sampling_params.structured_outputs._backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 02e7fc33f517d..e06ab6377de3a 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace self.disable_additional_properties = \ - self.vllm_config.decoding_config.disable_additional_properties + self.vllm_config.structured_outputs_config.disable_additional_properties self.ll_tokenizer = llguidance_hf.from_tokenizer( self.tokenizer, self.vocab_size) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 2279a1c8c8a00..465b2428f8938 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -138,30 +138,30 @@ class LMFormatEnforcerBackend(StructuredOutputBackend): def validate_structured_output_request_lm_format_enforcer( params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - gd_params = params.guided_decoding + so_params = params.structured_outputs - if gd_params.regex: + if so_params.regex: return - elif gd_params.json: - if isinstance(gd_params.json, str): + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) + json.loads(so_params.json) except 
json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - json.dumps(gd_params.json) + json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e return - elif gd_params.choice: + elif so_params.choice: return - elif gd_params.grammar: - raise ValueError("LM Format Enforcer guided decoding backend " + elif so_params.grammar: + raise ValueError("LM Format Enforcer structured outputs backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 572e4984480fa..e5e638a6ad764 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -158,36 +158,36 @@ class OutlinesGrammar(StructuredOutputGrammar): def validate_structured_output_request_outlines(params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - gd_params = params.guided_decoding + so_params = params.structured_outputs - if gd_params.regex: - validate_regex_is_buildable(gd_params.regex) - elif gd_params.json: - if isinstance(gd_params.json, str): + if so_params.regex: + validate_regex_is_buildable(so_params.regex) + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) - schema = gd_params.json + json.loads(so_params.json) + schema = so_params.json except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - schema = json.dumps(gd_params.json) + schema = json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) - elif gd_params.choice: - choices = [regex_escape(str(choice)) for choice in gd_params.choice] + elif so_params.choice: + choices = [regex_escape(str(choice)) for choice in so_params.choice] regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) - elif gd_params.grammar: - raise ValueError("Outlines guided decoding backend " + elif so_params.grammar: + raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") @@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None: _check_unsupported(parsed) except ValueError as e: raise ValueError( - f"Regex uses unsupported feature for guided decoding: {e}. " + f"Regex uses unsupported feature for structured outputs: {e}. " "Only basic matching constructs are supported—lookarounds, " "backreferences, and unicode boundaries are not.") from e @@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None: "Regex does not have a anchored universal start state" "This means that the Regex uses anchors (^) or look-arounds " "in a way which requires context before any token is matched." - "Guided decoding needs regexes that can match without needing " + "structured outputs needs regexes that can match without needing " "that context. Try rewriting the pattern without using these " f"constructs. 
Pattern:\n{pattern}") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 5e00f63804162..55b4792fe010d 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. @@ -248,37 +248,37 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: Raises ValueError if the request is not supported. """ - if sampling_params.guided_decoding is None: + if sampling_params.structured_outputs is None: return - gd_params = sampling_params.guided_decoding + so_params = sampling_params.structured_outputs - if gd_params.regex: + if so_params.regex: try: - xgr.Grammar.from_regex(gd_params.regex) + xgr.Grammar.from_regex(so_params.regex) except Exception as err: raise ValueError("Failed to transform regex into a grammar: " f"{err}") from err - if gd_params.choice: - choice_grammar = choice_as_grammar(gd_params.choice) + if so_params.choice: + choice_grammar = choice_as_grammar(so_params.choice) try: xgr.Grammar.from_ebnf(choice_grammar) except Exception as err: raise ValueError("Failed to transform choices into a grammar: " "{err}") from err - gd_params.choice = None - gd_params.grammar = choice_grammar + so_params.choice = None + so_params.grammar = choice_grammar return - if gd_params.json: - if isinstance(gd_params.json, str): + if so_params.json: + if isinstance(so_params.json, str): try: - schema = json.loads(gd_params.json) + schema = json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: - schema = gd_params.json + schema = so_params.json try: xgr.Grammar.from_json_schema(schema) @@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: "supported by xgrammar.") return - if gd_params.grammar: - if grammar_is_likely_lark(gd_params.grammar): + if so_params.grammar: + if grammar_is_likely_lark(so_params.grammar): # xgrammar supports EBNF grammars only try: - gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar) + so_params.grammar = convert_lark_to_ebnf(so_params.grammar) except ValueError as e: raise ValueError( "Failed to convert the grammar from Lark to EBNF. ") from e @@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: # Test parsing EBNF grammar, possibly already converted from Lark try: # parse the grammar, but we aren't compiling it. 
- xgr.Grammar.from_ebnf(gd_params.grammar) + xgr.Grammar.from_ebnf(so_params.grammar) except Exception as e: raise ValueError("Invalid grammar specification.") from e return - if gd_params.structural_tag: + if so_params.structural_tag: try: - s_tag = json.loads(gd_params.structural_tag) + s_tag = json.loads(so_params.structural_tag) tags = [ xgr.StructuralTagItem( begin=s["begin"], diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index fc365f12573fc..99974ef46ecd5 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -60,7 +60,7 @@ class StructuredOutputRequest: def get_structured_output_key( sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding + params = sampling_params.structured_outputs assert params is not None, "params can't be None." if params.json is not None: if not isinstance(params.json, str): From 4f02b77de4e794a0d417ed98a26884208f75e043 Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:43:23 +0800 Subject: [PATCH 415/584] Fix: Add explicit #include <omp.h> for OpenMP compatibility on certain toolchains (#24951) Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn> Signed-off-by: ihb2032 <1355790728@qq.com> --- csrc/cpu/cpu_types.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 17bbe04eef94a..c3a21796881c9 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -17,4 +17,8 @@ #warning "unsupported vLLM cpu implementation" #endif +#ifdef _OPENMP + #include <omp.h> +#endif + #endif \ No newline at end of file From abdfcd4f3dc21dc162baf6887f658fb0f2f3d783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com> Date: Thu, 18 Sep 2025 12:25:12 +0200 Subject: [PATCH 416/584] silu-v1: Fix EPS not being used during max-reduction (#25069) Signed-off-by: elvircrn <elvircrn@gmail.com> --- csrc/quantization/activation_kernels.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 9ddb5af3052fa..9aa1411b4a25c 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( int32_t compute_pipeline_offset_64 = 0; for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) { - __nv_bfloat16 y_max_bf16 = EPS; __nv_bfloat162 results_bf162[2]; cp_async_wait<NUM_STAGES - 2>(); @@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( auto _y_max2 = __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1])); - y_max_bf16 = __hmax(_y_max2.x, _y_max2.y); + __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y)); // An entire group is assigned to a single warp, so a simple warp reduce // is used. 
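Before the next commit, a note on the one-line kernel fix above: previously `y_max_bf16` was merely initialized to `EPS` and then overwritten by the warp's max, so `EPS` never participated in the reduction; the fix folds it into the `__hmax`, guaranteeing a nonzero quantization scale even for an all-zero group. The numerical intent, sketched in Python rather than CUDA (the `EPS` value here is a placeholder, not the kernel's constant):

```python
# Sketch of the corrected reduction: the per-group max-abs is clamped from
# below by EPS, mirroring __hmax(EPS, __hmax(|y0|, |y1|)) in the kernel.
import torch

EPS = 1e-10  # assumption: stands in for the kernel's EPS constant

def group_max_for_scale(y: torch.Tensor) -> torch.Tensor:
    return y.abs().amax().clamp(min=EPS)
```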
From cc935fdd7e0c466cd556b6515e435dddd78677e0 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Thu, 18 Sep 2025 18:34:42 +0800 Subject: [PATCH 417/584] [Frontend] Support setting logprobs to -1 (#25031) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- tests/entrypoints/openai/test_chat_echo.py | 23 ++++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 8 +++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 0f459dd3d8574..ce965eb829248 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -99,3 +99,26 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI): assert completion.prompt_logprobs is not None assert len(completion.prompt_logprobs) > 0 + + +@pytest.mark.asyncio +async def test_top_logprobs(client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Beijing is the capital of which country?" + }] + + completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body={ + "top_logprobs": -1, + "logprobs": "true", + }, + ) + assert completion.choices[0].logprobs is not None + assert completion.choices[0].logprobs.content is not None + assert len(completion.choices[0].logprobs.content) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index cff4a45fdc43e..7ad8e73d89d59 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -832,10 +832,12 @@ class ChatCompletionRequest(OpenAIBaseModel): raise ValueError("`prompt_logprobs=-1` is only supported with " "vLLM engine V1.") if (top_logprobs := data.get("top_logprobs")) is not None: - if top_logprobs < 0: - raise ValueError("`top_logprobs` must be a positive value.") + if top_logprobs < 0 and top_logprobs != -1: + raise ValueError( + "`top_logprobs` must be a positive value or -1.") - if top_logprobs > 0 and not data.get("logprobs"): + if (top_logprobs == -1 + or top_logprobs > 0) and not data.get("logprobs"): raise ValueError( "when using `top_logprobs`, `logprobs` must be set to true." 
) From 37970105fed95d58677f0a4635cb253a71e8817c Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Thu, 18 Sep 2025 19:04:21 +0800 Subject: [PATCH 418/584] [Model] Improve Pooling Model (#25149) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- vllm/model_executor/layers/pooler.py | 12 ++++++------ vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index b571a8f866990..4a97438b1bb2c 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -12,8 +12,9 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config import ModelConfig, PoolerConfig +from vllm.config import ModelConfig, PoolerConfig, get_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.models.adapters import _load_st_projector from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.tasks import PoolingTask @@ -377,7 +378,6 @@ class PoolerClassify(PoolerActivation): super().__init__() if static_num_labels: - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.num_labels = getattr(vllm_config.model_config.hf_config, "num_labels", 0) @@ -427,8 +427,6 @@ class EmbeddingPoolerHead(PoolerHead): super().__init__(activation=PoolerNormalize()) # Load ST projector if available - from vllm.config import get_current_vllm_config - from vllm.model_executor.models.adapters import _load_st_projector vllm_config = get_current_vllm_config() self.projector: Optional[nn.Module] = _load_st_projector( @@ -489,7 +487,6 @@ class RewardPoolerHead(PoolerHead): def __init__(self) -> None: super().__init__(activation=PoolerClassify(static_num_labels=False)) - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.head_dtype = vllm_config.model_config.head_dtype @@ -638,7 +635,6 @@ class ClassifierPooler(Pooler): ) -> None: super().__init__() - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.pooling = pooling @@ -730,3 +726,7 @@ class DispatchPooler(Pooler): offset += num_items return PoolerOutput(outputs) + + def extra_repr(self) -> str: + s = f"supported_task={self.get_supported_tasks()}" + return s diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e8ad9c2fca07c..2e67984cb4327 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3151,6 +3151,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model = cast(VllmModelForPooling, self.get_model()) dummy_pooling_params = PoolingParams(task=task) + dummy_pooling_params.verify(task=task, model_config=self.model_config) to_update = model.pooler.get_pooling_updates(task) to_update.apply(dummy_pooling_params) From 8ed039d52775aaee4a61663dd5d8c840f5eebd15 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:24:27 +0100 Subject: [PATCH 419/584] Move `StructuredOutputsConfig` from `config/__init__.py` to `config/structured_outputs.py` (#25153) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 61 +---------------------------- vllm/config/structured_outputs.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 60 
deletions(-) create mode 100644 vllm/config/structured_outputs.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 9a1c5f0b0d453..69ab5712d404c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -42,6 +42,7 @@ from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig +from vllm.config.structured_outputs import StructuredOutputsConfig from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -2277,66 +2278,6 @@ def get_served_model_name(model: str, return served_model_name -StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", - "lm-format-enforcer"] - - -@config -@dataclass -class StructuredOutputsConfig: - """Dataclass which contains structured outputs config for the engine.""" - - backend: StructuredOutputsBackend = "auto" - """Which engine will be used for structured outputs (e.g. JSON schema, - regex, etc) by default. With "auto", we will make opinionated choices - based on request contents and what the backend libraries currently support, - so the behavior is subject to change in each release.""" - - disable_fallback: bool = False - """If `True`, vLLM will not fallback to a different backend on error.""" - - disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during structured - outputs. This is only supported for xgrammar and guidance backends.""" - - disable_additional_properties: bool = False - """If `True`, the `guidance` backend will not use `additionalProperties` - in the JSON schema. This is only supported for the `guidance` backend and - is used to better align its behaviour with `outlines` and `xgrammar`.""" - - reasoning_parser: str = "" - """Select the reasoning parser depending on the model that you're using. - This is used to parse the reasoning content into OpenAI API format.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. 
- factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - if (self.disable_any_whitespace - and self.backend not in ("xgrammar", "guidance")): - raise ValueError("disable_any_whitespace is only supported for " - "xgrammar and guidance backends.") - if (self.disable_additional_properties and self.backend != "guidance"): - raise ValueError("disable_additional_properties is only supported " - "for the guidance backend.") - - DetailedTraceModules = Literal["model", "worker", "all"] diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py new file mode 100644 index 0000000000000..b1f14294510f8 --- /dev/null +++ b/vllm/config/structured_outputs.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Literal + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + +StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", + "lm-format-enforcer"] + + +@config +@dataclass +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" + + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" + disable_fallback: bool = False + """If `True`, vLLM will not fallback to a different backend on error.""" + disable_any_whitespace: bool = False + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" + disable_additional_properties: bool = False + """If `True`, the `guidance` backend will not use `additionalProperties` + in the JSON schema. This is only supported for the `guidance` backend and + is used to better align its behaviour with `outlines` and `xgrammar`.""" + reasoning_parser: str = "" + """Select the reasoning parser depending on the model that you're using. + This is used to parse the reasoning content into OpenAI API format.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self): + if (self.disable_any_whitespace + and self.backend not in ("xgrammar", "guidance")): + raise ValueError("disable_any_whitespace is only supported for " + "xgrammar and guidance backends.") + if (self.disable_additional_properties and self.backend != "guidance"): + raise ValueError("disable_additional_properties is only supported " + "for the guidance backend.") From eaffe4486cb1d7edf884e6e254cab33fc397e308 Mon Sep 17 00:00:00 2001 From: Kay Yan <kay.yan@daocloud.io> Date: Thu, 18 Sep 2025 19:36:47 +0800 Subject: [PATCH 420/584] [Docs] Fix pooling-params doc references in openai_compatible_server.md (#24939) --- docs/api/README.md | 1 - docs/serving/openai_compatible_server.md | 20 ++++++++++++-------- vllm/pooling_params.py | 20 ++++++++++++++------ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 148211756480c..86e310f567dd3 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -46,7 +46,6 @@ Engine classes for offline and online inference. Inference parameters for vLLM APIs. [](){ #sampling-params } -[](){ #pooling-params } - [vllm.SamplingParams][] - [vllm.PoolingParams][] diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index bc52d02a50bd2..bac3f6c1fe90c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -317,10 +317,11 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli #### Extra parameters -The following [pooling parameters][pooling-params] are supported. +The following [pooling parameters][vllm.PoolingParams] are supported. ```python ---8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params" +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:embedding-pooling-params" ``` The following extra parameters are supported by default: @@ -527,10 +528,11 @@ curl -v "http://127.0.0.1:8000/classify" \ #### Extra parameters -The following [pooling parameters][pooling-params] are supported. +The following [pooling parameters][vllm.PoolingParams] are supported. ```python ---8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params" +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classification-pooling-params" ``` The following extra parameters are supported: @@ -733,10 +735,11 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu #### Extra parameters -The following [pooling parameters][pooling-params] are supported. +The following [pooling parameters][vllm.PoolingParams] are supported. ```python ---8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params" +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classification-pooling-params" ``` The following extra parameters are supported: @@ -815,10 +818,11 @@ Result documents will be sorted by relevance, and the `index` property can be us #### Extra parameters -The following [pooling parameters][pooling-params] are supported. +The following [pooling parameters][vllm.PoolingParams] are supported. 
```python ---8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params" +--8<-- "vllm/pooling_params.py:common-pooling-params" +--8<-- "vllm/pooling_params.py:classification-pooling-params" ``` The following extra parameters are supported: diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 6672392b8d080..a6313367457a4 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -20,25 +20,33 @@ class PoolingParams( """API parameters for pooling models. Attributes: + truncate_prompt_tokens: Controls prompt truncation. + Set to -1 to use the model's default truncation size. + Set to k to keep only the last k tokens (left truncation). + Set to None to disable truncation. normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings - if model support matryoshka representation. + if model support matryoshka representation. activation: Whether to apply activation function to - the classification outputs. + the classification outputs. softmax: Whether to apply softmax to the reward outputs. """ + + # --8<-- [start:common-pooling-params] truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None - """If set to -1, will use the truncation size supported by the model. If - set to an integer k, will use only the last k tokens from the prompt - (i.e., left truncation). If set to `None`, truncation is disabled.""" + # --8<-- [end:common-pooling-params] ## for embeddings models + # --8<-- [start:embedding-pooling-params] dimensions: Optional[int] = None normalize: Optional[bool] = None + # --8<-- [end:embedding-pooling-params] - ## for classification models + ## for classification, scoring and rerank + # --8<-- [start:classification-pooling-params] activation: Optional[bool] = None + # --8<-- [end:classification-pooling-params] ## for reward models softmax: Optional[bool] = None From c9ff9e6f0cf48615bad1525caeef3025c62b2720 Mon Sep 17 00:00:00 2001 From: William Song <jinwook@umich.edu> Date: Thu, 18 Sep 2025 20:37:08 +0900 Subject: [PATCH 421/584] [Docs] add the parallel sampling usage in LLMEngine and AsyncLLM (#24222) --- vllm/sampling_params.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 0a01cb0260ae5..efe70d019ccc6 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -81,7 +81,13 @@ class SamplingParams( """ n: int = 1 - """Number of output sequences to return for the given prompt.""" + """Number of outputs to return for the given prompt request. + + NOTE: + `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs + are generated and streamed cumulatively per request. To see all `n` + outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY` + in `SamplingParams`.""" best_of: Optional[int] = None """Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. 
`best_of` From 5a33ae9a3faae79cad9d2659862fcd8d86483659 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:41:41 +0100 Subject: [PATCH 422/584] Fix forward reference warning in documentation (#25150) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/async_timeout.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 28a023a71ef52..3b9c055160c1b 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -16,19 +16,6 @@ if sys.version_info[:2] >= (3, 11): from asyncio import timeout as asyncio_timeout else: - def asyncio_timeout(delay: Optional[float]) -> "Timeout": - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) - class _State(enum.Enum): INIT = "INIT" ENTER = "ENTER" @@ -171,3 +158,16 @@ else: self._state = _State.TIMEOUT # drop the reference early self._timeout_handler = None + + def asyncio_timeout(delay: Optional[float]) -> Timeout: + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) From 3ed1ec4af25a9cb7dcfea74b839864fc3c8ba09d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 13:06:28 +0100 Subject: [PATCH 423/584] Fix `validate-config` pre-commit check (#25157) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 4 +--- tools/validate_config.py | 23 ++++++++++++++++------- vllm/config/__init__.py | 2 ++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c16bdeeecd07a..13ad3af97d839 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -164,9 +164,7 @@ repos: name: Validate configuration has default values and that each field has a docstring entry: python tools/validate_config.py language: python - types: [python] - pass_filenames: true - files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py + additional_dependencies: [regex] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/tools/validate_config.py b/tools/validate_config.py index 8b1e955c653d7..f6439fa9ada5f 100644 --- a/tools/validate_config.py +++ b/tools/validate_config.py @@ -9,6 +9,8 @@ import ast import inspect import sys +import regex as re + def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]: """ @@ -88,11 +90,12 @@ def validate_class(class_node: ast.ClassDef): for stmt in class_node.body: # A field is defined as a class variable that has a type annotation. 
if isinstance(stmt, ast.AnnAssign): - # Skip ClassVar + # Skip ClassVar and InitVar # see https://docs.python.org/3/library/dataclasses.html#class-variables - if isinstance(stmt.annotation, ast.Subscript) and isinstance( - stmt.annotation.value, - ast.Name) and stmt.annotation.value.id == "ClassVar": + # and https://docs.python.org/3/library/dataclasses.html#init-only-variables + if (isinstance(stmt.annotation, ast.Subscript) + and isinstance(stmt.annotation.value, ast.Name) + and stmt.annotation.value.id in {"ClassVar", "InitVar"}): continue if isinstance(stmt.target, ast.Name): @@ -132,7 +135,7 @@ def validate_ast(tree: ast.stmt): def validate_file(file_path: str): try: - print(f"validating {file_path} config dataclasses ", end="") + print(f"Validating {file_path} config dataclasses ", end="") with open(file_path, encoding="utf-8") as f: source = f.read() @@ -140,7 +143,7 @@ def validate_file(file_path: str): validate_ast(tree) except ValueError as e: print(e) - SystemExit(2) + raise SystemExit(1) from e else: print("✅") @@ -151,7 +154,13 @@ def fail(message: str, node: ast.stmt): def main(): for filename in sys.argv[1:]: - validate_file(filename) + # Only run for Python files in vllm/ or tests/ + if not re.match(r"^(vllm|tests)/.*\.py$", filename): + continue + # Only run if the file contains @config + with open(filename, encoding="utf-8") as f: + if "@config" in f.read(): + validate_file(filename) if __name__ == "__main__": diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 69ab5712d404c..25daca00c02d9 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -450,6 +450,8 @@ class ModelConfig: # Multimodal config and init vars multimodal_config: Optional[MultiModalConfig] = None + """Configuration for multimodal model. 
If `None`, this will be inferred + from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None From 66072b36dbf1707440ff43d57273d9e9974349d7 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:21:17 +0300 Subject: [PATCH 424/584] [Bugfix][Mamba] - Fix Conv State Kernel FP32 Support (#24883) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- tests/models/language/generation/test_hybrid.py | 9 ++++++--- vllm/model_executor/layers/mamba/ops/causal_conv1d.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index d0e42062099ec..206ad1352e06e 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -418,7 +418,9 @@ def test_full_cuda_graph( @pytest.mark.parametrize("model", FP32_STATE_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_fp32_state( +@pytest.mark.parametrize("cache_dtype_param", + ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]) +def test_fp32_cache_state( hf_runner, vllm_runner, example_prompts, @@ -426,6 +428,7 @@ def test_fp32_state( model: str, max_tokens: int, num_logprobs: int, + cache_dtype_param: str, ) -> None: try: @@ -443,13 +446,13 @@ def test_fp32_state( m.setenv("VLLM_USE_V1", "0") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32") as vllm_model: + **{cache_dtype_param: "float32"}) as vllm_model: vllm_v0_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32") as vllm_model: + **{cache_dtype_param: "float32"}) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 2a88fa661da01..8cfd0962c5bfe 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -415,6 +415,9 @@ def causal_conv1d_fn( activation = "silu" args = None + # Store original dtype to cast back at the end + original_x_dtype = x.dtype + x = x.to(conv_states.dtype) out = torch.empty_like(x) if metadata is not None: cu_seqlen = metadata.cu_seqlen @@ -613,7 +616,7 @@ def causal_conv1d_fn( BLOCK_N=256, num_stages=2, ) - return out + return out.to(original_x_dtype) @triton.jit() @@ -973,6 +976,9 @@ def causal_conv1d_update( activation = "silu" if activation is True else None elif activation is not None: assert activation in ["silu", "swish"] + + original_x_dtype = x.dtype + x = x.to(conv_state.dtype) unsqueeze = query_start_loc is None and x.dim() == 2 if unsqueeze: # make it (batch, dim, seqlen) with seqlen == 1 @@ -1081,4 +1087,4 @@ def causal_conv1d_update( ) if unsqueeze: out = out.squeeze(-1) - return out + return out.to(original_x_dtype) From 21da73343ad35f756e053ba4155dafb05229b0c5 Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Thu, 18 Sep 2025 05:43:33 -0700 Subject: [PATCH 425/584] [Misc] Clean up flags in `vllm bench serve` (#25138) Signed-off-by: Roger Wang <hey@rogerw.io> 
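The interesting mechanism in this commit is the argparse shim that keeps the old flag working while steering users to `--backend`. A simplified, hypothetical variant is sketched below; unlike the real action further down, this one forwards the value straight to the new destination:

```python
import argparse
import warnings

class DeprecatedAlias(argparse.Action):
    """Parse the old flag, warn, and forward its value."""

    def __call__(self, parser, namespace, values, option_string=None):
        warnings.warn(f"{option_string} is deprecated; use --backend",
                      stacklevel=1)
        setattr(namespace, self.dest, values)

parser = argparse.ArgumentParser()
parser.add_argument("--backend", default="openai")
parser.add_argument("--endpoint-type", dest="backend",
                    action=DeprecatedAlias)
print(parser.parse_args(["--endpoint-type", "openai-chat"]).backend)
# prints "openai-chat" after emitting a warning
```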
--- docs/contributing/benchmarks.md | 3 -- tests/benchmarks/test_serve_cli.py | 2 +- vllm/benchmarks/datasets.py | 8 ++--- vllm/benchmarks/serve.py | 49 +++++++++++++++++++++--------- 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index d04b1d1136a1c..2a03ce1dffd63 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -245,7 +243,6 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 5471d6b8e4a5f..fafbef5f37180 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -68,7 +68,7 @@ def test_bench_serve_chat(server): "5", "--endpoint", "/v1/chat/completions", - "--endpoint-type", + "--backend", "openai-chat", ] result = subprocess.run(command, capture_output=True, text=True) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1cab40802c392..68a937d5750ec 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. - if args.endpoint_type == "openai-chat": + if args.backend == "openai-chat": input_requests = dataset.sample( num_requests=args.num_prompts, input_len=args.sonnet_input_len, @@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: "Please consider contributing if you would " "like to add support for additional dataset formats.") - if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [ + if dataset_class.IS_MULTIMODAL and args.backend not in [ "openai-chat", "openai-audio", ]: @@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: # endpoint-type. raise ValueError( "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' endpoint-type.") + "'openai-audio' backends.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, @@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: try: # Enforce endpoint compatibility for multimodal datasets. - if args.dataset_name == "random-mm" and args.endpoint_type not in [ + if args.dataset_name == "random-mm" and args.backend not in [ "openai-chat"]: raise ValueError( "Multi-modal content (images) is only supported on " diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index d8784340eba15..7382782f11655 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server: On the client side, run: vllm bench serve \ - --endpoint-type <endpoint_type. Default 'openai'> \ - --label <benchmark result label. 
Default using endpoint_type> \ + --backend <backend or endpoint type. Default 'openai'> \ + --label <benchmark result label. Default using backend> \ --model <your_model> \ --dataset-name <dataset_name. Default 'random'> \ --request-rate <request_rate. Default inf> \ @@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None) and (shutil.which("gnuplot") is not None)) +# TODO: Remove this in v0.11.0 +class DeprecatedEndpointTypeAction(argparse.Action): + """Argparse action for the deprecated --endpoint-type flag. + """ + + def __call__(self, _, namespace, values, option_string=None): + warnings.warn( + "'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead or remove this argument if you " + "have already set it.", + stacklevel=1, + ) + setattr(namespace, self.dest, values) + + class TaskType(Enum): GENERATION = "generation" EMBEDDING = "embedding" @@ -470,7 +485,7 @@ async def benchmark( else: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] else: - raise ValueError(f"Unknown endpoint_type: {endpoint_type}") + raise ValueError(f"Unknown backend: {endpoint_type}") # Reuses connections across requests to reduce TLS handshake overhead. connector = aiohttp.TCPConnector( @@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, def add_cli_args(parser: argparse.ArgumentParser): add_dataset_parser(parser) - parser.add_argument( - "--endpoint-type", - type=str, - default="openai", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) parser.add_argument( "--label", type=str, default=None, help="The label (prefix) of the benchmark results. If not specified, " - "the endpoint type will be used as the label.", + "the value of '--backend' will be used as the label.", ) parser.add_argument( "--backend", type=str, - default="vllm", + default="openai", choices=list(ASYNC_REQUEST_FUNCS.keys()), + help="The type of backend or endpoint to use for the benchmark." + ) + parser.add_argument( + "--endpoint-type", + type=str, + default=None, + choices=list(ASYNC_REQUEST_FUNCS.keys()), + action=DeprecatedEndpointTypeAction, + help="'--endpoint-type' is deprecated and will be removed in v0.11.0. 
" + "Please use '--backend' instead.", ) parser.add_argument( "--base-url", @@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError( "For exponential ramp-up, the start RPS cannot be 0.") - endpoint_type = args.endpoint_type label = args.label model_id = args.model model_name = args.served_model_name @@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: gc.freeze() benchmark_result = await benchmark( - endpoint_type=args.endpoint_type, + endpoint_type=args.backend, api_url=api_url, base_url=base_url, model_id=model_id, @@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") result_json["date"] = current_dt - result_json["endpoint_type"] = args.endpoint_type + result_json["endpoint_type"] = args.backend # for backward compatibility + result_json["backend"] = args.backend result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id @@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") - label = label or endpoint_type + label = label or args.backend if args.ramp_up_strategy is not None: file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa else: From 470484a4f503d4768008c2f5a8dc828dc90633b4 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 18 Sep 2025 20:44:31 +0800 Subject: [PATCH 426/584] [Structured Output][Refactor] Move `apply_grammar_bitmask()` method from `ModelRunner` to structured output utils (#21999) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/v1/structured_output/utils.py | 80 ++++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 75 ++-------------------------- 2 files changed, 84 insertions(+), 71 deletions(-) diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 953185a8fc31d..127c8876525b5 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -8,7 +8,9 @@ import importlib.metadata import os from typing import TYPE_CHECKING +import numpy as np import regex as re +import torch from cachetools import LRUCache from diskcache import Cache @@ -20,9 +22,13 @@ if TYPE_CHECKING: import outlines_core as oc import transformers.file_utils as file_utils import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2 + import xgrammar as xgr from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch else: + xgr = LazyLoader("xgr", globals(), "xgrammar") oc = LazyLoader("oc", globals(), "outlines_core") file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils") tokenization_gpt2 = LazyLoader( @@ -36,6 +42,80 @@ logger = init_logger(__name__) CACHE = None +def apply_grammar_bitmask( + scheduler_output: SchedulerOutput, + input_batch: InputBatch, + logits: torch.Tensor, + device: torch.device, +) -> None: + """ + Apply grammar bitmask to output logits of the model with xgrammar function. + + Args: + scheduler_output (SchedulerOutput): The result of engine scheduling. + input_batch (InputBatch): The input of model runner. 
+ logits (torch.Tensor): The output logits of the model forward pass. + device (torch.device): The device that the model runner is running on. + """ + grammar_bitmask = scheduler_output.grammar_bitmask + if grammar_bitmask is None: + return + + # We receive the structured output bitmask from the scheduler, + # compacted to contain bitmasks only for structured output requests. + # The order of the requests in the bitmask is not guaranteed to be the + # same as the order of the requests in the gpu runner's batch. We need + # to sort the bitmask to match the order of the requests used here. + + # Get the batch indices of the structured output requests. + # Keep track of the number of speculative tokens scheduled for every + # request in the batch, as the logit indices are offset by this amount. + struct_out_req_batch_indices: dict[str, int] = {} + cumulative_offset = 0 + seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1]) + for req_id, batch_index in seq: + logit_index = batch_index + cumulative_offset + cumulative_offset += len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + if req_id in scheduler_output.structured_output_request_ids: + struct_out_req_batch_indices[req_id] = logit_index + + out_indices = [] + + # Reorder the bitmask to match the order of the requests in the batch. + sorted_bitmask = np.full(shape=(logits.shape[0], grammar_bitmask.shape[1]), + fill_value=-1, + dtype=grammar_bitmask.dtype) + cumulative_index = 0 + seq = sorted(scheduler_output.structured_output_request_ids.items(), + key=lambda x: x[1]) + for req_id, _ in seq: + logit_index = struct_out_req_batch_indices[req_id] + num_spec_tokens = len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) + cumulative_index += 1 + num_spec_tokens + grammar_bitmask = sorted_bitmask + + # If the length of out indices and the logits have the same shape + # we don't need to pass indices to the kernel, + # since the bitmask is already aligned with the logits. + skip_out_indices = len(out_indices) == logits.shape[0] + + # Serialization of np.ndarray is much more efficient than a tensor, + # so we receive it in that format.
+ grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() + + xgr.apply_token_bitmask_inplace( + logits, + grammar_bitmask.to(device, non_blocking=True), + indices=out_indices if not skip_out_indices else None, + ) + + class OutlinesVocabulary: """ Wrapper class for `outlines_core.Vocabulary`, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2e67984cb4327..4873b586724ec 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -54,7 +54,7 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, + GiB_bytes, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder @@ -85,6 +85,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper @@ -101,12 +102,8 @@ from .utils import (AttentionGroup, MultiModalBudget, scatter_mm_placeholders) if TYPE_CHECKING: - import xgrammar as xgr - from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput -else: - xgr = LazyLoader("xgr", globals(), "xgrammar") logger = init_logger(__name__) @@ -1617,71 +1614,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return tuple(tasks) - def apply_grammar_bitmask( - self, - scheduler_output: "SchedulerOutput", - logits: torch.Tensor, - ): - grammar_bitmask = scheduler_output.grammar_bitmask - if grammar_bitmask is None: - return - - # We receive the structured output bitmask from the scheduler, - # compacted to contain bitmasks only for structured output requests. - # The order of the requests in the bitmask is not guaranteed to be the - # same as the order of the requests in the gpu runner's batch. We need - # to sort the bitmask to match the order of the requests used here. - - # Get the batch indices of the structured output requests. - # Keep track of the number of speculative tokens scheduled for every - # request in the batch, as the logit indices are offset by this amount. - struct_out_req_batch_indices: dict[str, int] = {} - cumulative_offset = 0 - seq = sorted(self.input_batch.req_id_to_index.items(), - key=lambda x: x[1]) - for req_id, batch_index in seq: - logit_index = batch_index + cumulative_offset - cumulative_offset += len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - if req_id in scheduler_output.structured_output_request_ids: - struct_out_req_batch_indices[req_id] = logit_index - - out_indices = [] - - # Reorder the bitmask to match the order of the requests in the batch. 
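A standalone sketch of the reordering step this function performs, with hypothetical request ids and sizes; rows left at -1 (all bits set in an int32 mask) permit every token:

```python
import numpy as np

num_logit_rows = 4
grammar_bitmask = np.array([[0b1010], [0b0110]],
                           dtype=np.int32)          # scheduler order
logit_index_of = {"req-a": 3, "req-b": 0}           # runner batch order
scheduler_order = ["req-a", "req-b"]

sorted_bitmask = np.full((num_logit_rows, grammar_bitmask.shape[1]),
                         -1, dtype=grammar_bitmask.dtype)
for row, req_id in enumerate(scheduler_order):
    sorted_bitmask[logit_index_of[req_id]] = grammar_bitmask[row]
print(sorted_bitmask.ravel())  # [ 6 -1 -1 10]
```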
- sorted_bitmask = np.full(shape=(logits.shape[0], - grammar_bitmask.shape[1]), - fill_value=-1, - dtype=grammar_bitmask.dtype) - cumulative_index = 0 - seq = sorted(scheduler_output.structured_output_request_ids.items(), - key=lambda x: x[1]) - for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) - cumulative_index += 1 + num_spec_tokens - grammar_bitmask = sorted_bitmask - - # If the length of out indices and the logits have the same shape - # we don't need to pass indices to the kernel, - # since the bitmask is already aligned with the logits. - skip_out_indices = len(out_indices) == logits.shape[0] - - # Serialization of np.ndarray is much more efficient than a tensor, - # so we receive it in that format. - grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() - - xgr.apply_token_bitmask_inplace( - logits, - grammar_bitmask.to(self.device, non_blocking=True), - indices=out_indices if not skip_out_indices else None, - ) - def sync_and_slice_intermediate_tensors( self, num_tokens: int, intermediate_tensors: IntermediateTensors, sync_self: bool) -> IntermediateTensors: @@ -2232,7 +2164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: - self.apply_grammar_bitmask(scheduler_output, logits) + apply_grammar_bitmask(scheduler_output, self.input_batch, + logits, self.device) with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) From fbd6523ac00082c398dc8126434cede595169609 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Thu, 18 Sep 2025 08:53:45 -0400 Subject: [PATCH 427/584] Refactor dense FP8 tensor/channel/block utils and add CT FP8 block (#21404) --- vllm/model_executor/layers/linear.py | 14 +- .../compressed_tensors/compressed_tensors.py | 68 ++--- .../schemes/compressed_tensors_w8a8_fp8.py | 191 ++++++------- .../model_executor/layers/quantization/fp8.py | 265 ++++++------------ .../layers/quantization/utils/fp8_utils.py | 220 +++++++++++++++ 5 files changed, 441 insertions(+), 317 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd05136520977..5bf96398bc710 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -805,12 +805,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) if isinstance(param, BlockQuantScaleParameter): - from vllm.model_executor.layers.quantization.fp8 import ( - Fp8LinearMethod, Fp8MoEMethod) assert self.quant_method is not None - assert isinstance(self.quant_method, - (Fp8LinearMethod, Fp8MoEMethod)) - weight_block_size = self.quant_method.quant_config.weight_block_size + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size assert weight_block_size is not None block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = ( @@ -989,8 +987,10 @@ class QKVParallelLinear(ColumnParallelLinear): # Note(simon): This is needed for Qwen3's fp8 quantization. 
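The ceil-division in the `MergedColumnParallelLinear` hunk above maps row offsets in the fused weight to rows of the block-scale tensor. A quick numeric check with illustrative sizes:

```python
block_n = 128
shard_offset, shard_size = 4096, 4096   # rows in the fused fp8 weight
scale_offset = (shard_offset + block_n - 1) // block_n
scale_size = (shard_size + block_n - 1) // block_n
print(scale_offset, scale_size)         # 32 32: one scale row per block
```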
if isinstance(param, BlockQuantScaleParameter): assert self.quant_method is not None - assert hasattr(self.quant_method, "quant_config") - weight_block_size = self.quant_method.quant_config.weight_block_size + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size + assert weight_block_size is not None block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b56a691311774..d6550dd16892f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -12,7 +12,6 @@ from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) from compressed_tensors.transform import TransformConfig -from pydantic import BaseModel import vllm.envs as envs from vllm.logger import init_logger @@ -268,7 +267,8 @@ class CompressedTensorsConfig(QuantizationConfig): else: return False - def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel): + def _is_fp4a4_nvfp4(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs): if weight_quant is None or input_quant is None: return False @@ -288,8 +288,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_tensor_group_quant and is_float_type and is_4_bits and is_group_size_16 and is_symmetric) - def _is_fp4a16_nvfp4(self, weight_quant: BaseModel, - input_quant: BaseModel): + def _is_fp4a16_nvfp4(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs): is_weight_only = weight_quant is not None and input_quant is None is_tensor_group_quant = ( @@ -303,8 +303,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_only and is_tensor_group_quant and is_float_type and is_4_bits and is_group_size_16 and is_symmetric) - def _is_static_tensor_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( weight_quant.strategy == QuantizationStrategy.TENSOR.value @@ -317,8 +317,8 @@ class CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_8_bits and is_tensor and weight_quant.symmetric and is_static - def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( weight_quant.strategy == QuantizationStrategy.TENSOR.value @@ -331,8 +331,8 @@ class CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. 
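For readers unfamiliar with the `QuantizationArgs` objects these predicates now take, a hedged construction example (requires the compressed-tensors package; field names as used throughout this diff):

```python
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy,
                                             QuantizationType)

# Channel-wise symmetric fp8 weight scheme, static (non-dynamic) scales.
w8_channel = QuantizationArgs(num_bits=8,
                              type=QuantizationType.FLOAT,
                              strategy=QuantizationStrategy.CHANNEL,
                              symmetric=True,
                              dynamic=False)
print(w8_channel.strategy)
```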
return is_8_bits and is_token and weight_quant.symmetric and is_dynamic - def _is_dynamic_token_w4a8_int(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_dynamic_token_w4a8_int(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_weight_4_bits = weight_quant.num_bits == 4 is_activation_8_bits = input_quant.num_bits == 8 weight_strategy = ( @@ -347,8 +347,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_4_bits and is_activation_8_bits and is_token and weight_quant.symmetric and is_dynamic) - def _is_fp8_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: # Confirm weights and activations quantized. if weight_quant is None or input_quant is None: return False @@ -358,11 +358,12 @@ class CompressedTensorsConfig(QuantizationConfig): and input_quant.type == QuantizationType.FLOAT) is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic - is_per_tensor_or_channel_weight = (weight_quant.strategy in [ - QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL, + QuantizationStrategy.BLOCK ]) if not (is_floating_point and is_symmetric_weight and is_static_weight - and is_per_tensor_or_channel_weight): + and is_tensor_or_channel_or_block_weight): return False # Dynamic quantization is always supported if weights supported. @@ -375,8 +376,8 @@ class CompressedTensorsConfig(QuantizationConfig): input_quant.strategy == QuantizationStrategy.TENSOR) return is_symmetric_activation and is_per_tensor_activation - def _is_fp8_w4a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w4a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: if not weight_quant or not input_quant: return False is_weight_4_bits = weight_quant.num_bits == 4 @@ -392,24 +393,24 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_4_bits and is_activation_8_bits and is_token and is_symmetric and is_dynamic) - def _is_fp8_w4a8_sm90(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w4a8_sm90(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported(90, error=False, match_exact=True) and self._is_fp8_w4a8(weight_quant, input_quant)) - def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8_sm90(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported(90, error=False, match_exact=True) and self._is_fp8_w8a8(weight_quant, input_quant)) - def _is_fp8_w8a8_sm100(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8_sm100(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported( 100, error=False, match_exact=True) and self._is_fp8_w8a8(weight_quant, input_quant)) - def _is_fp8_w8a16(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a16(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: # Confirm weights quantized. if weight_quant is None: return False @@ -421,18 +422,19 @@ class CompressedTensorsConfig(QuantizationConfig): # Confirm weight scheme is supported. 
is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic - is_per_tensor_or_channel_weight = (weight_quant.strategy in [ - QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL, + QuantizationStrategy.BLOCK ]) if not (is_symmetric_weight and is_static_weight # noqa: SIM103 - and is_per_tensor_or_channel_weight): + and is_tensor_or_channel_or_block_weight): return False # All conditions satisfied. return True - def _is_wNa16_group_channel(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_wNa16_group_channel(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: input_quant_none = input_quant is None is_channel_group = ( weight_quant.strategy == QuantizationStrategy.CHANNEL.value @@ -443,8 +445,8 @@ class CompressedTensorsConfig(QuantizationConfig): def _get_scheme_from_parts( self, - weight_quant: BaseModel, - input_quant: BaseModel, + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, format: Optional[str] = None) -> "CompressedTensorsScheme": # use the per-layer format if defined, otherwise, use global format @@ -496,7 +498,7 @@ class CompressedTensorsConfig(QuantizationConfig): CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( - strategy=weight_quant.strategy, + weight_quant=weight_quant, is_static_input_scheme=(input_quant and not input_quant.dynamic)) else: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index d984e89d9e02a..d42ae22c51393 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -4,28 +4,41 @@ from typing import Callable, Optional import torch -from compressed_tensors.quantization import QuantizationStrategy +from compressed_tensors.quantization import (QuantizationArgs, + QuantizationStrategy) from torch.nn import Parameter from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_fp8_block_linear, check_aiter_fp8_linear_support, + create_fp8_input_scale, create_fp8_scale_parameter, + create_fp8_weight_parameter, maybe_post_process_fp8_weight_block, + process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy, + process_fp8_weight_tensor_strategy, validate_fp8_block_shape) from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - Fp8LinearOp, maybe_create_device_identity, normalize_e4m3fn_to_e4m3fnuz, - requantize_with_max_scale) -from vllm.model_executor.parameter import (ChannelQuantScaleParameter, - ModelWeightParameter, + Fp8LinearOp, cutlass_block_fp8_supported, maybe_create_device_identity) +from vllm.model_executor.parameter import (BlockQuantScaleParameter, + ChannelQuantScaleParameter, PerTensorScaleParameter) -from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] +strategy_to_parameter_type = { + QuantizationStrategy.BLOCK: BlockQuantScaleParameter, + QuantizationStrategy.CHANNEL: 
ChannelQuantScaleParameter, + QuantizationStrategy.TENSOR: PerTensorScaleParameter, +} + class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): - def __init__(self, strategy: str, is_static_input_scheme: bool): - self.strategy = strategy + def __init__(self, weight_quant: QuantizationArgs, + is_static_input_scheme: bool): + self.weight_quant = weight_quant + self.strategy = weight_quant.strategy self.out_dtype = torch.get_default_dtype() self.is_static_input_scheme = is_static_input_scheme self.act_q_group_shape = GroupShape.PER_TENSOR \ @@ -34,61 +47,84 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): act_quant_static=self.is_static_input_scheme, act_quant_group_shape=self.act_q_group_shape) + self.weight_block_size = self.weight_quant.block_structure + self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() + self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() + @classmethod def get_min_capability(cls) -> int: # lovelace and up return 89 + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + weight_loader: Callable, **kwargs): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.weight_block_size = None + + if self.strategy == QuantizationStrategy.BLOCK: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + # Validate block quantization shapes + validate_fp8_block_shape(layer, input_size, output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size) + + # WEIGHT + weight = create_fp8_weight_parameter(output_size_per_partition, + input_size_per_partition, + weight_loader) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = create_fp8_scale_parameter( + strategy_to_parameter_type[self.strategy], output_partition_sizes, + input_size_per_partition, layer.weight_block_size, weight_loader) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = create_fp8_input_scale(output_partition_sizes, + weight_loader) + layer.register_parameter("input_scale", input_scale) + def process_weights_after_loading(self, layer) -> None: - # If per tensor, when we have a fused module (e.g. QKV) with per - # tensor scales (thus N scales being passed to the kernel), - # requantize so we can always run per tensor if self.strategy == QuantizationStrategy.TENSOR: - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + layer.weight, layer.weight_scale, layer.logical_widths, + getattr(layer, 'input_scale', None))) + weight = weight.t() - if current_platform.is_fp8_fnuz(): - input_scale = getattr(layer, 'input_scale', None) - - weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=max_w_scale, - input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) - - layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) - - # If channelwise, scales are already lined up, so just transpose. 
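The per-tensor branch relies on requantizing fused shards to one max scale. A pure-torch numeric sketch of that idea (integer values stand in for fp8 data; shapes and scales are made up):

```python
import torch

shard_scales = torch.tensor([0.01, 0.04])      # one scale per logical shard
dequant = [torch.randint(-448, 448, (4, 8)).float() * s
           for s in shard_scales]              # dequantize each shard
max_scale = shard_scales.max()
# Requantize everything against the single largest scale, then fuse.
fused_q = torch.cat([torch.round(d / max_scale) for d in dequant])
print(fused_q.shape, float(max_scale))
```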
elif self.strategy == QuantizationStrategy.CHANNEL: - weight = layer.weight + weight, weight_scale, input_scale = ( + process_fp8_weight_channel_strategy( + layer.weight, layer.weight_scale, + getattr(layer, 'input_scale', None))) + weight = weight.t() - if current_platform.is_fp8_fnuz(): - input_scale = getattr(layer, 'input_scale', None) - - weight, weight_scale, input_scale = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=layer.weight_scale, - input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) - else: - weight_scale = layer.weight_scale.data - - layer.weight = Parameter(weight.t(), requires_grad=False) - # required by torch.compile to be torch.nn.Parameter - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + elif self.strategy == QuantizationStrategy.BLOCK: + assert self.is_static_input_scheme is False + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale) + input_scale = None else: raise ValueError(f"Unknown quantization strategy {self.strategy}") + # required by torch.compile to be torch.nn.Parameter + layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + if input_scale is not None: + layer.input_scale = Parameter(input_scale.data, + requires_grad=False) + # INPUT SCALE if self.is_static_input_scheme and hasattr(layer, 'input_scale'): layer.input_scale = Parameter(layer.input_scale.max(), @@ -96,58 +132,23 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): else: layer.input_scale = None - def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - maybe_create_device_identity() - - output_size_per_partition = sum(output_partition_sizes) - layer.logical_widths = output_partition_sizes - - # WEIGHT - weight = ModelWeightParameter(data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) - layer.register_parameter("weight", weight) - - # WEIGHT SCALE - # TODO: update create_xxx_parameter functions to return - # the newly added parameters - if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((sum(output_partition_sizes), 1), - dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader) - else: - assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - - # min requirement for fp8 kernels - weight_scale[:] = torch.finfo(torch.float32).min - layer.register_parameter("weight_scale", weight_scale) - - # INPUT SCALE - if self.is_static_input_scheme: - input_scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - input_scale[:] = torch.finfo(torch.float32).min - layer.register_parameter("input_scale", input_scale) + if self.strategy == QuantizationStrategy.BLOCK: + maybe_post_process_fp8_weight_block( + layer, self.cutlass_block_fp8_supported) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + if layer.weight_block_size is not None: + return apply_fp8_block_linear( + layer, + 
input=x, + bias=bias, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported) + return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e75094c54743c..aec9c79f1ea82 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch -import torch.nn.functional as F from torch.nn import Module from torch.nn.parameter import Parameter @@ -32,8 +31,12 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace, - should_use_deepgemm_for_fp8_linear) + apply_fp8_block_linear, check_aiter_fp8_linear_support, + create_fp8_input_scale, create_fp8_scale_parameter, + create_fp8_weight_parameter, get_col_major_tma_aligned_tensor, + maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy, + process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace, + validate_fp8_block_shape) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin, prepare_moe_fp8_layer_for_marlin) @@ -42,8 +45,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, all_close_1d, cutlass_block_fp8_supported, cutlass_fp8_supported, maybe_create_device_identity, - normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize, - requantize_with_max_scale) + normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.parameter import (BlockQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) @@ -233,14 +235,10 @@ class Fp8LinearMethod(LinearMethodBase): if current_platform.is_rocm(): self.use_marlin = False - # AITER is only supported on ROCm and only for FP8_FNUZ - # and at the moment are MI300 series - self.use_aiter_and_is_supported = (current_platform.is_rocm() - and envs.VLLM_ROCM_USE_AITER - and envs.VLLM_ROCM_USE_AITER_LINEAR - and current_platform.is_fp8_fnuz()) + self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() - self.block_quant = self.quant_config.weight_block_size is not None + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant = self.weight_block_size is not None self.act_q_static = self.quant_config.activation_scheme == "static" # Use per-token quantization for better perf if dynamic and cutlass if not self.act_q_static and cutlass_fp8_supported(): @@ -273,51 +271,27 @@ class Fp8LinearMethod(LinearMethodBase): layer.weight_block_size = None if self.block_quant: - tp_size = getattr(layer, "tp_size", - get_tensor_model_parallel_world_size()) - assert self.quant_config.weight_block_size is not None - layer.weight_block_size = self.quant_config.weight_block_size - block_n, block_k = ( - self.quant_config.weight_block_size[0], - self.quant_config.weight_block_size[1], - ) - # Required by row parallel - if (tp_size > 1 - and input_size // input_size_per_partition == tp_size - and input_size_per_partition % block_k != 0): - raise ValueError( - 
f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"weight quantization block_k = {block_k}.") - # Required by column parallel or enabling merged weights - is_tp_split = (tp_size > 1 and - output_size // output_size_per_partition == tp_size) - is_merged_gemm = len(output_partition_sizes) > 1 - if is_tp_split or is_merged_gemm: - sizes_to_check = output_partition_sizes - if not is_tp_split and is_merged_gemm: - # In case of merged matrices, we allow the last - # matrix to not be a multiple of block size - sizes_to_check = output_partition_sizes[:-1] - for output_partition_size in sizes_to_check: - if output_partition_size % block_n != 0: - raise ValueError( - f"Weight output_partition_size = " - f"{output_partition_size} is not divisible by " - f"weight quantization block_n = {block_n}.") + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + validate_fp8_block_shape(layer, input_size, output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size) # WEIGHT - weight_dtype = (torch.float8_e4m3fn - if self.quant_config.is_checkpoint_fp8_serialized else - params_dtype) - - weight = ModelWeightParameter(data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=weight_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + if self.quant_config.is_checkpoint_fp8_serialized: + weight = create_fp8_weight_parameter(output_size_per_partition, + input_size_per_partition, + weight_loader) + else: + # For non-serialized checkpoints, use original dtype + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) # If checkpoint is serialized fp8, load them. 
@@ -325,154 +299,87 @@ class Fp8LinearMethod(LinearMethodBase): if self.quant_config.is_checkpoint_fp8_serialized: # WEIGHT SCALE if not self.block_quant: - scale = PerTensorScaleParameter( - data=torch.empty(len(output_partition_sizes), - dtype=torch.float32), - weight_loader=weight_loader, - ) - scale[:] = torch.finfo(torch.float32).min + scale = create_fp8_scale_parameter(PerTensorScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, weight_loader) set_weight_attrs(scale, {"scale_type": "weight_scale"}) layer.register_parameter("weight_scale", scale) else: - assert self.quant_config.activation_scheme == "dynamic" - scale = BlockQuantScaleParameter( - data=torch.empty( - (output_size_per_partition + block_n - 1) // block_n, - (input_size_per_partition + block_k - 1) // block_k, - dtype=torch.float32, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - scale[:] = torch.finfo(torch.float32).min + assert not self.act_q_static + assert self.weight_block_size is not None + scale = create_fp8_scale_parameter(BlockQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + self.weight_block_size, + weight_loader) set_weight_attrs(scale, {"scale_type": "weight_scale"}) # The weight_scale_inv name is intentional for deepseekv3 layer.register_parameter("weight_scale_inv", scale) # INPUT ACTIVATION SCALE - if self.quant_config.activation_scheme == "static": - scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - - scale[:] = torch.finfo(torch.float32).min + if self.act_q_static: + scale = create_fp8_input_scale(output_partition_sizes, + weight_loader) set_weight_attrs(scale, {"scale_type": "input_scale"}) layer.register_parameter("input_scale", scale) else: layer.register_parameter("input_scale", None) - def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor: - # Pad the weight tensor. This is an optimization on ROCm platform, which - # can benefit from tensors located far enough from one another in memory - if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm() - and weight.stride(-1) == 1 - and (weight.stride(-2) * weight.element_size()) % 512 == 0): - num_pad = 256 // weight.element_size() - weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad] - torch.cuda.empty_cache() - return weight - def process_weights_after_loading(self, layer: Module) -> None: size_k_first = True + input_scale = None # TODO(rob): refactor block quant into separate class. if self.block_quant: - assert self.quant_config.activation_scheme == "dynamic" + assert not self.act_q_static size_k_first = False - if current_platform.is_fp8_fnuz(): - weight, weight_scale_inv, _ = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=layer.weight, - weight_scale=layer.weight_scale_inv) - else: - weight = layer.weight.data - weight_scale_inv = layer.weight_scale_inv.data - weight = self._maybe_pad_weight(weight) - - # Torch.compile cannot use Parameter subclasses. - layer.weight = Parameter(weight, requires_grad=False) - layer.weight_scale_inv = Parameter(weight_scale_inv, - requires_grad=False) + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale_inv) + # Delete the weight_scale_inv parameter to avoid confusion + # with the weight_scale parameter + del layer.weight_scale_inv # If checkpoint not serialized fp8, quantize the weights. 
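The branch below quantizes bf16/fp16 checkpoints to fp8 at load time. A pure-torch stand-in for the per-tensor scale computation (a sketch of the idea only, not vLLM's `scaled_fp8_quant` kernel):

```python
import torch

w = torch.randn(64, 64, dtype=torch.bfloat16)
finfo = torch.finfo(torch.float8_e4m3fn)
scale = w.abs().max().float() / finfo.max          # per-tensor scale
qweight = (w.float() / scale).clamp(finfo.min, finfo.max) \
    .to(torch.float8_e4m3fn)
print(qweight.dtype, float(scale))
```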
elif not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + weight = qweight.t() - # Update the layer with the new values. - layer.weight = Parameter(qweight.t(), requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) - # layer.input_scale is None indicates dynamic quant and scale is - # computed from input. - layer.input_scale = None - - # If checkpoint is fp8, handle that there are N scales for N + # If checkpoint is fp8 per-tensor, handle that there are N scales for N # shards in a fused module else: - layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, - requires_grad=False) - if self.quant_config.activation_scheme == "static": - layer.input_scale = torch.nn.Parameter(layer.input_scale.data, - requires_grad=False) - weight = layer.weight weight_scale = layer.weight_scale # If using w8a8, torch._scaled_mm needs per tensor, so # requantize the logical shards as a single weight. if not self.use_marlin: - # Dequant -> Quant with max scale so we can run per tensor. - if current_platform.is_fp8_fnuz(): - weight, weight_scale, input_scale = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=weight_scale, - input_scale=layer.input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + weight, weight_scale, layer.logical_widths, + getattr(layer, 'input_scale', None))) + if self.act_q_static: + assert input_scale is not None + input_scale = input_scale.max() + weight = weight.t() - weight_scale, weight = requantize_with_max_scale( - weight=weight, - weight_scale=weight_scale, - logical_widths=layer.logical_widths, - ) - - weight = self._maybe_pad_weight(weight) - # Update layer with new values. - layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) - if self.quant_config.activation_scheme == "static": - layer.input_scale = Parameter(layer.input_scale.max(), - requires_grad=False) + # Update layer with new values. + layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + layer.input_scale = Parameter( + input_scale, + requires_grad=False) if input_scale is not None else None if self.use_marlin: prepare_fp8_layer_for_marlin(layer, size_k_first) # Activations not quantized for marlin. del layer.input_scale + return - # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to - # requantize the weight and input to the specific scale - # at the same time. 
- if is_deep_gemm_e8m0_used() and self.block_quant: - assert layer.weight_block_size is not None - block_sz = tuple(layer.weight_block_size) - requant_weight_ue8m0_inplace( - layer.weight.data, - layer.weight_scale_inv.data if hasattr( - layer, "weight_scale_inv") else layer.weight_scale.data, - block_sz, - ) - - # SM90 Block FP8 CUTLASS requires row-major weight scales - if (self.block_quant and current_platform.is_device_capability(90) - and self.cutlass_block_fp8_supported - and not should_use_deepgemm_for_fp8_linear( - torch.bfloat16, layer.weight)): - layer.weight_scale_inv = Parameter( - layer.weight_scale_inv.data.T.contiguous(), - requires_grad=False) + if self.block_quant: + maybe_post_process_fp8_weight_block( + layer, self.cutlass_block_fp8_supported) def apply(self, layer: torch.nn.Module, @@ -490,18 +397,12 @@ class Fp8LinearMethod(LinearMethodBase): bias=bias) if self.block_quant: - assert self.quant_config.weight_block_size is not None - - return torch.ops.vllm.apply_w8a8_block_fp8_linear( + return apply_fp8_block_linear( + layer, input=x, - weight=layer.weight, - block_size=self.quant_config.weight_block_size, - weight_scale=layer.weight_scale_inv, - input_scale=layer.input_scale, bias=bias, cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, - use_aiter_and_is_supported=self.use_aiter_and_is_supported, - ) + use_aiter_and_is_supported=self.use_aiter_and_is_supported) return self.fp8_linear.apply(input=x, weight=layer.weight, @@ -528,7 +429,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): super().__init__(layer.moe_config) self.layer = layer self.quant_config = quant_config - self.block_quant = self.quant_config.weight_block_size is not None + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant = self.weight_block_size is not None self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None self.fused_experts: Optional[ @@ -590,12 +492,12 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.quant_config.is_checkpoint_fp8_serialized: params_dtype = torch.float8_e4m3fn if self.block_quant: - assert self.quant_config.weight_block_size is not None - layer.weight_block_size = self.quant_config.weight_block_size + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size tp_size = get_tensor_model_parallel_world_size() block_n, block_k = ( - self.quant_config.weight_block_size[0], - self.quant_config.weight_block_size[1], + self.weight_block_size[0], + self.weight_block_size[1], ) # NOTE: To ensure proper alignment of the block-wise quantization # scales, the output_size of the weights for both the gate and up @@ -952,7 +854,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): "BatchedTritonOrDeepGemmExperts(%s): " "max_tokens_per_rank=%s, block_size=%s, per_act_token=%s", self.__class__.__name__, max_num_tokens_per_rank, - self.quant_config.weight_block_size, False) + self.weight_block_size, False) return BatchedTritonOrDeepGemmExperts( max_num_tokens=max_num_tokens_per_rank, num_dispatchers=prepare_finalize.num_dispatchers(), @@ -969,8 +871,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): else: logger.debug( "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s", - self.__class__.__name__, self.quant_config.weight_block_size, - False) + self.__class__.__name__, self.weight_block_size, False) return TritonOrDeepGemmExperts( quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, @@ -988,7 +889,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.block_quant else layer.w2_weight_scale), 
a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, + block_shape=self.weight_block_size, ) def apply( @@ -1046,7 +947,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): intermediate_size=layer.intermediate_size_per_partition, expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, - block_shape=self.quant_config.weight_block_size, + block_shape=self.weight_block_size, routed_scaling=routed_scaling_factor, ) else: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index bbe0c6f6d38ec..fc12483de0c0e 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,6 +17,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( group_broadcast) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED) +from vllm.model_executor.parameter import (BlockQuantScaleParameter, + ChannelQuantScaleParameter, + PerTensorScaleParameter) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op @@ -794,3 +797,220 @@ def requant_weight_ue8m0_inplace( # Write back the results in-place. w_q.copy_(w_requant) s_old.copy_(s_requant) + + +def check_aiter_fp8_linear_support() -> bool: + """AITER is only supported on ROCm and only for FP8_FNUZ + and at the moment are MI300 series""" + return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER + and envs.VLLM_ROCM_USE_AITER_LINEAR + and current_platform.is_fp8_fnuz()) + + +def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor: + """Pad the weight tensor. 
This is an optimization on ROCm platform, which + can benefit from tensors located far enough from one another in memory""" + if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm() + and weight.stride(-1) == 1 + and (weight.stride(-2) * weight.element_size()) % 512 == 0): + num_pad = 256 // weight.element_size() + import torch.nn.functional as F + weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad] + torch.cuda.empty_cache() + return weight + + +def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int, + output_size: int, input_size_per_partition: int, + output_partition_sizes: list[int], + block_size: list[int]) -> None: + """Validate block quantization shapes for tensor parallelism.""" + from vllm.distributed import get_tensor_model_parallel_world_size + + tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size()) + block_n, block_k = block_size[0], block_size[1] + + # Required by row parallel + if (tp_size > 1 and input_size // input_size_per_partition == tp_size + and input_size_per_partition % block_k != 0): + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition} " + f"is not divisible by weight quantization block_k = {block_k}.") + + # Required by column parallel or enabling merged weights + is_tp_split = (tp_size > 1 + and output_size // sum(output_partition_sizes) == tp_size) + is_merged_gemm = len(output_partition_sizes) > 1 + if is_tp_split or is_merged_gemm: + sizes_to_check = output_partition_sizes + if not is_tp_split and is_merged_gemm: + # In case of merged matrices, we allow the last + # matrix to not be a multiple of block size + sizes_to_check = output_partition_sizes[:-1] + for output_partition_size in sizes_to_check: + if output_partition_size % block_n != 0: + raise ValueError( + f"Weight output_partition_size = " + f"{output_partition_size} is not divisible by " + f"weight quantization block_n = {block_n}.") + + +def create_fp8_weight_parameter( + output_size_per_partition: int, input_size_per_partition: int, + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create FP8 weight parameter.""" + from vllm.model_executor.parameter import ModelWeightParameter + + return ModelWeightParameter(data=torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + +def create_fp8_scale_parameter( + parameter_type: torch.nn.Parameter, output_partition_sizes: list[int], + input_size_per_partition: int, block_size: Optional[list[int]], + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create scale parameter based on quantization strategy.""" + if parameter_type == ChannelQuantScaleParameter: + scale = parameter_type(data=torch.empty( + (sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + elif parameter_type == BlockQuantScaleParameter: + assert block_size is not None + block_n, block_k = block_size[0], block_size[1] + output_size_per_partition = sum(output_partition_sizes) + scale = parameter_type( + data=torch.empty( + (output_size_per_partition + block_n - 1) // block_n, + (input_size_per_partition + block_k - 1) // block_k, + dtype=torch.float32, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + elif parameter_type == PerTensorScaleParameter: + scale = parameter_type(data=torch.empty(len(output_partition_sizes), + dtype=torch.float32), + weight_loader=weight_loader) + else: + raise 
ValueError(f"Unknown parameter type: {parameter_type}") + + scale[:] = torch.finfo(torch.float32).min + return scale + + +def create_fp8_input_scale( + output_partition_sizes: list[int], + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create input scale parameter for static activation quantization.""" + from vllm.model_executor.parameter import PerTensorScaleParameter + + scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + scale[:] = torch.finfo(torch.float32).min + return scale + + +def process_fp8_weight_tensor_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, + logical_widths: list[int], + input_scale: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Process weights for tensor-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz, requantize_with_max_scale) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale, input_scale=input_scale) + + # Requantize with max scale + weight_scale, weight = requantize_with_max_scale( + weight=weight, + weight_scale=weight_scale, + logical_widths=logical_widths, + ) + + weight = _maybe_pad_fp8_weight(weight) + return weight, weight_scale, input_scale + + +def process_fp8_weight_channel_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Process weights for channel-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale, input_scale=input_scale) + + return weight, weight_scale, input_scale + + +def process_fp8_weight_block_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """Process weights for block-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale) + + weight = _maybe_pad_fp8_weight(weight) + return weight, weight_scale + + +def maybe_post_process_fp8_weight_block(layer: torch.nn.Module, + cutlass_block_fp8_supported: bool): + assert layer.weight_block_size is not None + + from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, + should_use_deepgemm_for_fp8_linear) + + # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to + # requantize the weight and input to the specific scale + # at the same time. 
+ if is_deep_gemm_e8m0_used(): + block_sz = tuple(layer.weight_block_size) + requant_weight_ue8m0_inplace(layer.weight.data, + layer.weight_scale.data, block_sz) + # SM90 Block FP8 CUTLASS requires row-major weight scales + elif (current_platform.is_device_capability(90) + and cutlass_block_fp8_supported + and not should_use_deepgemm_for_fp8_linear(torch.bfloat16, + layer.weight)): + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data.T.contiguous(), requires_grad=False) + + +def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor, + bias: Optional[torch.Tensor], + cutlass_block_fp8_supported: bool, + use_aiter_and_is_supported: bool) -> torch.Tensor: + """Apply block-wise FP8 linear operation.""" + assert layer.weight_block_size is not None + + return torch.ops.vllm.apply_w8a8_block_fp8_linear( + input=input, + weight=layer.weight, + block_size=layer.weight_block_size, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_block_fp8_supported=cutlass_block_fp8_supported, + use_aiter_and_is_supported=use_aiter_and_is_supported, + ) From bc19d7598566ae81b3f69b43cbc2bd34aa5497c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Thu, 18 Sep 2025 15:56:07 +0200 Subject: [PATCH 428/584] [Misc] Add kv-connector label (#25156) Signed-off-by: NickLucche <nlucches@redhat.com> --- .github/mergify.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 94198b1251e09..75ee3e3c55b46 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -302,3 +302,20 @@ pull_request_rules: label: remove: - needs-rebase + +- name: label-kv-connector + description: Automatically apply kv-connector label + conditions: + - or: + - files~=^examples/online_serving/disaggregated[^/]*/.* + - files~=^examples/offline_inference/disaggregated[^/]*/.* + - files~=^examples/others/lmcache/ + - files~=^tests/v1/kv_connector/ + - files~=^vllm/distributed/kv_transfer/ + - title~=(?i)\bP/?D\b + - title~=(?i)NIXL + - title~=(?i)LMCache + actions: + label: + add: + - kv-connector \ No newline at end of file From 01a583fea40571986ffe277549e5bb441d409768 Mon Sep 17 00:00:00 2001 From: jvlunteren <161835099+jvlunteren@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:27:01 +0200 Subject: [PATCH 429/584] [Kernel] Decouple Tile Size from Block Size in Triton Unified Attention Kernel (#21197) Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com> --- .../test_triton_unified_attention.py | 3 - .../attention/ops/triton_unified_attention.py | 122 ++++++++++-------- 2 files changed, 70 insertions(+), 55 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 4b97d51e6ed21..ab91560e995c8 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -102,9 +102,6 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") - if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32: - pytest.skip("block size must be at least 32 for fp8") - current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index d2ad2f7e8d2aa..591b68bfa6468 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ 
b/vllm/attention/ops/triton_unified_attention.py @@ -73,6 +73,7 @@ def kernel_unified_attention_2d( output_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -118,6 +119,7 @@ def kernel_unified_attention_2d( offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos @@ -177,31 +179,32 @@ def kernel_unified_attention_2d( # actual sequence length max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) - # calculate the number of tiles (blocks) that need to be processed to - # cover the longest sequence prefix (due to causal masking, blocks beyond + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond # this prefix can be skipped) - num_blocks = cdiv_fn(max_seq_prefix_len, BLOCK_SIZE) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) # iterate through tiles - for j in range(0, num_blocks): + for j in range(0, num_tiles): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len - physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) - offs_n = tl.arange(0, BLOCK_SIZE) - - v_offset = (physical_block_idx * stride_v_cache_0 + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + kv_head_idx * stride_v_cache_2 + offs_d[None, :] * stride_v_cache_3 + - offs_n[:, None] * stride_v_cache_1) + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) - k_offset = (physical_block_idx * stride_k_cache_0 + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + kv_head_idx * stride_k_cache_2 + offs_d[:, None] * stride_k_cache_3 + - offs_n[None, :] * stride_k_cache_1) + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) - # K : (HEAD_SIZE, BLOCK_SIZE) + # K : (HEAD_SIZE, TILE_SIZE) K_load = tl.load(key_cache_ptr + k_offset, - mask=dim_mask[:, None], + mask=dim_mask[:, None] & tile_mask[None, :], other=0.0) if K_load.dtype.is_fp8(): @@ -212,9 +215,9 @@ def kernel_unified_attention_2d( else: K = K_load - # V : (BLOCK_SIZE, HEAD_SIZE) + # V : (TILE_SIZE, HEAD_SIZE) V_load = tl.load(value_cache_ptr + v_offset, - mask=dim_mask[None, :], + mask=dim_mask[None, :] & tile_mask[:, None], other=0.0) if V_load.dtype.is_fp8(): @@ -225,12 +228,10 @@ def kernel_unified_attention_2d( else: V = V_load - seq_offset = j * BLOCK_SIZE + offs_n - seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_M, BLOCK_SIZE) - S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) @@ -262,11 +263,12 @@ def kernel_unified_attention_2d( # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of # the entire row. 
In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_M, BLOCK_SIZE) + # P : (BLOCK_M, TILE_SIZE) P = tl.exp(S - m_j[:, None]) # l_j : (BLOCK_M,) @@ -327,6 +329,7 @@ def kernel_unified_attention_3d( query_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -374,20 +377,19 @@ def kernel_unified_attention_3d( # number of segments for this particular sequence num_segments = NUM_SEGMENTS_PER_SEQ - blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) - if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len: + if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: return offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) - + offs_t = tl.arange(0, TILE_SIZE) query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos query_offset_1 = kv_head_idx * num_queries_per_kv + \ offs_m % num_queries_per_kv - query_offset = (query_offset_0[:, None] * query_stride_0 + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) @@ -433,30 +435,44 @@ def kernel_unified_attention_3d( qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 ) # shape: [BLOCK_M] - num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) # iterate through tiles within current segment for j in range( - segm_idx * blocks_per_segment, - min((segm_idx + 1) * blocks_per_segment, num_blocks), + segm_idx * tiles_per_segment, + min((segm_idx + 1) * tiles_per_segment, num_tiles), ): - physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len - offs_n = tl.arange(0, BLOCK_SIZE) + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) - v_offset = (physical_block_idx * stride_v_cache_0 + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + kv_head_idx * stride_v_cache_2 + offs_d[None, :] * stride_v_cache_3 + - offs_n[:, None] * stride_v_cache_1) + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) - k_offset = (physical_block_idx * stride_k_cache_0 + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + kv_head_idx * stride_k_cache_2 + offs_d[:, None] * stride_k_cache_3 + - offs_n[None, :] * stride_k_cache_1) + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) - # K : (HEAD_SIZE, BLOCK_SIZE) + # K : (HEAD_SIZE, TILE_SIZE) K_load = tl.load(key_cache_ptr + k_offset, - mask=dim_mask[:, None], + mask=dim_mask[:, None] & tile_mask[None, :], other=0.0) if K_load.dtype.is_fp8(): @@ 
-467,9 +483,9 @@ def kernel_unified_attention_3d( else: K = K_load - # V : (BLOCK_SIZE, HEAD_SIZE) + # V : (TILE_SIZE, HEAD_SIZE) V_load = tl.load(value_cache_ptr + v_offset, - mask=dim_mask[None, :], + mask=dim_mask[None, :] & tile_mask[:, None], other=0.0) if V_load.dtype.is_fp8(): @@ -480,13 +496,10 @@ def kernel_unified_attention_3d( else: V = V_load - seq_offset = j * BLOCK_SIZE + offs_n - seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_M, BLOCK_SIZE) - S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) - + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) if USE_SOFTCAP: @@ -517,11 +530,12 @@ def kernel_unified_attention_3d( # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of # the entire row. In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_M, BLOCK_SIZE,) + # P : (BLOCK_M, TILE_SIZE,) P = tl.exp(S - m_j[:, None]) # l_j : (BLOCK_M,) @@ -573,7 +587,7 @@ def reduce_segments( output_stride_0: tl.int64, # int output_stride_1: tl.int64, # int, should be equal to head_size block_table_stride: tl.int64, # int - BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int HEAD_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 query_start_len_ptr, # [num_seqs+1] @@ -594,10 +608,10 @@ def reduce_segments( # number of segments for this particular sequence num_segments = NUM_SEGMENTS_PER_SEQ - blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) # create masks for subsequent loads - act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, @@ -671,13 +685,10 @@ def unified_attention( # Optional tensor for sinks sinks=None, ): + assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" - block_size = v.shape[1] - assert q.element_size() >= 2 or block_size >= 32, \ - "Block size must be at least 32 for fp8" - if sinks is not None: assert sinks.shape[0] == q.shape[1], \ "Sinks must be num_query_heads size" @@ -707,6 +718,12 @@ def unified_attention( # = floor(q.shape[0] / BLOCK_Q) + num_seqs total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + # Assigning default tile sizes for prefill and decode. + # Note: each tile size must be at least 32 for "fp8" (q.element_size() == 1) + # and at least 16 for all other data types. 
+ TILE_SIZE_PREFILL = 32 + TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32 + # if batch contains a prefill if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128: kernel_unified_attention_2d[( @@ -736,6 +753,7 @@ def unified_attention( output_stride_1=out.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_PREFILL, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -809,6 +827,7 @@ def unified_attention( query_stride_1=q.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -830,7 +849,6 @@ def unified_attention( BLOCK_M=BLOCK_M, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, ) - reduce_segments[(q.shape[0], num_query_heads)]( output_ptr=out, segm_output_ptr=segm_output, @@ -844,7 +862,7 @@ def unified_attention( output_stride_0=out.stride(0), output_stride_1=out.stride(1), block_table_stride=block_table.stride(0), - BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), query_start_len_ptr=cu_seqlens_q, From 072d7e53e534d337b41262dd44ded9b44aa699ef Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:27:49 +0400 Subject: [PATCH 430/584] [PERF] Add `conv1d` metadata to GDN attn (#25105) Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com> --- vllm/model_executor/layers/mamba/mamba2_metadata.py | 8 +++++--- vllm/model_executor/models/qwen3_next.py | 10 +++++++++- vllm/v1/attention/backends/gdn_attn.py | 6 ++++++ vllm/v1/attention/backends/mamba2_attn.py | 4 ++-- vllm/v1/attention/backends/short_conv_attn.py | 4 ++-- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 368bfe3af1d3f..c926e17a2c197 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -11,6 +11,7 @@ from vllm.attention.backends.placeholder_attn import ( PlaceholderAttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.platforms import current_platform +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata from vllm.v1.attention.backends.mamba2_attn import ( Mamba2AttentionMetadata, _query_start_loc_to_chunk_indices_offsets) @@ -45,8 +46,8 @@ class Mamba2Metadata: """ nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: @@ -117,7 +118,8 @@ def prepare_mamba2_metadata( def update_metadata(x: torch.Tensor, query_start_loc: torch.Tensor, mamba2_metadata: Union[Mamba2Metadata, - Mamba2AttentionMetadata]): + Mamba2AttentionMetadata, + GDNAttentionMetadata]): """ this is triggered upon handling a new input at the first layer """ diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index eb060cb90f44c..0c974ee44eee2 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -35,6 +35,7 @@ 
from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba2_metadata import update_metadata from vllm.model_executor.layers.mamba.mamba_mixer2 import ( mamba_v2_sharded_weight_loader) from vllm.model_executor.layers.mamba.mamba_utils import ( @@ -414,6 +415,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] + conv_metadata = attn_metadata assert isinstance(attn_metadata, GDNAttentionMetadata) has_initial_state = attn_metadata.has_initial_state spec_query_start_loc = attn_metadata.spec_query_start_loc @@ -475,10 +477,15 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # 2.2: process the remaining part if attn_metadata.num_prefills > 0: + mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1) + if conv_metadata.cu_seqlen is None: + conv_metadata = update_metadata(mixed_qkv_non_spec_T, + non_spec_query_start_loc, + conv_metadata) # - "cache_indices" updates the conv_state cache in positions # pointed to by "mamba_cache_params.state_indices_tensor" mixed_qkv_non_spec = causal_conv1d_fn( - mixed_qkv_non_spec.transpose(0, 1), + mixed_qkv_non_spec_T, conv_weights, self.conv1d.bias, activation=self.activation, @@ -486,6 +493,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): has_initial_state=has_initial_state, cache_indices=non_spec_state_indices_tensor, query_start_loc=non_spec_query_start_loc, + metadata=conv_metadata, ).transpose(0, 1) elif attn_metadata.num_decodes > 0: mixed_qkv_non_spec = causal_conv1d_update( diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index ba89f93e8b56d..5dadc52d0fb1c 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -50,6 +50,12 @@ class GDNAttentionMetadata: Tensor] = None # shape: [num_prefill_tokens + num_decode_tokens,] num_accepted_tokens: Optional[torch.Tensor] = None # shape: [batch,] + # The following attributes are for triton implementation of causal_conv1d + nums_dict: Optional[dict] = None + cu_seqlen: Optional[int] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None + class GDNAttentionMetadataBuilder( AttentionMetadataBuilder[GDNAttentionMetadata]): diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 359bad1ea9dee..2fe1f14ca1db0 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -132,8 +132,8 @@ class Mamba2AttentionMetadata: # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None class Mamba2AttentionMetadataBuilder( diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index f5ad65b02b4d4..717c40b37ecfb 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -34,8 +34,8 @@ class ShortConvAttentionMetadata: # For causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: 
Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None class ShortConvAttentionMetadataBuilder( From 67244c86f0f1ffc06fcab9cad5e78989695cc15f Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:29:40 +0800 Subject: [PATCH 431/584] feat(api): Return 503 on /health when engine is dead (#24897) Signed-off-by: dongbo910220 <1275604947@qq.com> Co-authored-by: Claude <noreply@anthropic.com> --- vllm/entrypoints/openai/api_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 93ea846f26f6c..912e664120929 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -103,6 +103,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, is_valid_ipv6_address, set_ulimit) +from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -351,8 +352,11 @@ def engine_client(request: Request) -> EngineClient: @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) + try: + await engine_client(raw_request).check_health() + return Response(status_code=200) + except EngineDeadError: + return Response(status_code=503) @router.get("/load") From 5f696c33b1fbf33fe91ecdd958874b9dd52f79b4 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" <noooop@126.com> Date: Thu, 18 Sep 2025 23:22:01 +0800 Subject: [PATCH 432/584] [New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- docs/models/supported_models.md | 11 +++ examples/offline_inference/pooling/README.md | 8 ++- examples/offline_inference/pooling/ner.py | 54 ++++++++++++++ examples/online_serving/pooling/README.md | 6 ++ examples/online_serving/pooling/ner.py | 71 +++++++++++++++++++ .../pooling/test_token_classification.py | 39 ++++++++++ tests/models/registry.py | 1 + vllm/entrypoints/llm.py | 4 ++ vllm/model_executor/models/bert.py | 52 ++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/attention/backends/flex_attention.py | 12 +++- 11 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 examples/offline_inference/pooling/ner.py create mode 100644 examples/online_serving/pooling/ner.py create mode 100644 tests/models/language/pooling/test_token_classification.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7aeaeca97699c..b67ebcbe3c81a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -554,6 +554,17 @@ If your model is not in the above list, we will try to automatically convert the For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. 
+#### Token Classification + +These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API. + +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | +|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| +| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ | + +!!! note + Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>. + [](){ #supported-mm-models } ## List of Multimodal Language Models diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 8693f5e08e0ba..79afbd9cfac47 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py python examples/offline_inference/pooling/embed_matryoshka_fy.py ``` +## Named Entity Recognition (NER) usage + +```bash +python examples/offline_inference/pooling/ner.py +``` + ## Qwen3 reranker usage ```bash -python qwen3_reranker.py +python examples/offline_inference/pooling/qwen3_reranker.py ``` diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py new file mode 100644 index 0000000000000..f18742fac0d54 --- /dev/null +++ b/examples/offline_inference/pooling/ner.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="boltuix/NeuroBERT-NER", + runner="pooling", + enforce_eager=True, + trust_remote_code=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Barack Obama visited Microsoft headquarters in Seattle on January 2025." + ] + + # Create an LLM. 
+ llm = LLM(**vars(args)) + tokenizer = llm.get_tokenizer() + label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label + + # Run inference + outputs = llm.encode(prompts) + + for prompt, output in zip(prompts, outputs): + logits = output.outputs.data + predictions = logits.argmax(dim=-1) + + # Map predictions to labels + tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids) + labels = [label_map[p.item()] for p in predictions] + + # Print results + for token, label in zip(tokens, labels): + if token not in tokenizer.all_special_tokens: + print(f"{token:15} → {label}") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index f7926542202d6..2c271b6a32bc2 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py python examples/online_serving/pooling/jinaai_rerank_client.py ``` +## Named Entity Recognition (NER) usage + +```bash +python examples/online_serving/pooling/ner.py +``` + ## Openai chat embedding for multimodal usage ```bash diff --git a/examples/online_serving/pooling/ner.py b/examples/online_serving/pooling/ner.py new file mode 100644 index 0000000000000..9ec2bd45a0fe5 --- /dev/null +++ b/examples/online_serving/pooling/ner.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER + +""" +Example online usage of Pooling API for Named Entity Recognition (NER). + +Run `vllm serve <model> --runner pooling` +to start up the server in vLLM. e.g. + +vllm serve boltuix/NeuroBERT-NER +""" + +import argparse + +import requests +import torch + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER") + + return parser.parse_args() + + +def main(args): + from transformers import AutoConfig, AutoTokenizer + + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + # Load tokenizer and config + tokenizer = AutoTokenizer.from_pretrained(model_name) + config = AutoConfig.from_pretrained(model_name) + label_map = config.id2label + + # Input text + text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025." 
+ prompt = {"model": model_name, "input": text} + + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + + # Run inference + output = pooling_response.json()["data"][0] + logits = torch.tensor(output["data"]) + predictions = logits.argmax(dim=-1) + inputs = tokenizer(text, return_tensors="pt") + + # Map predictions to labels + tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) + labels = [label_map[p.item()] for p in predictions] + assert len(tokens) == len(predictions) + + # Print results + for token, label in zip(tokens, labels): + if token not in tokenizer.all_special_tokens: + print(f"{token:15} → {label}") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py new file mode 100644 index 0000000000000..fd5e48a8b1449 --- /dev/null +++ b/tests/models/language/pooling/test_token_classification.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModelForTokenClassification + +from tests.models.utils import softmax + + +@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"]) +# The float32 is required for this tiny model to pass the test. +@pytest.mark.parametrize("dtype", ["float"]) +@torch.inference_mode +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) + + with hf_runner(model, + dtype=dtype, + auto_cls=AutoModelForTokenClassification) as hf_model: + tokenizer = hf_model.tokenizer + hf_outputs = [] + for prompt in example_prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = hf_model.wrap_device(inputs) + output = hf_model.model(**inputs) + hf_outputs.append(softmax(output.logits[0])) + + # check logits difference + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output).cpu().float() + vllm_output = torch.tensor(vllm_output).cpu().float() + assert torch.allclose(hf_output, vllm_output, 1e-2) diff --git a/tests/models/registry.py b/tests/models/registry.py index 93aa9d4025498..e9cc5170ade74 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -414,6 +414,7 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { # [Cross-encoder] "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 + "BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"), "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501 trust_remote_code=True, hf_overrides={ diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 63e9478612bb1..df6b16c73d6e7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -943,6 +943,10 @@ class LLM: considered legacy and may be deprecated in the future. You should instead pass them via the `inputs` parameter. 
""" + + if self.supported_tasks == ["encode"] and pooling_task is None: + pooling_task = "encode" + if pooling_task is None: if "embed" in self.supported_tasks: pooling_task = "embed" diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index c07e5364814ac..ee32587f6b1b4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -611,3 +611,55 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) + + +@default_pooling_type("ALL") +class BertForTokenClassification(nn.Module): + is_pooling_model = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.head_dtype = vllm_config.model_config.head_dtype + self.num_labels = config.num_labels + self.bert = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=BertEmbedding) + self.classifier = nn.Linear(config.hidden_size, + config.num_labels, + dtype=self.head_dtype) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + }) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loaded_params = loader.load_weights(weights) + return loaded_params + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + if token_type_ids is not None: + assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) + + hidden_states = self.bert(input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + + hidden_states = hidden_states.to(self.head_dtype) + return self.classifier(hidden_states) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 707b57106e6d9..1382fd9e93ea3 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -193,6 +193,7 @@ _EMBEDDING_MODELS = { _CROSS_ENCODER_MODELS = { "BertForSequenceClassification": ("bert", "BertForSequenceClassification"), + "BertForTokenClassification": ("bert", "BertForTokenClassification"), "GteNewForSequenceClassification": ("bert_with_rope", "GteNewForSequenceClassification"), "ModernBertForSequenceClassification": ("modernbert", diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index cb983494216a7..662d3984554ad 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -720,6 +720,15 @@ class FlexAttentionImpl(AttentionImpl): (query, key, value), ) + query = query[:, :, :num_actual_tokens, :] + if ((key_tensor.size(-2) > num_actual_tokens) + or (value_tensor.size(-2) > num_actual_tokens)): + # In the encoder-only model with torch.compile, + # qkv might be padded, which might cause exception. 
+ # see: https://github.com/vllm-project/vllm/pull/24872#discussion_r2353252290 + key_tensor = key_tensor[:, :, :num_actual_tokens, :] + value_tensor = value_tensor[:, :, :num_actual_tokens, :] + else: assert self.attn_type == AttentionType.DECODER key_cache, value_cache = kv_cache.unbind(0) @@ -744,7 +753,8 @@ class FlexAttentionImpl(AttentionImpl): (query, key_cache, value_cache), ) - query = query[:, :, :num_actual_tokens, :] + query = query[:, :, :num_actual_tokens, :] + # Doesn't work for now -> constraint violation # torch._dynamo.try_mark_dynamic(query, 2) From b419937c78017dc4c5bfa19f11547f4832ea2290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Fri, 19 Sep 2025 00:23:26 +0900 Subject: [PATCH 433/584] [Docs] Fix warnings in mkdocs build (continued) (#25163) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- .../device_communicators/shm_object_storage.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 352e7525d4c84..0310fc14da256 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -253,7 +253,7 @@ class SingleWriterShmRingBuffer: Args: nbytes (int, optional): The size of the buffer to free. If None, - frees the maximum size of the ring buffer. + frees the maximum size of the ring buffer. ''' assert self.is_writer, "Only the writer can free buffers." diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d391cc50ad232..4eb1f8b89d64f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -697,9 +697,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> TextTokensPrompt: """ - A simpler implementation of - [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] - that assumes single input. + A simpler implementation that tokenizes a single prompt input. """ async for result in self._tokenize_prompt_inputs_async( request, @@ -718,9 +716,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> AsyncGenerator[TextTokensPrompt, None]: """ - A simpler implementation of - [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] - that assumes multiple inputs. + A simpler implementation that tokenizes multiple prompt inputs. """ for prompt in prompt_inputs: if isinstance(prompt, str): From 2ea50e977aac00c63e78990a7477bb91295df183 Mon Sep 17 00:00:00 2001 From: Shu Wang <shuw@nvidia.com> Date: Thu, 18 Sep 2025 10:52:58 -0500 Subject: [PATCH 434/584] Enable Allgather/ReduceScatter backend for NaiveAllToAll (#23964) Signed-off-by: Shu Wang. 
<shuw@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Shu Wang <shuw@nvidia.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- .../device_communicators/all2all.py | 39 +++++++++++++++++++ .../device_communicators/cuda_communicator.py | 4 ++ vllm/envs.py | 17 +++++--- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 427fd040fcb71..149df73d8667b 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -5,6 +5,7 @@ from typing import Any import torch import torch.distributed as dist +from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.utils import has_deep_ep, has_pplx @@ -69,6 +70,44 @@ class NaiveAll2AllManager(All2AllManagerBase): pass +class AgRsAll2AllManager(All2AllManagerBase): + """ + An implementation of all2all communication based on + all-gather (dispatch) and reduce-scatter (combine). + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + """ + Gather hidden_states and router_logits from all dp ranks. + """ + sizes = get_forward_context( + ).dp_metadata.get_chunk_sizes_across_dp_rank() + hidden_states, router_logits = get_dp_group().all_gatherv( + [hidden_states, router_logits], + dim=0, + sizes=sizes, + ) + return hidden_states, router_logits + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Reduce-scatter hidden_states across all dp ranks. + """ + sizes = get_forward_context( + ).dp_metadata.get_chunk_sizes_across_dp_rank() + hidden_states = get_dp_group().reduce_scatterv(hidden_states, + dim=0, + sizes=sizes) + return hidden_states + + def destroy(self): + pass + + class PPLXAll2AllManager(All2AllManagerBase): """ All2All communication based on PPLX kernels. 
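For intuition, the dispatch/combine semantics that the new AgRsAll2AllManager builds on can be sketched in a single process. The snippet below fakes the DP group with a plain list of per-rank tensors; `sizes` stands in for `get_chunk_sizes_across_dp_rank()`, and the helper names are illustrative only, not vLLM's distributed API:

import torch


def dispatch_all_gather(per_rank_tokens: list[torch.Tensor]) -> torch.Tensor:
    """All-gather: every rank ends up with the concatenation of all tokens."""
    return torch.cat(per_rank_tokens, dim=0)


def combine_reduce_scatter(per_rank_partials: list[torch.Tensor],
                           sizes: list[int]) -> list[torch.Tensor]:
    """Reduce-scatter: sum the per-rank partial outputs, then hand each
    rank back only the rows it originally contributed."""
    reduced = torch.stack(per_rank_partials, dim=0).sum(dim=0)
    return list(reduced.split(sizes, dim=0))


sizes = [3, 2]  # tokens owned by DP rank 0 and DP rank 1
per_rank = [torch.randn(n, 8) for n in sizes]

gathered = dispatch_all_gather(per_rank)     # (5, 8), identical on each rank
partials = [gathered * 0.5, gathered * 0.5]  # stand-in for per-rank MoE output
combined = combine_reduce_scatter(partials, sizes)

assert [t.shape[0] for t in combined] == sizes
assert torch.allclose(combined[0], gathered[:3])  # 0.5 + 0.5 of each row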
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 78c90b006ffc8..b2bf3bc3cc2ed 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -87,6 +87,10 @@ class CudaCommunicator(DeviceCommunicatorBase): from .all2all import NaiveAll2AllManager self.all2all_manager = NaiveAll2AllManager(self.cpu_group) logger.info("Using naive all2all manager.") + elif all2all_backend == "allgather_reducescatter": + from .all2all import AgRsAll2AllManager + self.all2all_manager = AgRsAll2AllManager(self.cpu_group) + logger.info("Using AllGather-ReduceScatter all2all manager.") elif all2all_backend == "pplx": from .all2all import PPLXAll2AllManager self.all2all_manager = PPLXAll2AllManager(self.cpu_group) diff --git a/vllm/envs.py b/vllm/envs.py index 72e1d5b0ede81..19e2f8635275d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -149,8 +149,11 @@ if TYPE_CHECKING: VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 - VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput", - "deepep_low_latency"] = "naive" + VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter"] = \ + "allgather_reducescatter" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 VLLM_SLEEP_WHEN_IDLE: bool = False @@ -1124,14 +1127,18 @@ environment_variables: dict[str, Callable[[], Any]] = { # all2all backend for vllm's expert parallel communication # Available options: - # - "naive": naive all2all implementation using all-reduce + # - "naive": naive all2all implementation using broadcasts + # - "allgather_reducescatter": all2all implementation based on allgather and + # reducescatter # - "pplx": use pplx kernels # - "deepep_high_throughput", use deepep high-throughput kernels # - "deepep_low_latency", use deepep low-latency kernels "VLLM_ALL2ALL_BACKEND": - env_with_choices("VLLM_ALL2ALL_BACKEND", "naive", + env_with_choices("VLLM_ALL2ALL_BACKEND", "allgather_reducescatter", ["naive", "pplx", - "deepep_high_throughput", "deepep_low_latency"]), + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter"]), # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. # Both require compute capability 10.0 or above. 
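Since this patch changes the default backend, it is worth noting how to switch back: the choice is driven entirely by the VLLM_ALL2ALL_BACKEND environment variable shown in the envs.py hunk above. A minimal sketch, assuming the variable is set before vLLM starts its workers:

import os

# Opt back into the old broadcast-based implementation; the new default is
# "allgather_reducescatter". The other valid choices listed in envs.py above
# are "pplx", "deepep_high_throughput" and "deepep_low_latency".
os.environ["VLLM_ALL2ALL_BACKEND"] = "naive"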
From 1c3b1634aa9d4be56fa6e931e96ec8145fedcc0a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:01:50 +0100 Subject: [PATCH 435/584] [Misc] Add codeowner for Transformers backend (#25180) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b8d6db06548d5..08717cdde643a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -61,6 +61,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC +# Transformers backend +/vllm/model_executor/models/transformers.py @hmellor +/tests/models/test_transformers.py @hmellor + # Docs /docs @hmellor mkdocs.yaml @hmellor From c4cb0af98a8e39950fa9b99acf7c241959a14ac8 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Thu, 18 Sep 2025 09:12:19 -0700 Subject: [PATCH 436/584] [spec decode] Fix MTP inference path for MiMo-7B model (#25136) Signed-off-by: zixi-qi <qizixi@meta.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- examples/offline_inference/spec_decode.py | 6 +++++- vllm/config/speculative.py | 2 +- vllm/model_executor/models/mimo_mtp.py | 18 ++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 5af232cb6af6a..004e75b204642 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -53,7 +53,6 @@ def parse_args(): "--method", type=str, default="eagle", - choices=["ngram", "eagle", "eagle3", "mtp"], ) parser.add_argument("--num-spec-tokens", type=int, default=2) parser.add_argument("--prompt-lookup-max", type=int, default=5) @@ -118,6 +117,11 @@ def main(): "prompt_lookup_max": args.prompt_lookup_max, "prompt_lookup_min": args.prompt_lookup_min, } + elif args.method.endswith("mtp"): + speculative_config = { + "method": args.method, + "num_speculative_tokens": args.num_spec_tokens, + } else: raise ValueError(f"unknown method: {args.method}") diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index fca8c28e5c61e..2c861723c3966 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -31,7 +31,7 @@ logger = init_logger(__name__) SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp", - "ernie_mtp", "qwen3_next_mtp"] + "ernie_mtp", "qwen3_next_mtp", "mimo_mtp"] @config diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index ac835edc001ea..09194e9f95d0e 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -241,6 +241,15 @@ class MiMoMTP(nn.Module): def map_model_name_to_mtp_param_name(self, name: str) -> str: import regex as re + + # append mtp_start_layer_idx + pattern = r"(model\.mtp_layers\.)(\d+)(\.)" + match = re.match(pattern, name) + if match: + original_num = int(match.group(2)) + new_num = original_num + self.config.num_hidden_layers + name = name.replace(match.group(), f"{match.group(1)}{new_num}.") + # check for early turn name_without_prefix = [ "token_layernorm", "hidden_layernorm", "input_proj", "final_layernorm" @@ -248,10 +257,11 @@ class MiMoMTP(nn.Module): for sub_name in name_without_prefix: if sub_name in name: return name - pattern = r"model.mtp_layers.(\d+)." 
- group = re.match(pattern, name) - if group is not None: - name = name.replace(group.group(), group.group() + "mtp_block.") + # add mtp_block + pattern = r"(model\.mtp_layers\.\d+\.)" + match = re.match(pattern, name) + if match: + name = name.replace(match.group(), match.group() + "mtp_block.") return name def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str: From dc3405936090f5c964a5b38c9de8c8400f01541c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:36:55 -0400 Subject: [PATCH 437/584] [ROCm][CI/Build] Use ROCm7.0 as the base (#25178) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> --- docker/Dockerfile.rocm | 5 ++- docker/Dockerfile.rocm_base | 61 ++++++++----------------------------- 2 files changed, 16 insertions(+), 50 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 063fc49693288..c8900212e5a1b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main" ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \ - && git checkout FETCH_HEAD + && git checkout FETCH_HEAD \ + && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \ + git remote add upstream "https://github.com/vllm-project/vllm.git" \ + && git fetch upstream ; fi FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 2ba5461dfe551..4973b57f76563 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,25 +1,23 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete -ARG HIPBLASLT_BRANCH="aa0bda7b" -ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" -ARG LEGACY_HIPBLASLT_OPTION= -ARG TRITON_BRANCH="e5be006" -ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="f717b2af" -ARG PYTORCH_VISION_BRANCH="v0.21.0" +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete +ARG TRITON_BRANCH="f9e5bf54" +ARG TRITON_REPO="https://github.com/ROCm/triton.git" +ARG PYTORCH_BRANCH="b2fb6885" +ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="1a7f4dfa" +ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="4822e675" +ARG AITER_BRANCH="2ab9f4cd" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base -ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV ROCM_PATH=/opt/rocm ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: -ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV AITER_ROCM_ARCH=gfx942;gfx950 ARG PYTHON_VERSION=3.12 @@ -45,29 +43,6 @@ RUN apt-get update -y \ RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython -FROM base AS build_hipblaslt -ARG HIPBLASLT_BRANCH -ARG HIPBLAS_COMMON_BRANCH -# Set to "--legacy_hipblas_direct" for ROCm<=6.2 -ARG LEGACY_HIPBLASLT_OPTION -RUN git clone https://github.com/ROCm/hipBLAS-common.git -RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y -RUN cd hipBLAS-common \ - && git checkout ${HIPBLAS_COMMON_BRANCH} \ - && mkdir build \ - && cd build \ - && cmake 
.. \ - && make package \ - && dpkg -i ./*.deb -RUN git clone https://github.com/ROCm/hipBLASLt -RUN cd hipBLASLt \ - && git checkout ${HIPBLASLT_BRANCH} \ - && apt-get install -y llvm-dev \ - && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ - && cd build/release \ - && make package -RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install - FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO @@ -121,13 +96,11 @@ RUN cd aiter \ && git checkout ${AITER_BRANCH} \ && git submodule update --init --recursive \ && pip install -r requirements.txt -RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install FROM base AS debs RUN mkdir /app/debs -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - cp /install/*.deb /app/debs RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ cp /install/*.whl /app/debs RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ cp /install/*.whl /app/debs FROM base AS final -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ pip install /install/*.whl ARG BASE_IMAGE -ARG HIPBLAS_COMMON_BRANCH -ARG HIPBLASLT_BRANCH -ARG LEGACY_HIPBLASLT_OPTION ARG TRITON_BRANCH ARG TRITON_REPO ARG PYTORCH_BRANCH @@ -167,9 +132,6 @@ ARG FA_REPO ARG AITER_BRANCH ARG AITER_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ - && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ - && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ - && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ @@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file From bbdc0f2366997536207abc212fcdae7a1b688159 Mon Sep 17 00:00:00 2001 From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:46:47 -0500 Subject: [PATCH 438/584] [ROCm][AITER][Bugfix] Switch AITER 
to use PIECEWISE_AND_FULL compilation (#25104) Signed-off-by: Rohan138 <rohanpotdar138@gmail.com> --- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 8eb3505cf274d..afb2283c44d37 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -232,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - cudagraph_support = AttentionCGSupport.ALWAYS + cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From 505805b645649be6a8e788a1f048b851fa123ef1 Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Thu, 18 Sep 2025 20:57:07 +0300 Subject: [PATCH 439/584] [KV offload][1/N] Introduce an offloading component (#19848) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- .buildkite/test-pipeline.yaml | 1 + tests/v1/offloading/test_worker.py | 152 +++++++++++++++++++++++++ vllm/v1/offloading/abstract.py | 165 ++++++++++++++++++++++++++++ vllm/v1/offloading/mediums.py | 39 +++++++ vllm/v1/offloading/worker/worker.py | 142 ++++++++++++++++++++++++ 5 files changed, 499 insertions(+) create mode 100644 tests/v1/offloading/test_worker.py create mode 100644 vllm/v1/offloading/abstract.py create mode 100644 vllm/v1/offloading/mediums.py create mode 100644 vllm/v1/offloading/worker/worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 66dfc990805f2..5fd08296625ad 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -280,6 +280,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/executor + - pytest -v -s v1/offloading - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/offloading/test_worker.py new file mode 100644 index 0000000000000..2391b565773aa --- /dev/null +++ b/tests/v1/offloading/test_worker.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.offloading.worker.worker import (OffloadingHandler, + OffloadingWorker, TransferResult, + TransferSpec) + + +class LoadStoreSpec1(LoadStoreSpec): + + def __init__(self, + submit_success: bool = True, + async_success: bool = True, + exception: bool = False): + self.finished = False + self.submit_success = submit_success + self.async_success = async_success + self.exception = exception + + @staticmethod + def medium() -> str: + return "1" + + def __repr__(self): + return f"{self.medium()}: {id(self)}" + + +class LoadStoreSpec2(LoadStoreSpec): + + @staticmethod + def medium() -> str: + return "2" + + def __repr__(self): + return f"{self.medium()}: {id(self)}" + + +class OffloadingHandler1To2(OffloadingHandler): + + def __init__(self): + self.transfers: dict[int, LoadStoreSpec1] = {} + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + src, dst = spec + assert isinstance(src, LoadStoreSpec1) + assert isinstance(dst, LoadStoreSpec2) + + if src.exception: + raise Exception("An expected exception. 
Don't worry!")
+        if not src.submit_success:
+            return False
+
+        self.transfers[job_id] = src
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+class OffloadingHandler2To1(OffloadingHandler):
+
+    def __init__(self):
+        self.transfers: dict[int, LoadStoreSpec1] = {}
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src, dst = spec
+        assert isinstance(src, LoadStoreSpec2)
+        assert isinstance(dst, LoadStoreSpec1)
+
+        self.transfers[job_id] = dst
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+def test_offloading_worker():
+    """
+    Tests OffloadingWorker with 2 handlers.
+    One handler performs 1->2 transfers, and the other handles 2->1.
+    """
+    worker = OffloadingWorker()
+    handler1to2 = OffloadingHandler1To2()
+    handler2to1 = OffloadingHandler2To1()
+    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
+    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)
+
+    # 1st transfer 1->2 (exception)
+    src1 = LoadStoreSpec1(exception=True)
+    dst1 = LoadStoreSpec2()
+    assert not worker.transfer_async(1, (src1, dst1))
+
+    # 2nd transfer 1->2 (failure to submit)
+    src2 = LoadStoreSpec1(submit_success=False)
+    dst2 = LoadStoreSpec2()
+    assert not worker.transfer_async(2, (src2, dst2))
+
+    # 3rd transfer 1->2 (failure)
+    src3 = LoadStoreSpec1(async_success=False)
+    dst3 = LoadStoreSpec2()
+    assert worker.transfer_async(3, (src3, dst3))
+
+    # 4th transfer 1->2 (success)
+    src4 = LoadStoreSpec1()
+    dst4 = LoadStoreSpec2()
+    worker.transfer_async(4, (src4, dst4))
+    assert set(handler1to2.transfers.keys()) == {3, 4}
+
+    # 5th transfer 2->1
+    src5 = LoadStoreSpec2()
+    dst5 = LoadStoreSpec1()
+    worker.transfer_async(5, (src5, dst5))
+    assert set(handler2to1.transfers.keys()) == {5}
+
+    # no transfer completed yet
+    assert worker.get_finished() == []
+
+    # complete 3rd, 4th
+    src3.finished = True
+    src4.finished = True
+
+    # 6th transfer 1->2
+    src6 = LoadStoreSpec1()
+    dst6 = LoadStoreSpec2()
+    worker.transfer_async(6, (src6, dst6))
+
+    # 7th transfer 2->1
+    src7 = LoadStoreSpec2()
+    dst7 = LoadStoreSpec1()
+    worker.transfer_async(7, (src7, dst7))
+
+    # 6th and 7th transfers started
+    assert 6 in handler1to2.transfers
+    assert 7 in handler2to1.transfers
+
+    # verify result of 3rd and 4th transfers
+    assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
+
+    # complete 6th and 7th transfers
+    src6.finished = True
+    dst7.finished = True
+    assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/offloading/abstract.py
new file mode 100644
index 0000000000000..9f9c044ea1c53
--- /dev/null
+++ b/vllm/v1/offloading/abstract.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their addresses.
+
+The class provides the following primitives:
+    lookup() - find the length of the maximal series of blocks,
+        starting from the first one, that are all offloaded.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the given blocks as recently used. Can be used
+        to track blocks' LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a StoreSpec encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    block_hashes_to_store: list[BlockHash]
+    store_spec: LoadStoreSpec
+    block_hashes_evicted: list[BlockHash]
+
+
+@dataclass
+class OffloadingEvent:
+    block_hashes: list[BlockHash]
+    block_size: int
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+class OffloadingManager(ABC):
+
+    @abstractmethod
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        """
+        Finds the length of the maximal series of blocks, starting from the
+        first one, that are all offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to lookup.
+
+        Returns:
+            An integer representing the maximal number of blocks that
+            are currently offloaded.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
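To make the call sequence described in the module docstring above concrete, here is a minimal sketch of the scheduler-side flow, assuming some concrete OffloadingManager implementation (this patch ships only the interface); manager, block_hashes, and the helper names are hypothetical:

    # Load path: restore the longest already-offloaded prefix of a request.
    def sketch_load(manager, block_hashes):
        hit_len = manager.lookup(block_hashes)  # leading blocks that are offloaded
        manager.touch(block_hashes)  # refresh recency, even for GPU-cached blocks
        if hit_len == 0:
            return None
        hits = block_hashes[:hit_len]
        load_spec = manager.prepare_load(hits)  # hits now protected from eviction
        # ... a worker-side handler performs the copy described by load_spec ...
        manager.complete_load(hits)  # re-allow eviction once the read completes
        return load_spec

    # Store path: offload blocks, then publish them once the write lands.
    def sketch_store(manager, block_hashes):
        out = manager.prepare_store(block_hashes)
        if out is None:
            return  # the manager cannot store these blocks
        # ... a worker-side handler writes the blocks named by out.store_spec ...
        manager.complete_store(out.block_hashes_to_store, success=True)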
+
+    @abstractmethod
+    def prepare_store(
+        self,
+        block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and a list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/offloading/mediums.py
new file mode 100644
index 0000000000000..5a1887848c9fc
--- /dev/null
+++ b/vllm/v1/offloading/mediums.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+import numpy as np
+
+from vllm.v1.offloading.abstract import LoadStoreSpec
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/offloading/worker/worker.py
new file mode 100644
index 0000000000000..d2c2045d1f1f6
--- /dev/null
+++ b/vllm/v1/offloading/worker/worker.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.offloading.abstract import LoadStoreSpec
+
+# a single transfer spec (src_blocks_spec, dst_blocks_spec)
+TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
+# transfers are forwarded to workers by (src_medium, dst_medium)
+TransferType = tuple[str, str]
+# transfer result (job_id, success)
+TransferResult = tuple[int, bool]
+
+logger = init_logger(__name__)
+
+
+class OffloadingHandler(ABC):
+    """
+    OffloadingHandler class for managing asynchronous KV data transfers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, and allows
+    collecting back completion statuses.
+
+    The class provides the following primitives:
+        transfer_async() - kicks off a new transfer job
+        get_finished() - returns a list of newly finished job IDs.
+    """
+
+    @abstractmethod
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+ """ + pass + + @abstractmethod + def get_finished(self) -> list[TransferResult]: + """ + Get transfers finished since last call. + + Returns: + A list of (job_id, success) of transfers. + """ + pass + + +class OffloadingWorker: + """ + OffloadingWorker class for managing asynchronous KV data transfers + using multiple OffloadingHandlers + + This class runs in the worker. + It kicks off async KV data transfer requests, by delegating + to one of its registered OffloadingHandlers, based on the transfer type. + + The class provides the following primitives: + register_handler() - registers a new handler to handle + a specific transfer type + transfer_async() - kicks off a new transfer job + using one of the registered handlers. + get_finished() - returns a list of newly finished job IDs + from all handlers. + """ + + def __init__(self): + self.handlers: set[OffloadingHandler] = set() + self.transfer_type_to_handler: dict[TransferType, + OffloadingHandler] = {} + + def register_handler(self, src_cls: type[LoadStoreSpec], + dst_cls: type[LoadStoreSpec], + handler: OffloadingHandler) -> None: + """ + Registers a new handler. + + Args: + src_cls: the source type of transfers handled by this handler. + dst_cls: the destination type of transfers handled by this handler. + handler: the handler that will handle transfers. + """ + transfer_type = (src_cls.medium(), dst_cls.medium()) + assert transfer_type not in self.transfer_type_to_handler + self.handlers.add(handler) + self.transfer_type_to_handler[transfer_type] = handler + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + """ + Initiates an asynchronous transfer of KV data. + + Args: + job_id: a unique ID that will be used when notifying back on + transfer completion. + spec: the (src, dst) spec of the KV data transfer. + + Returns: + True if transfer was submitted successfully. + """ + src, dst = spec + transfer_type = (src.medium(), dst.medium()) + handler = self.transfer_type_to_handler.get(transfer_type) + assert handler is not None + + try: + success = handler.transfer_async(job_id, spec) + except Exception as e: + logger.warning("Exception in %r transfer %d: %r", + transfer_type, + job_id, + e, + exc_info=True) + return False + + if not success: + logger.warning("Failed to submit %r transfer %d", transfer_type, + job_id) + else: + logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id, + spec) + + return success + + def get_finished(self) -> list[TransferResult]: + """ + Get transfers finished since last call. + + Returns: + A list of (job_id, success) of transfers. 
+ """ + finished = [] + for handler in self.handlers: + finished.extend(handler.get_finished()) + return finished From e19bce40a1660cb7c03b790d0b000db155cf925d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Thu, 18 Sep 2025 11:07:42 -0700 Subject: [PATCH 440/584] [V0 Deprecation] Remove AsyncLLMEngine (#25025) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/entrypoints/openai/test_chat.py | 54 +- tests/entrypoints/openai/test_completion.py | 830 ------------- .../test_completion_with_prompt_embeds.py | 3 + .../entrypoints/openai/test_lora_adapters.py | 5 +- tests/entrypoints/openai/test_metrics.py | 2 +- .../openai/test_return_tokens_as_ids.py | 26 +- .../entrypoints/openai/test_skip_tokenizer.py | 8 - tests/v1/test_oracle.py | 18 - vllm/engine/async_llm_engine.py | 1030 +---------------- vllm/entrypoints/launcher.py | 2 - vllm/entrypoints/openai/api_server.py | 65 +- 11 files changed, 76 insertions(+), 1967 deletions(-) delete mode 100644 tests/entrypoints/openai/test_completion.py diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a827f94cfbfe5..3bdfef7b4adbc 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,11 +28,9 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) -def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +@pytest.fixture(scope="module") +def server(monkeypatch_module, zephyr_lora_files): #noqa: F811 + monkeypatch_module.setenv('VLLM_USE_V1', '1') args = [ # use half precision for speed and memory savings in CI environment @@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 yield remote_server -@pytest.fixture -def is_v1_server(server): - import os - assert os.environ['VLLM_USE_V1'] in ['0', '1'] - return os.environ['VLLM_USE_V1'] == '1' - - @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_structured_outputs_choice_chat( - client: openai.AsyncOpenAI, sample_structured_outputs_choices, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") + client: openai.AsyncOpenAI, + sample_structured_outputs_choices, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat( @pytest.mark.asyncio -async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") - +async def test_structured_outputs_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, - sample_regex, is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") +async def test_structured_outputs_regex_chat( + client: openai.AsyncOpenAI, + 
sample_regex, +): messages = [{ "role": "system", @@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs( @pytest.mark.asyncio -async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Tool use is only supported in v1 engine") +async def test_named_tool_use( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_response_format_json_schema(client: openai.AsyncOpenAI, - is_v1_server: bool): - if not is_v1_server: - pytest.skip( - "JSON schema response format is only supported in v1 engine") +async def test_response_format_json_schema(client: openai.AsyncOpenAI): prompt = 'what is 1+1? The format is "result": 2' # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py deleted file mode 100644 index 0347513befe32..0000000000000 --- a/tests/entrypoints/openai/test_completion.py +++ /dev/null @@ -1,830 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for structured outputs tests -import json -import os -from typing import Optional - -import jsonschema -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -import regex as re -import requests -# downloading lora to test lora requests -from openai import BadRequestError - -from vllm.transformers_utils.tokenizer import get_tokenizer - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically these adapters use a different base model, -# but we're not testing generation quality here - - -@pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files): - return [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - ] - - -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) -def server(default_server_args, request): - if request.param: - default_server_args.append(request.param) - - original_value = os.environ.get('VLLM_USE_V1') - os.environ['VLLM_USE_V1'] = '0' - try: - with RemoteOpenAIServer(MODEL_NAME, - default_server_args) as remote_server: - yield remote_server - finally: - # Restore original env value - if original_value is None: - os.environ.pop('VLLM_USE_V1', None) - else: - os.environ['VLLM_USE_V1'] = original_value - - -@pytest.fixture -def is_v1_server(server): - import os - - # For completion tests, we assume v0 since there's no explicit v1 setup - return os.environ.get('VLLM_USE_V1', '0') == '1' - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = 
await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - # test using token IDs - with pytest.raises(openai.BadRequestError, match="out of vocabulary"): - # Added tokens should be rejected by the base model - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... 
- with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): - params: dict = { - "prompt": ["A robot may not injure another robot", "My name is"], - "model": model_name, - } - if prompt_logprobs is not None: - params["extra_body"] = {"prompt_logprobs": prompt_logprobs} - - if prompt_logprobs is not None and prompt_logprobs < 0: - with pytest.raises(BadRequestError): - await client.completions.create(**params) - else: - completion = await client.completions.create(**params) - if prompt_logprobs is not None: - assert completion.choices[0].prompt_logprobs is not None - assert len(completion.choices[0].prompt_logprobs) > 0 - - assert completion.choices[1].prompt_logprobs is not None - assert len(completion.choices[1].prompt_logprobs) > 0 - - else: - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: list[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): - """Streaming for parallel sampling. - The tokens from multiple samples, are flattened into a single stream, - with an index to indicate which sample the token belongs to. - """ - - prompt = "What is an LLM?" 
- n = 3 - max_tokens = 5 - - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) - chunks: list[list[str]] = [[] for i in range(n)] - finish_reason_count = 0 - async for chunk in stream: - index = chunk.choices[0].index - text = chunk.choices[0].text - chunks[index].append(text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - assert finish_reason_count == n - for chunk in chunks: - assert len(chunk) == max_tokens - print("".join(chunk)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) - - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is not None - assert chunk.usage.prompt_tokens > 0 - assert chunk.usage.completion_tokens > 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) - if chunk.choices[0].finish_reason is not None: - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options= - # {"include_usage": None} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, 
stream_options= - # {"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": None} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": None}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": True} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but - # not necessary for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in 
response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -async def test_allowed_token_ids(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 1 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - allowed_ids = [21555, 21557, 21558] - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - seed=42, - extra_body=dict(allowed_token_ids=allowed_ids), - logprobs=1, - ) - response_tokens = completion.choices[0].logprobs.tokens - assert len(response_tokens) == 1 - assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids - - -@pytest.mark.asyncio -async def test_structured_outputs_json_completion( - client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(structured_outputs=dict(json=sample_json_schema))) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.asyncio -async def test_structured_outputs_regex_completion( - client: openai.AsyncOpenAI, - sample_regex, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {sample_regex}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(structured_outputs=dict(regex=sample_regex))) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None - - -@pytest.mark.asyncio -async def test_structured_outputs_choice_completion( - client: openai.AsyncOpenAI, - sample_structured_outputs_choices, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(structured_outputs=dict( - choice=sample_structured_outputs_choices))) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in sample_structured_outputs_choices - - -@pytest.mark.asyncio -async def test_structured_outputs_grammar(client: openai.AsyncOpenAI, - sample_sql_statements, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("grammar is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - temperature=1.0, - max_tokens=500, - extra_body=dict( - structured_outputs=dict(grammar=sample_sql_statements), )) - - content = completion.choices[0].text - - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(content) - - # 
remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") - - assert content.strip() == ground_truth - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -@pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - # test using text and token IDs - for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt - assert re.search(r"^" + prompt_text, completion.choices[0].text) - logprobs = completion.choices[0].logprobs - assert logprobs is not None - assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) - for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 - assert len(logprobs.tokens) > 5 - - -@pytest.mark.asyncio -async def test_structured_outputs_type_error(client: openai.AsyncOpenAI, - sample_json_schema, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(structured_outputs=dict(json=42))) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(structured_outputs=dict( - regex=sample_regex, - json=sample_json_schema, - ))) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name,stream,echo", - [ - (MODEL_NAME, False, False), - (MODEL_NAME, False, True), - (MODEL_NAME, True, False), - (MODEL_NAME, True, True) # should not raise BadRequestError error - ], -) -async def test_echo_stream_completion(client: openai.AsyncOpenAI, - model_name: str, stream: bool, - echo: bool): - saying: str = "Hello, my name is" - result = await client.completions.create(model=model_name, - prompt=saying, - max_tokens=10, - temperature=0.0, - echo=echo, - stream=stream) - - stop_reason = "length" - - if not stream: - completion = result - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == stop_reason - - if echo: - assert choice.text is not None and saying in choice.text - else: - assert choice.text is not None and saying not in choice.text - - else: - chunks: list[str] = [] - final_finish_reason = None - async for chunk in result: - if chunk.choices and chunk.choices[0].text: - chunks.append(chunk.choices[0].text) - if chunk.choices and chunk.choices[0].finish_reason: - final_finish_reason = chunk.choices[0].finish_reason - - assert final_finish_reason == stop_reason - content = "".join(chunks) - if echo: - assert content is not None and saying in content - else: - assert content is not None and saying not 
in content - - -@pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): - request_args = { - "model": MODEL_NAME, - "prompt": "Hello, my name is", - "max_tokens": 5, - "temperature": 0.0, - "logprobs": None, - } - - completion = await client.completions.create(**request_args) - - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) - invocation_response.raise_for_status() - - completion_output = completion.model_dump() - invocation_output = invocation_response.json() - - assert completion_output.keys() == invocation_output.keys() - assert completion_output["choices"] == invocation_output["choices"] diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 7b58f851a4d21..3d56291bc793c 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,6 +14,9 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer +pytest.skip("Skipping prompt_embeds test until V1 supports it.", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 10c0cb5f4d151..6f2addd3649da 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,12 +53,13 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) +@pytest.fixture(scope="module", params=[True]) def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + assert use_v1 + monkeypatch_module.setenv('VLLM_USE_V1', '1') # Define the json format LoRA module configurations lora_module_1 = { diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 0c9e0f3a51429..8917aa5a5efb9 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True, False]) +@pytest.fixture(scope="module", params=[True]) def use_v1(request): # Module-scoped variant of run_with_both_engines # diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 5f43fdc9588f3..ef9d5234f2317 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -10,8 +10,30 @@ import pytest from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import default_server_args # noqa: F401 -from .test_completion import MODEL_NAME + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def default_server_args(zephyr_lora_files): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] 
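With the V0 engine path gone, vllm.engine.async_llm_engine is reduced to re-exporting V1's AsyncLLM (see the hunk below). As rough orientation, a minimal V1-style driver looks like this; the from_engine_args/generate names follow vLLM's EngineClient API, but treat the exact signatures as assumptions rather than something verified against this revision:

    import asyncio

    from vllm import SamplingParams
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.v1.engine.async_llm import AsyncLLM


    async def main():
        engine = AsyncLLM.from_engine_args(
            AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct"))
        params = SamplingParams(temperature=0.0, max_tokens=16)
        # generate() yields RequestOutput objects; the final one has
        # finished=True, replacing the AsyncStream plumbing deleted below.
        async for out in engine.generate("Hello, my name is", params,
                                         request_id="demo-1"):
            if out.finished:
                print(out.outputs[0].text)


    asyncio.run(main())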
@pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index 840e0dac81c97..b469fc76fc7a2 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" DTYPE = "float16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 794c1f68f1471..28c24f62895ab 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -7,7 +7,6 @@ import pytest import vllm.envs as envs from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine MODEL = "meta-llama/Llama-3.2-1B-Instruct" @@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch): _ = AsyncEngineArgs(model=MODEL).create_engine_config() assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - - -def test_reject_using_constructor_directly(monkeypatch): - with monkeypatch.context() as m: - if os.getenv("VLLM_USE_V1", None): - m.delenv("VLLM_USE_V1") - - # Sets VLLM_USE_V1=1. - vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config() - - # This uses the V0 constructor directly. - with pytest.raises(ValueError): - AsyncLLMEngine(vllm_config, - AsyncLLMEngine._get_executor_cls(vllm_config), - log_stats=True) - - m.delenv("VLLM_USE_V1") diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6793041abc502..ede027759a8b2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,1032 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import time -import weakref -from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) -from weakref import ReferenceType +from vllm.v1.engine.async_llm import AsyncLLM -import vllm.envs as envs -from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) -from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.metrics_types import StatLoggerBase -from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind - -logger = init_logger(__name__) -ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def 
_log_task_completion(task: asyncio.Task, - error_callback: Callable[[Exception], None]) -> None: - """This function is only intended for the `engine.run_engine_loop()` task. - - In particular, that task runs a `while True` loop that can only exit if - there is an exception. - """ - - exception = None - try: - return_value = task.result() - raise AssertionError( - f"The engine background task should never finish without an " - f"exception. {return_value}") - except asyncio.exceptions.CancelledError: - # We assume that if the task is cancelled, we are gracefully shutting - # down. This should only happen on program exit. - logger.info("Engine is gracefully shutting down.") - except Exception as e: - exception = e - logger.error("Engine background task failed", exc_info=e) - error_callback(exception) - raise AsyncEngineDeadError( - "Task finished unexpectedly. This should never happen! " - "Please open an issue on GitHub. See stack trace above for the " - "actual cause.") from e - - -STOP_ITERATION = Exception() # Sentinel - - -class AsyncStream: - """A stream of RequestOutputs for a request that can be iterated over - asynchronously via an async generator.""" - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait( - exception if self._is_raisable(exception) else STOP_ITERATION) - - @property - def finished(self) -> bool: - return self._finished - - async def generator(self) -> AsyncGenerator[RequestOutput, None]: - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - if result == STOP_ITERATION: - return - raise result - yield result - except GeneratorExit: - self._cancel(self.request_id) - raise asyncio.CancelledError from None - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._aborted_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = asyncio.Event() - - def __contains__(self, item): - return item in self._request_streams - - def __len__(self) -> int: - return len(self._request_streams) - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an exception to request streams - (all if request_id is None).""" - if request_id is not None: - self.abort_request(request_id, exception=exc) - else: - # NB: tuple() used here because self.abort_request pops the stream - # out of self._request_streams, so we can't iterate on it directly - for rid in tuple(self._request_streams.keys()): - self.abort_request(rid, exception=exc) - - def process_request_output(self, - request_output: RequestOutput, - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - finished = request_output.finished - - if finished: 
- stream = self._request_streams.pop(request_id, None) - else: - stream = self._request_streams.get(request_id) - # Guard against a KeyError which can occur if the request was aborted - # while the output was generated - if stream is not None: - stream.put(request_output) - if finished: - stream.finish() - - if verbose and finished: - logger.info("Finished request %s.", request_id) - - def process_exception(self, - request_id: str, - exception: BaseException, - *, - verbose: bool = False) -> None: - """Propagate an exception from the engine.""" - if verbose: - logger.info("Finished request %s.", request_id) - self.abort_request(request_id, exception=exception) - - def add_request(self, - request_id: str, - *, - verbose: bool = False, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - abort_request = partial(self.abort_request, verbose=verbose) - stream = AsyncStream(request_id, abort_request) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - if verbose: - logger.info("Added request %s.", request_id) - - return stream - - def abort_request(self, - request_id: str, - *, - exception: Optional[Union[BaseException, - Type[BaseException]]] = None, - verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info("Aborted request %s.", request_id) - - self._aborted_requests.put_nowait(request_id) - - stream = self._request_streams.pop(request_id, None) - if stream is not None: - stream.finish(exception=exception) - - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._aborted_requests.empty(): - request_id = self._aborted_requests.get_nowait() - finished_requests.add(request_id) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - request_id = stream.request_id - if request_id in finished_requests: - # The request has already been aborted. - stream.finish(asyncio.CancelledError) - finished_requests.discard(request_id) - else: - self._request_streams[request_id] = stream - new_requests.append(new_request) - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - if not self.has_new_requests(): - await self.new_requests_event.wait() - self.new_requests_event.clear() - - def has_new_requests(self): - return not self._new_requests.empty() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - async def step_async(self, virtual_engine: int) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - # these are cached outputs from previous iterations. 
None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - if not self._has_remaining_steps(seq_group_metadata_list): - - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - if not scheduler_outputs.is_empty(): - # this will cause mamba_cache/minimax_cache failed - # to release finished_requests_ids of the last steps - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - virtual_engine=virtual_engine, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - # Execute the model. - outputs = await self.model_executor.execute_model_async( - execute_model_req) - - else: - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. 
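(The `step_async` hunk above interleaves scheduling, asynchronous model execution, and output processing. A minimal sketch of that one-iteration pattern — `scheduler` and `executor` here are illustrative placeholders, not vLLM's actual API:

from typing import Any

async def engine_step(scheduler: Any, executor: Any) -> list[Any]:
    # 1. Pick the sequences to run next and the KV-cache blocks to
    #    swap in/out or copy.
    batch, block_ops = scheduler.schedule()
    if not batch:
        return []
    # 2. Run the forward pass without blocking the event loop.
    raw_outputs = await executor.execute_model_async(batch, block_ops)
    # 3. Detokenize, update sequence state, and collect finished requests.
    return scheduler.process_outputs(raw_outputs)
)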
- is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len( - outputs - ) == 1, "Async postprocessor expects only a single output set" - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - return ctx.request_outputs - - async def stop_remote_worker_execution_loop_async(self) -> None: - """Stop the remote worker execution loop.""" - await self.model_executor.stop_remote_worker_execution_loop_async() - - async def get_tokenizer_async(self) -> AnyTokenizer: - return self.get_tokenizer() - - async def add_request_async( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> None: - """ - Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - if arrival_time is None: - arrival_time = time.time() - - if data_parallel_rank is not None: - raise ValueError("Targeting data_parallel_rank only supported " - "in v1 client.") - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - # We use the -2 dimension (instead of 0) in case a batched input - # of batch size 1 is passed in. - prompt["prompt_token_ids"] = [0 - ] * prompt["prompt_embeds"].shape[-2] - - processed_inputs = await self.input_preprocessor.preprocess_async( - prompt, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async def check_health_async(self) -> None: - self.model_executor.check_health() - - async def collective_rpc_async(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - raise NotImplementedError - - -class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. - - This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to - make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. 
The [`LLMEngine`][vllm.LLMEngine] is kicked - by the generate method when there are requests in the waiting queue. The - generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] - to the caller. - - Args: - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. - """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - *args: Any, - log_requests: bool = True, - start_engine_loop: bool = True, - **kwargs: Any) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.log_requests = log_requests - self.engine = self._engine_class(*args, **kwargs) - - # This ensures quick processing of request outputs - # so the append to asyncio queues is not delayed, - # especially for multi-step. - self.use_process_request_outputs_callback = ( - self.engine.model_config.use_async_output_proc) - - if self.use_process_request_outputs_callback: - self.engine.process_request_outputs_callback = \ - weak_bind(self.process_request_outputs) - - self.background_loop: Optional[asyncio.Future] = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded: Optional[asyncio.Task] = None - self.start_engine_loop = start_engine_loop - self._errored_with: Optional[BaseException] = None - - # Lazy initialized fields - self._request_tracker: RequestTracker - - def __del__(self): - if rt := getattr(self, "request_tracker", None): - # Wake up engine loop so that it will exit cleanly - rt.new_requests_event.set() - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - return LLMEngine._get_executor_cls(engine_config) - - @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. 
" - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "AsyncLLMEngine": - """Create an AsyncLLMEngine from the EngineArgs.""" - - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - start_engine_loop=start_engine_loop, - log_requests=enable_log_requests, - log_stats=not disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - - async_engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine - async_engine_cls = V1AsyncLLMEngine - - return async_engine_cls.from_vllm_config( - vllm_config=vllm_config, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - enable_log_requests=engine_args.enable_log_requests, - ) - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and self._background_loop_unshielded is not None - and not self._background_loop_unshielded.done()) - - @property - def is_stopped(self) -> bool: - return self.errored or (self.background_loop is not None and - self._background_loop_unshielded is not None - and self._background_loop_unshielded.done()) - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - def set_errored(self, exc: Exception) -> None: - self._errored_with = exc - - def _error_callback(self, exc: Exception) -> None: - self.set_errored(exc) - self._request_tracker.propagate_exception(exc) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.engine.input_preprocessor - - async def get_tokenizer(self) -> AnyTokenizer: - return self.engine.get_tokenizer() - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.errored: - raise AsyncEngineDeadError( - "Background loop has errored already.") from self._errored_with - if self.is_running: - raise RuntimeError("Background loop is already running.") - # Initialize the RequestTracker here so it uses the right event loop. - self._request_tracker = RequestTracker() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop(weakref.ref(self))) - self._background_loop_unshielded.add_done_callback( - partial(_log_task_completion, error_callback=self._error_callback)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def shutdown_background_loop(self) -> None: - """ - Shut down the background loop. 
- - This method needs to be called during cleanup to remove - references to `self` and properly GC the resources held - by the async LLM engine (e.g., the executors as well as - their resources). - """ - if self._background_loop_unshielded is not None: - self._background_loop_unshielded.cancel() - self._background_loop_unshielded = None - self.background_loop = None - - async def engine_step(self, virtual_engine: int) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, aborted_requests = ( - self._request_tracker.get_new_and_aborted_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - try: - await self.engine.add_request_async(**new_request) - except ValueError as e: - # TODO: use a vLLM specific error for failed validation - self._request_tracker.process_exception( - new_request["request_id"], - e, - verbose=self.log_requests, - ) - - if aborted_requests: - await self._engine_abort(aborted_requests) - - request_outputs = await self.engine.step_async(virtual_engine) - - # Put the outputs into the corresponding streams. - # If used as a callback, then already invoked inside - # LLMEngine's _process_model_outputs - if not self.use_process_request_outputs_callback: - all_finished = self.process_request_outputs(request_outputs) - else: - # For callback case, we only need to detect when all - # requests are finished - all_finished = all(request_output.finished - for request_output in request_outputs) - - return not all_finished - - def process_request_outputs(self, request_outputs) -> bool: - # Put the outputs into the corresponding streams. - all_finished = True - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - all_finished = all_finished and request_output.finished - - return all_finished - - async def _engine_abort(self, request_ids: Iterable[str]): - self.engine.abort_request(request_ids) - - @staticmethod - async def run_engine_loop(engine_ref: ReferenceType): - """We use a weakref to the engine so that the running loop - doesn't prevent the engine being garbage collected.""" - engine: Optional[AsyncLLMEngine] = engine_ref() - if not engine: - return - - pipeline_parallel_size = \ - engine.engine.parallel_config.pipeline_parallel_size - has_requests_in_progress = [False] * pipeline_parallel_size - while True: - if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") - # Stop the execute model loop in parallel workers until there - # are more requests to process. This avoids waiting - # indefinitely in torch.distributed ops which may otherwise - # time out, and unblocks the RPC thread in the workers so that - # they can process any other queued control plane messages, - # such as add/remove lora adapters. - await engine.engine.stop_remote_worker_execution_loop_async() - request_tracker = engine._request_tracker - # Allow engine to be garbage collected while - # waiting for new requests - del engine - await asyncio.sleep(0) - if engine_ref() is None: - return - await request_tracker.wait_for_new_requests() - engine = engine_ref() - if not engine: - return - logger.debug("Got new requests!") - requests_in_progress = [ - asyncio.create_task(engine.engine_step(ve)) - for ve in range(pipeline_parallel_size) - ] - has_requests_in_progress = [True] * pipeline_parallel_size - - # Abort if iteration takes too long due to unrecoverable errors - # (eg. 
NCCL timeouts). - try: - async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): - done, _ = await asyncio.wait( - requests_in_progress, - return_when=asyncio.FIRST_COMPLETED) - for _ in range(pipeline_parallel_size): - await asyncio.sleep(0) - for task in done: - result = task.result() - virtual_engine = requests_in_progress.index(task) - has_unfinished_requests = ( - engine.engine. - has_unfinished_requests_for_virtual_engine( - virtual_engine)) - if result or has_unfinished_requests: - requests_in_progress[virtual_engine] = ( - asyncio.create_task( - engine.engine_step(virtual_engine))) - has_requests_in_progress[virtual_engine] = True - else: - has_requests_in_progress[virtual_engine] = False - except asyncio.TimeoutError as exc: - logger.error( - "Engine iteration timed out. This should never happen!") - engine.set_errored(exc) - raise - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[RequestOutput, None]: - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if (priority != 0 - and not self.engine.scheduler_config.policy == "priority"): - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - stream = self._request_tracker.add_request( - request_id, - verbose=self.log_requests, - prompt=prompt, - params=params, - arrival_time=arrival_time or time.time(), - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - tokenization_kwargs=tokenization_kwargs, - ) - - return stream.generator() - - async def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - data_parallel_rank: The (global) data parallel rank that must - handle this request. Only applicable if DP is enabled. - Yields: - The output `RequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] - to process the waiting requests. 
- - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... - """ - try: - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - ): - yield LLMEngine.validate_output(output, RequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - raise NotImplementedError( - "Pooling models are not supported in vLLM V0") - - async def abort(self, request_id: Union[str, Iterable[str]]) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not isinstance(request_id, str): - raise RuntimeError("Only single-request abort supported in" - " deprecated V0") - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. 
- """ - self._request_tracker.abort_request(request_id, - exception=asyncio.CancelledError, - verbose=self.log_requests) - - async def get_vllm_config(self) -> VllmConfig: - """Get the vllm configuration of the vLLM engine.""" - return self.engine.get_vllm_config() - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - return self.engine.get_model_config() - - async def get_parallel_config(self) -> ParallelConfig: - """Get the parallel configuration of the vLLM engine.""" - return self.engine.get_parallel_config() - - async def get_scheduler_config(self) -> SchedulerConfig: - """Get the scheduling configuration of the vLLM engine.""" - return self.engine.get_scheduler_config() - - async def get_lora_config(self) -> LoRAConfig: - """Get the lora configuration of the vLLM engine.""" - return self.engine.get_lora_config() - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: - self.engine.do_log_stats() - - async def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - t = time.perf_counter() - logger.debug("Starting health check...") - if self.is_stopped: - raise AsyncEngineDeadError("Background loop is stopped.") - - await self.engine.check_health_async() - logger.debug("Health check took %fs", time.perf_counter() - t) - - async def is_tracing_enabled(self) -> bool: - return self.engine.is_tracing_enabled() - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - self.engine.add_logger(logger_name=logger_name, logger=logger) - - def remove_logger(self, logger_name: str) -> None: - self.engine.remove_logger(logger_name=logger_name) - - async def start_profile(self) -> None: - self.engine.start_profile() - - async def stop_profile(self) -> None: - self.engine.stop_profile() - - async def reset_mm_cache(self) -> None: - self.engine.reset_mm_cache() - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - self.engine.reset_prefix_cache(device) - - async def sleep(self, level: int = 1) -> None: - await self.reset_prefix_cache() - self.engine.sleep(level) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - async def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - async def add_lora(self, lora_request: LoRARequest) -> bool: - return self.engine.add_lora(lora_request) - - async def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - """ - Perform a collective RPC call to the given path. - """ - return await self.engine.collective_rpc_async(method, timeout, args, - kwargs) - - -# TODO(v1): Remove this class proxy when V1 goes default. 
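(The proxy below conditionally rebinds the V0 class name to the V1 implementation at import time. A minimal sketch of this env-gated alias pattern, using toy names rather than vLLM's:

import os

class LegacyEngine:
    """V0-style implementation kept as a fallback."""

class NewEngine:
    """V1-style replacement."""

# Rebinding the public name keeps `from pkg import Engine` working for
# downstream callers while transparently switching implementations.
Engine = NewEngine if os.environ.get("USE_V1", "1") == "1" else LegacyEngine

Because the alias is resolved once at import, the hunk that follows can simply collapse the conditional into an unconditional assignment when the V0 branch is removed.)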
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM - - AsyncLLMEngine = AsyncLLM # type: ignore +AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index c3195dbc4697f..8b2acedf805c1 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -11,7 +11,6 @@ import uvicorn from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.protocol import EngineClient from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) @@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: """ @app.exception_handler(RuntimeError) - @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(EngineDeadError) @app.exception_handler(EngineGenerateError) async def runtime_exception_handler(request: Request, __): diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 912e664120929..11031cd616d20 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -38,7 +38,6 @@ from typing_extensions import assert_never import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (load_chat_template, resolve_hf_chat_template, @@ -201,50 +200,34 @@ async def build_async_engine_client_from_engine_args( vllm_config = engine_args.create_engine_config(usage_context=usage_context) # V1 AsyncLLM. - if envs.VLLM_USE_V1: - if disable_frontend_multiprocessing: - logger.warning( - "V1 is enabled, but got --disable-frontend-multiprocessing. " - "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + assert envs.VLLM_USE_V1 - from vllm.v1.engine.async_llm import AsyncLLM - async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 - client_index = client_config.pop( - "client_index") if client_config else 0 - try: - async_llm = AsyncLLM.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - client_addresses=client_config, - client_count=client_count, - client_index=client_index) + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. " + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") - # Don't keep the dummy data in memory - await async_llm.reset_mm_cache() + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop("client_count") if client_config else 1 + client_index = client_config.pop("client_index") if client_config else 0 + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + enable_log_requests=engine_args.enable_log_requests, + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_count=client_count, + client_index=client_index) - yield async_llm - finally: - if async_llm: - async_llm.shutdown() + # Don't keep the dummy data in memory + await async_llm.reset_mm_cache() - # V0 AsyncLLM. 
- else: - - engine_client: Optional[EngineClient] = None - try: - engine_client = AsyncLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats) - yield engine_client - finally: - if engine_client and hasattr(engine_client, "shutdown"): - engine_client.shutdown() + yield async_llm + finally: + if async_llm: + async_llm.shutdown() async def validate_json_request(raw_request: Request): From 064cac7bb7251862a841d8057d83581350edf837 Mon Sep 17 00:00:00 2001 From: Nikhil Gupta <nikhil.gupta2@arm.com> Date: Thu, 18 Sep 2025 19:15:23 +0100 Subject: [PATCH 441/584] [fix]: remove data type hardcoding from gptoss model implementation (#23807) Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com> --- vllm/model_executor/models/gpt_oss.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 990a1d6d883a1..b49fd0d8f88af 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -76,7 +76,6 @@ class OAIAttention(nn.Module): self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, - dtype=torch.bfloat16, requires_grad=False)) self.q_size = self.num_attention_heads * self.head_dim // tp_size @@ -145,8 +144,7 @@ class MLPBlock(torch.nn.Module): self.experts_per_token = config.num_experts_per_tok self.world_size = dist.get_world_size() if dist.is_initialized() else 1 self.router = torch.nn.Linear(config.hidden_size, - config.num_local_experts, - dtype=torch.bfloat16) + config.num_local_experts) assert config.intermediate_size % self.world_size == 0 self.experts = FusedMoE(num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, From 38db529f66712502a3cf93488229fc9fd2dc76fc Mon Sep 17 00:00:00 2001 From: Aziz <azizbenothman76@gmail.com> Date: Thu, 18 Sep 2025 21:18:56 +0200 Subject: [PATCH 442/584] [feat]: Create interface for model-specific M-RoPE (#24194) Signed-off-by: AzizCode92 <azizbenothman76@gmail.com> Signed-off-by: Aziz <azizbenothman76@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- vllm/model_executor/models/__init__.py | 11 ++- vllm/model_executor/models/interfaces.py | 68 +++++++++++++ vllm/model_executor/models/qwen2_vl.py | 118 ++++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 33 +++++-- vllm/worker/model_runner.py | 42 +++++--- 5 files changed, 242 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index d3ee6872dd8bf..4ccba64f2c110 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, - SupportsPP, SupportsTranscription, SupportsV0Only, - has_inner_state, supports_lora, supports_multimodal, - supports_pp, supports_transcription, supports_v0_only) +from .interfaces import (HasInnerState, SupportsLoRA, SupportsMRoPE, + SupportsMultiModal, SupportsPP, SupportsTranscription, + SupportsV0Only, has_inner_state, supports_lora, + supports_mrope, supports_multimodal, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import 
(VllmModelForPooling, VllmModelForTextGeneration, is_pooling_model, is_text_generation_model) from .registry import ModelRegistry @@ -21,6 +22,8 @@ __all__ = [ "supports_lora", "SupportsMultiModal", "supports_multimodal", + "SupportsMRoPE", + "supports_mrope", "SupportsPP", "supports_pp", "SupportsTranscription", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f8e300c84d71..e9c600e36cfa7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, import numpy as np import torch from torch import Tensor +from transformers import PretrainedConfig from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs @@ -852,3 +853,70 @@ def supports_eagle3( model: Union[type[object], object], ) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]: return isinstance(model, SupportsEagle3) + + +@runtime_checkable +class SupportsMRoPE(Protocol): + """The interface required for all models that support M-RoPE.""" + + supports_mrope: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports M-RoPE. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """ + Get M-RoPE input positions and delta value for this specific model. + + This method should be implemented by each model that supports M-RoPE + to provide model-specific logic for computing input positions. + + Args: + input_tokens: List of input token IDs + hf_config: HuggingFace model configuration + image_grid_thw: Image grid dimensions (t, h, w) + video_grid_thw: Video grid dimensions (t, h, w) + second_per_grid_ts: Seconds per grid timestep for videos + context_len: Context length + seq_len: Sequence length + audio_feature_lengths: Audio feature lengths for multimodal models + use_audio_in_video: Whether to use audio in video for interleaving + + Returns: + Tuple of (llm_positions, mrope_position_delta) + - llm_positions: Tensor of shape [3, num_tokens] + with T/H/W positions + - mrope_position_delta: Delta for position calculations + """ + ... + + +@overload +def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]: + ... + + +@overload +def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]: + ... 
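(To make the `[3, num_tokens]` T/H/W layout described in the docstring above concrete, here is a small self-contained PyTorch sketch — detached from vLLM and ignoring the temporal scaling by `tokens_per_second` — for three text tokens followed by one vision block with grid t=1, h=2, w=2:

import torch

t, h, w = 1, 2, 2   # vision grid after spatial merging
text_len = 3        # text tokens preceding the vision block

# Text tokens advance all three axes in lockstep.
text_pos = torch.arange(text_len).view(1, -1).expand(3, -1)

# Vision tokens get independent temporal/height/width coordinates,
# offset so they continue right after the text positions.
t_idx = torch.arange(t).view(-1, 1).expand(-1, h * w).flatten()
h_idx = torch.arange(h).view(1, -1, 1).expand(t, -1, w).flatten()
w_idx = torch.arange(w).view(1, 1, -1).expand(t, h, -1).flatten()
vision_pos = torch.stack([t_idx, h_idx, w_idx]) + text_len

print(torch.cat([text_pos, vision_pos], dim=1))
# tensor([[0, 1, 2, 3, 3, 3, 3],
#         [0, 1, 2, 3, 3, 4, 4],
#         [0, 1, 2, 3, 4, 3, 4]])

The per-axis divergence on vision tokens is what lets rotary embeddings encode spatial structure, while plain text keeps ordinary 1-D RoPE behavior.)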
+ + +def supports_mrope( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]: + return isinstance(model, SupportsMRoPE) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b6576b783b64a..7f361678ba72e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,7 +32,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoConfig, BatchFeature +from transformers import AutoConfig, BatchFeature, PretrainedConfig from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -73,7 +73,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import (MultiModalEmbeddings, SupportsLoRA, +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, @@ -1096,7 +1096,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] info=Qwen2VLProcessingInfo, dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): + SupportsLoRA, SupportsPP, SupportsMRoPE): # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( @@ -1109,6 +1109,118 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get M-RoPE input positions for Qwen2-VL model.""" + if image_grid_thw is None: + image_grid_thw = [] + if video_grid_thw is None: + video_grid_thw = [] + if second_per_grid_ts is None: + second_per_grid_ts = [] + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_videos > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + 
except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[video_index] + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4873b586724ec..053e8f0537ed9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import (is_mixture_of_experts, supports_eagle3, + supports_mrope, supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) @@ -730,16 +731,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - req_state.mrope_positions, req_state.mrope_position_delta = \ - MRotaryEmbedding.get_input_positions_tensor( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.model): + req_state.mrope_positions, req_state.mrope_position_delta = \ + self.model.get_mrope_input_positions( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + 
use_audio_in_video=use_audio_in_video, + ) + else: + req_state.mrope_positions, req_state.mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) def _extract_mm_kwargs( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 88f83c9dd7e6c..594382650e3c1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -41,7 +41,8 @@ from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, get_sampler) from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.model_executor.models import (supports_lora, supports_mrope, + supports_multimodal) from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, @@ -670,18 +671,33 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): inter_data.seq_ids[seq_idx]] token_ids = seq_data.get_token_ids() - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.runner.model): + mrope_input_positions, mrope_position_delta = \ + self.runner.model.get_mrope_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + mrope_input_positions = mrope_input_positions.tolist() + else: + mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) seq_data.mrope_position_delta = mrope_position_delta inter_data.mrope_input_positions[ From 75fb112d80f680624dc99a00e02be6a45661f948 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:32:24 -0400 Subject: [PATCH 443/584] [Bug] Fix `returned_lse` not Defined issue (#25106) Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> --- vllm/v1/attention/backends/mla/cutlass_mla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 21be17a750df4..ae534f3207b51 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ 
b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -206,12 +206,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) if H < MAX_HEADS: - # Extract the subsets of the outputs - returned_lse = lse[:, :H].contiguous( - ) if self.need_to_return_lse_for_decode else lse out = out[:, :H] + if self.need_to_return_lse_for_decode: + lse = lse[:, :H].contiguous() - return out, returned_lse + return out, lse def _forward_decode( self, From d2a30a2d933226d3951ad98cb5de0c74e2e64826 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:38:37 -0400 Subject: [PATCH 444/584] [Bug] Fix torch Compilation Cache Hit Error (#25093) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/config/compilation.py | 12 ------------ vllm/platforms/cuda.py | 17 ++++++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f8ccc20222615..3618f472e742d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -563,18 +563,6 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": - # exclude MoE dispatch/combine from capture by ensuring - # piecewise splitting includes them, so communication remains - # outside CUDA graphs while compute can still be graphed. - moe_ops = [ - "vllm.moe_forward", - "vllm.moe_forward_shared", - ] - for op in moe_ops: - if op not in self.splitting_ops: - self.splitting_ops.append(op) - def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8e3436a9e73c5..87d8f2b7481bb 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -191,14 +191,17 @@ class CudaPlatformBase(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 - and compilation_config.cudagraph_mode - not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]): + and compilation_config.cudagraph_mode != CUDAGraphMode.NONE): + # TODO: Piecewise Cuda graph might be enabled + # if torch compile cache key issue fixed + # See https://github.com/vllm-project/vllm/pull/25093 logger.info( - "Data Parallel with DeepEP high-throughput: using PIECEWISE " - "CUDA graphs and excluding MoE ops from capture. Set " - "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE " - "graphs captured as well.") - compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + "Data Parallel: disabling cudagraphs since DP " + "with DeepEP high-throughput kernels are not CUDA Graph " + "compatible. The DeepEP low-latency kernels are CUDA Graph " + "compatible. 
Set the all_to_all backend to deepep_low_latency " + "to use those kernels instead.") + compilation_config.cudagraph_mode = CUDAGraphMode.NONE @classmethod def get_current_memory_usage(cls, From 1c3dad22ff92cbf84e0fa8ad1643c560a07944ea Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Thu, 18 Sep 2025 13:35:21 -0700 Subject: [PATCH 445/584] [V0 Deprecation] Remove unused async_timeout.py (#25190) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/engine/async_timeout.py | 173 ----------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 vllm/engine/async_timeout.py diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py deleted file mode 100644 index 3b9c055160c1b..0000000000000 --- a/vllm/engine/async_timeout.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Workaround for https://github.com/python/cpython/issues/86296 -# -# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py -# Licensed under the Apache License (Apache-2.0) - -import asyncio -import enum -import sys -from types import TracebackType -from typing import Any, Optional, Type - -if sys.version_info[:2] >= (3, 11): - from asyncio import timeout as asyncio_timeout -else: - - class _State(enum.Enum): - INIT = "INIT" - ENTER = "ENTER" - TIMEOUT = "TIMEOUT" - EXIT = "EXIT" - - class Timeout: - # Internal class, please don't instantiate it directly - # Use timeout() and timeout_at() public factories instead. - # - # Implementation note: `async with timeout()` is preferred - # over `with timeout()`. - # While technically the Timeout class implementation - # doesn't need to be async at all, - # the `async with` statement explicitly points that - # the context manager should be used from async function context. - # - # This design allows to avoid many silly misusages. - # - # TimeoutError is raised immediately when scheduled - # if the deadline is passed. - # The purpose is to time out as soon as possible - # without waiting for the next await expression. - - __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") - - def __init__(self, deadline: Optional[float], - loop: asyncio.AbstractEventLoop) -> None: - self._loop = loop - self._state = _State.INIT - - self._timeout_handler = None # type: Optional[asyncio.Handle] - if deadline is None: - self._deadline = None # type: Optional[float] - else: - self.update(deadline) - - async def __aenter__(self) -> "Timeout": - self._do_enter() - return self - - async def __aexit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - - @property - def expired(self) -> bool: - """Is timeout expired during execution?""" - return self._state == _State.TIMEOUT - - @property - def deadline(self) -> Optional[float]: - return self._deadline - - def reject(self) -> None: - """Reject scheduled timeout if any.""" - # cancel is maybe better name but - # task.cancel() raises CancelledError in asyncio world. - if self._state not in (_State.INIT, _State.ENTER): - raise RuntimeError(f"invalid state {self._state.value}") - self._reject() - - def _reject(self) -> None: - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._timeout_handler = None - - def shift(self, delay: float) -> None: - """Advance timeout on delay seconds. 
- The delay can be negative. - Raise RuntimeError if shift is called when deadline is not scheduled - """ - deadline = self._deadline - if deadline is None: - raise RuntimeError( - "cannot shift timeout if deadline is not scheduled") - self.update(deadline + delay) - - def update(self, deadline: float) -> None: - """Set deadline to absolute value. - deadline argument points on the time in the same clock system - as loop.time(). - If new deadline is in the past the timeout is raised immediately. - Please note: it is not POSIX time but a time with - undefined starting base, e.g. the time of the system power on. - """ - if self._state == _State.EXIT: - raise RuntimeError( - "cannot reschedule after exit from context manager") - if self._state == _State.TIMEOUT: - raise RuntimeError("cannot reschedule expired timeout") - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._deadline = deadline - if self._state != _State.INIT: - self._reschedule() - - def _reschedule(self) -> None: - assert self._state == _State.ENTER - deadline = self._deadline - if deadline is None: - return - - now = self._loop.time() - if self._timeout_handler is not None: - self._timeout_handler.cancel() - - task = asyncio.current_task() - if deadline <= now: - self._timeout_handler = self._loop.call_soon( - self._on_timeout, task) - else: - self._timeout_handler = self._loop.call_at( - deadline, self._on_timeout, task) - - def _do_enter(self) -> None: - if self._state != _State.INIT: - raise RuntimeError(f"invalid state {self._state.value}") - self._state = _State.ENTER - self._reschedule() - - def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: - if exc_type is asyncio.CancelledError and \ - self._state == _State.TIMEOUT: - self._timeout_handler = None - raise asyncio.TimeoutError - # timeout has not expired - self._state = _State.EXIT - self._reject() - return None - - def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: - if task: - task.cancel() - self._state = _State.TIMEOUT - # drop the reference early - self._timeout_handler = None - - def asyncio_timeout(delay: Optional[float]) -> Timeout: - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... 
await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) From a53ad626d629e79264f0a6ab6820a4b547f3b1c4 Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Thu, 18 Sep 2025 23:53:52 +0300 Subject: [PATCH 446/584] [KV offload][1b/N] rename offloading to kv_offload (#25191) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/{offloading => kv_offload}/test_worker.py | 4 ++-- vllm/v1/{offloading => kv_offload}/abstract.py | 0 vllm/v1/{offloading => kv_offload}/mediums.py | 2 +- vllm/v1/{offloading => kv_offload}/worker/worker.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/v1/{offloading => kv_offload}/test_worker.py (97%) rename vllm/v1/{offloading => kv_offload}/abstract.py (100%) rename vllm/v1/{offloading => kv_offload}/mediums.py (93%) rename vllm/v1/{offloading => kv_offload}/worker/worker.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fd08296625ad..c42ec4f2503d0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -280,7 +280,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/executor - - pytest -v -s v1/offloading + - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/kv_offload/test_worker.py similarity index 97% rename from tests/v1/offloading/test_worker.py rename to tests/v1/kv_offload/test_worker.py index 2391b565773aa..6cf8aa0875d62 100644 --- a/tests/v1/offloading/test_worker.py +++ b/tests/v1/kv_offload/test_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.v1.offloading.abstract import LoadStoreSpec -from vllm.v1.offloading.worker.worker import (OffloadingHandler, +from vllm.v1.kv_offload.abstract import LoadStoreSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, OffloadingWorker, TransferResult, TransferSpec) diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/kv_offload/abstract.py similarity index 100% rename from vllm/v1/offloading/abstract.py rename to vllm/v1/kv_offload/abstract.py diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/kv_offload/mediums.py similarity index 93% rename from vllm/v1/offloading/mediums.py rename to vllm/v1/kv_offload/mediums.py index 5a1887848c9fc..8962819178459 100644 --- a/vllm/v1/offloading/mediums.py +++ b/vllm/v1/kv_offload/mediums.py @@ -4,7 +4,7 @@ from abc import ABC import numpy as np -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC): diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py similarity index 98% rename from vllm/v1/offloading/worker/worker.py rename to vllm/v1/kv_offload/worker/worker.py index d2c2045d1f1f6..b7a52a088fb90 100644 --- a/vllm/v1/offloading/worker/worker.py +++ b/vllm/v1/kv_offload/worker/worker.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from vllm.logger import init_logger -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec # a single transfer spec (src_blocks_spec, dst_blocks_spec) TransferSpec = tuple[LoadStoreSpec, 
LoadStoreSpec] From 9fac6aa30b669de75d8718164cd99676d3530e7d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:26:28 -0400 Subject: [PATCH 447/584] [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> --- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index a636a714145cf..4d1829cd228cd 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -36,7 +36,7 @@ def _extract_data_from_linear_base_module( assert m.quant_method.quant_config is not None w = m.weight - ws = m.weight_scale_inv + ws = m.weight_scale quant_block_size = m.quant_method.quant_config.weight_block_size assert isinstance(w, torch.Tensor) From 9a4600e4dcbbd13988c31d5198d3ab8b4172ecca Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Thu, 18 Sep 2025 19:03:09 -0500 Subject: [PATCH 448/584] [CORE] Prompt Embeddings Support for v1 Engine (#24278) Signed-off-by: Andrew Sansom <andrew@protopia.ai> Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- .../test_basic_correctness.py | 10 -- .../test_completion_with_prompt_embeds.py | 1 - .../models/language/generation/test_common.py | 6 -- vllm/engine/arg_utils.py | 24 +++-- vllm/entrypoints/openai/protocol.py | 2 +- vllm/utils/__init__.py | 27 +++++ vllm/v1/core/sched/output.py | 24 +++-- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/detokenizer.py | 33 ++++--- vllm/v1/engine/output_processor.py | 25 ++++- vllm/v1/engine/processor.py | 38 +++++-- vllm/v1/request.py | 15 ++- vllm/v1/sample/logits_processor/__init__.py | 2 +- vllm/v1/sample/logits_processor/builtin.py | 6 +- vllm/v1/sample/logits_processor/interface.py | 2 +- vllm/v1/serial_utils.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 55 +++++++++-- vllm/v1/worker/gpu_model_runner.py | 99 ++++++++++++++++++- vllm/v1/worker/tpu_input_batch.py | 6 +- vllm/v1/worker/tpu_model_runner.py | 1 + 20 files changed, 305 insertions(+), 76 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index fba18f197074b..24b1c9a93126c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -76,11 +76,6 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - - if enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: - pytest.skip("enable_prompt_embeds is not supported in v1.") - if not envs.VLLM_USE_V1: if async_scheduling: pytest.skip("async_scheduling only supported in v1.") @@ -164,11 +159,6 @@ def test_models_distributed( extra_env: dict[str, str], enable_prompt_embeds: bool, ) -> None: - - if enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: - pytest.skip("enable_prompt_embeds is not supported in v1.") - if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 3d56291bc793c..0e3fc82f0c033 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ 
-36,7 +36,6 @@ def default_server_args() -> list[str]: "--enforce-eager", # Prompt Embeds server args "--enable-prompt-embeds", - "--no-enable-chunked-prefill", ] diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index a5aa1e3f49743..c14e71cbdb96d 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -125,12 +125,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") - # Note: can be removed when - # https://github.com/vllm-project/vllm/pull/24278 finished - if current_platform.is_cpu() and use_prompt_embeds: - pytest.skip("Skipping use_prompt_embeds=True with " - "V1-only CPU backend.") - with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fb5beab77b270..63282c4253509 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1513,12 +1513,6 @@ class EngineArgs: recommend_to_remove=False) return False - # No text embedding inputs so far. - if self.enable_prompt_embeds: - _raise_or_fallback(feature_name="--enable-prompt-embeds", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, @@ -1651,6 +1645,13 @@ class EngineArgs: "models in V0 and has been disabled.") self.enable_prefix_caching = False + if self.enable_prompt_embeds: + logger.warning( + "--enable-prompt-embeds and --enable-prefix-caching " + "are not supported together in V0. Prefix caching has " + "been disabled.") + self.enable_prefix_caching = False + # Set max_num_seqs to 256 for VLLM_V0. if self.max_num_seqs is None: self.max_num_seqs = 256 @@ -1664,6 +1665,17 @@ class EngineArgs: # For pooling tasks the default is False if model_config.runner_type != "pooling": self.enable_chunked_prefill = True + + # TODO: When prefix caching supports prompt embeds inputs, this + # check can be removed. + if (self.enable_prompt_embeds + and self.enable_prefix_caching is not False): + logger.warning( + "--enable-prompt-embeds and --enable-prefix-caching " + "are not supported together in V1. 
Prefix caching has " + "been disabled.") + self.enable_prefix_caching = False + if self.enable_prefix_caching is None: self.enable_prefix_caching = True else: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 7ad8e73d89d59..6b54511a66f33 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -973,7 +973,6 @@ class CompletionRequest(OpenAIBaseModel): # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = None prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None - prompt_embeds: Optional[Union[bytes, list[bytes]]] = None best_of: Optional[int] = None echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 @@ -1009,6 +1008,7 @@ class CompletionRequest(OpenAIBaseModel): # --8<-- [end:completion-sampling-params] # --8<-- [start:completion-extra-params] + prompt_embeds: Optional[Union[bytes, list[bytes]]] = None add_special_tokens: bool = Field( default=True, description=( diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index f13381ecd9ff3..d4013a69e99fe 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3443,3 +3443,30 @@ def decorate_logs(process_name: Optional[str] = None) -> None: pid = os.getpid() _add_prefix(sys.stdout, process_name, pid) _add_prefix(sys.stderr, process_name, pid) + + +def length_from_prompt_token_ids_or_embeds( + prompt_token_ids: Optional[list[int]], + prompt_embeds: Optional[torch.Tensor], +) -> int: + """Calculate the request length (in number of tokens) give either + prompt_token_ids or prompt_embeds. + """ + prompt_token_len = None if prompt_token_ids is None else len( + prompt_token_ids) + prompt_embeds_len = \ + None if prompt_embeds is None else len(prompt_embeds) + + if prompt_token_len is None: + if prompt_embeds_len is None: + raise ValueError( + "Neither prompt_token_ids nor prompt_embeds were defined.") + return prompt_embeds_len + else: + if (prompt_embeds_len is not None + and prompt_embeds_len != prompt_token_len): + raise ValueError( + "Prompt token ids and prompt embeds had different lengths" + f" prompt_token_ids={prompt_token_len}" + f" prompt_embeds={prompt_embeds_len}") + return prompt_token_len diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 3ec5b91bf2860..209fc2a4404f3 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -11,6 +11,7 @@ from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: import numpy as np import numpy.typing as npt + import torch from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata) @@ -26,13 +27,14 @@ if TYPE_CHECKING: class NewRequestData: req_id: str - prompt_token_ids: list[int] + prompt_token_ids: Optional[list[int]] mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] block_ids: tuple[list[int], ...] 
    num_computed_tokens: int
    lora_request: Optional[LoRARequest]
+    prompt_embeds: Optional[torch.Tensor] = None

     @classmethod
     def from_request(
@@ -49,9 +51,12 @@ class NewRequestData:
             block_ids=block_ids,
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
+            prompt_embeds=request.prompt_embeds,
         )

-    def __repr__(self):
+    def __repr__(self) -> str:
+        prompt_embeds_shape = (self.prompt_embeds.shape
+                               if self.prompt_embeds is not None else None)
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
                 f"prompt_token_ids={self.prompt_token_ids},"
@@ -59,19 +64,26 @@
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")

     # Version of __repr__ with the prompt data obfuscated
-    def anon_repr(self):
+    def anon_repr(self) -> str:
+        prompt_token_ids_len = len(
+            self.prompt_token_ids
+        ) if self.prompt_token_ids is not None else None
+        prompt_embeds_shape = (self.prompt_embeds.shape
+                               if self.prompt_embeds is not None else None)
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
-                f"prompt_token_ids_len={len(self.prompt_token_ids)},"
+                f"prompt_token_ids_len={prompt_token_ids_len},"
                 f"mm_features={self.mm_features},"
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index dec4abec519bd..345f5a464c2cc 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -47,7 +47,7 @@ class EngineCoreRequest(
         gc=False):  # type: ignore[call-arg]

     request_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: Optional[list[MultiModalFeatureSpec]]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@@ -56,6 +56,7 @@ class EngineCoreRequest(
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
     data_parallel_rank: Optional[int]
+    prompt_embeds: Optional[torch.Tensor] = None

     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index cf4b06db843bd..8aa36d6a439c1 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -13,6 +13,7 @@
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest

 logger = init_logger(__name__)

@@ -179,11 +180,12 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):

         self.tokenizer: Tokenizer = tokenizer._tokenizer

         # Find a safe place to start.
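        # A note on the loop below: decoding a suffix that starts midway
        # through a multi-byte UTF-8 sequence yields U+FFFD ('�'), which would
        # corrupt incremental detokenization, so the code scans backwards for
        # a clean starting point. A minimal standalone sketch of the same
        # idea follows; `decode` here is a stand-in callable, not vLLM's API.
        def find_safe_suffix(ids: list[int], decode, lookback: int = 24) -> list[int]:
            # Return the shortest suffix (at least 4 tokens) whose decoding
            # contains no replacement character; fall back to the full prompt.
            for i in range(4, min(len(ids) + 1, lookback)):
                if '�' not in decode(ids[-i:]):
                    return ids[-i:]
            return ids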
-        prompt_suffix = request.prompt_token_ids
+        prompt_token_ids = request.prompt_token_ids or []
+        prompt_suffix = prompt_token_ids
         prompt_len = len(prompt_suffix)
         if prompt_len > 4:
             for i in range(4, min(prompt_len + 1, 24)):
-                suffix = request.prompt_token_ids[-i:]
+                suffix = prompt_token_ids[-i:]
                 if '�' not in self.tokenizer.decode(suffix):
                     prompt_suffix = suffix
                     break
@@ -260,16 +262,25 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
         params = request.sampling_params
         assert params is not None

-        # Metadata for incremental detokenization.
-        self.tokens, self.prefix_offset, self.read_offset = (
-            convert_prompt_ids_to_tokens(
-                tokenizer=tokenizer,
-                prompt_ids=request.prompt_token_ids,
-                skip_special_tokens=params.skip_special_tokens,
-            ))
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)

-        self.token_ids.extend(request.prompt_token_ids)
-        self.prompt_len = len(request.prompt_token_ids)
+        # Metadata for incremental detokenization.
+        if request.prompt_token_ids is not None:
+            self.tokens, self.prefix_offset, self.read_offset = (
+                convert_prompt_ids_to_tokens(
+                    tokenizer=tokenizer,
+                    prompt_ids=request.prompt_token_ids,
+                    skip_special_tokens=params.skip_special_tokens,
+                ))
+        else:
+            # Prompt embedding requests cannot be detokenized, in general.
+            self.tokens = [""] * self.prompt_len
+            self.prefix_offset = 0
+            self.read_offset = 0
+
+        self.token_ids.extend(request.prompt_token_ids
+                              or [0] * self.prompt_len)

         self.skip_special_tokens = params.skip_special_tokens
         self.spaces_between_special_tokens = (
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 5dad63988daa4..c17dc3e204ecd 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -14,6 +14,7 @@
 from vllm.sampling_params import RequestOutputKind
 from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
                           extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 from vllm.v1.engine.detokenizer import IncrementalDetokenizer
 from vllm.v1.engine.logprobs import LogprobsProcessor
@@ -86,7 +87,8 @@ class RequestState:
         lora_name: Optional[str],
         output_kind: RequestOutputKind,
         prompt: Optional[str],
-        prompt_token_ids: list[int],
+        prompt_token_ids: Optional[list[int]],
+        prompt_embeds: Optional[torch.Tensor],
         logprobs_processor: Optional[LogprobsProcessor],
         detokenizer: Optional[IncrementalDetokenizer],
         max_tokens_param: Optional[int],
@@ -104,7 +106,9 @@ class RequestState:
         self.output_kind = output_kind
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
-        self.prompt_len = len(prompt_token_ids)
+        self.prompt_embeds = prompt_embeds
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
@@ -165,6 +169,7 @@ class RequestState:
             output_kind=output_kind,
             prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
             logprobs_processor=logprobs_processor,
             detokenizer=detokenizer,
             max_tokens_param=max_tokens_param,
@@ -223,6 +228,8 @@ class RequestState:
         first_output = outputs[0]
         if isinstance(first_output, PoolingOutput):
             assert len(outputs) == 1
+            # Prompt embeddings are currently not supported by pooling requests.
+ assert self.prompt_token_ids is not None return PoolingRequestOutput( request_id=request_id, outputs=first_output, @@ -236,10 +243,15 @@ class RequestState: else: prompt_logprobs = self.logprobs_processor.prompt_logprobs + # If prompt embeds were used, put placeholder prompt token ids + prompt_token_ids = self.prompt_token_ids + if prompt_token_ids is None and self.prompt_embeds is not None: + prompt_token_ids = [0] * len(self.prompt_embeds) + return RequestOutput( request_id=request_id, prompt=self.prompt, - prompt_token_ids=self.prompt_token_ids, + prompt_token_ids=prompt_token_ids, prompt_logprobs=prompt_logprobs, outputs=cast(list[CompletionOutput], outputs), finished=finished, @@ -469,6 +481,8 @@ class OutputProcessor: arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9) trace_context = extract_trace_context(engine_core_output.trace_headers) + prompt_length = length_from_prompt_token_ids_or_embeds( + req_state.prompt_token_ids, req_state.prompt_embeds) with (self.tracer.start_as_current_span( "llm_request", kind=SpanKind.SERVER, @@ -488,7 +502,7 @@ class OutputProcessor: span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time) span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, - len(req_state.prompt_token_ids)) + prompt_length) span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, metrics.num_generation_tokens) span.set_attribute( @@ -544,7 +558,8 @@ class OutputProcessor: assert req_state.stats is not None iteration_stats.update_from_finished_request( finish_reason=finish_reason, - num_prompt_tokens=len(req_state.prompt_token_ids), + num_prompt_tokens=length_from_prompt_token_ids_or_embeds( + req_state.prompt_token_ids, req_state.prompt_embeds), max_tokens_param=req_state.max_tokens_param, req_stats=req_state.stats) self.lora_states.finish_request(req_state) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 71f539583a1be..507e2cd3223fd 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -19,6 +19,7 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) @@ -390,6 +391,16 @@ class Processor: self._validate_model_inputs(processed_inputs) encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) + # Mypy does not always properly infer the types of some elements of + # discriminated unions of TypedDicts, because of how it handles + # inheritance of TypedDict. If we explicitly extract the items we want + # we can avoid type errors from using `dict.get` later in the method. + prompt_str: Optional[str] = None if decoder_inputs[ + "type"] == "embeds" else decoder_inputs.get("prompt") + prompt_token_ids = decoder_inputs[ + "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None + prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[ + "type"] == "embeds" else None sampling_params = None pooling_params = None @@ -398,9 +409,10 @@ class Processor: sampling_params = params.clone() # If unset max tokens, then generate up to the max_model_len. 
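            # With prompt embeds, the sequence length comes from the
            # embedding tensor's first dimension rather than from a token-id
            # list, so the default budget below is computed uniformly for
            # both input types. Illustrative arithmetic (numbers assumed,
            # not from this patch): prompt_embeds of shape (7, hidden) gives
            # seq_len = 7, so max_tokens = 4096 - 7 = 4089 when
            # max_model_len is 4096.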
if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) + seq_len = length_from_prompt_token_ids_or_embeds( + prompt_token_ids, prompt_embeds) + sampling_params.max_tokens = \ + self.model_config.max_model_len - seq_len sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) if self.tokenizer is not None: @@ -430,9 +442,10 @@ class Processor: identifier=decoder_mm_hashes[modality][idx], mm_position=decoder_mm_positions[modality][idx])) - return decoder_inputs.get("prompt"), EngineCoreRequest( + return prompt_str, EngineCoreRequest( request_id=request_id, - prompt_token_ids=decoder_inputs["prompt_token_ids"], + prompt_token_ids=prompt_token_ids, + prompt_embeds=prompt_embeds, mm_features=mm_features, sampling_params=sampling_params, pooling_params=pooling_params, @@ -461,10 +474,17 @@ class Processor: ): model_config = self.model_config - prompt_ids = prompt_inputs["prompt_token_ids"] + prompt_ids = None if prompt_inputs[ + "type"] == "embeds" else prompt_inputs["prompt_token_ids"] + prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[ + "type"] == "embeds" else None + prompt_len = length_from_prompt_token_ids_or_embeds( + prompt_ids, prompt_embeds) if not prompt_ids: if prompt_type == "encoder" and model_config.is_multimodal_model: pass # Mllama may have empty encoder inputs for text-only data + elif prompt_inputs["type"] == "embeds": + pass # Prompt embeds should not have prompt_ids. else: raise ValueError(f"The {prompt_type} prompt cannot be empty") @@ -472,7 +492,7 @@ class Processor: tokenizer = None else: tokenizer = self.tokenizer - max_input_id = max(prompt_ids, default=0) + max_input_id = max(prompt_ids or [], default=0) # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while # self.model_config.get_vocab_size() is the model’s vocab size. @@ -490,7 +510,7 @@ class Processor: f"Token id {max_input_id} is out of vocabulary") max_prompt_len = self.model_config.max_model_len - if len(prompt_ids) > max_prompt_len: + if prompt_len > max_prompt_len: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( @@ -514,7 +534,7 @@ class Processor: "number of text tokens.") raise ValueError( - f"The {prompt_type} prompt (length {len(prompt_ids)}) is " + f"The {prompt_type} prompt (length {prompt_len}) is " f"longer than the maximum model length of {max_prompt_len}. 
" f"{suggestion}") diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 145af788d2372..ff10fa00c1cf6 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -7,9 +7,12 @@ from collections.abc import Mapping from functools import partial from typing import TYPE_CHECKING, Any, Callable, Optional, Union +import torch + from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, EngineCoreRequest, FinishReason) from vllm.v1.structured_output.request import StructuredOutputRequest @@ -25,12 +28,13 @@ class Request: def __init__( self, request_id: str, - prompt_token_ids: list[int], + prompt_token_ids: Optional[list[int]], sampling_params: Optional[SamplingParams], pooling_params: Optional[PoolingParams], eos_token_id: Optional[int], client_index: int = 0, arrival_time: Optional[float] = None, + prompt_embeds: Optional[torch.Tensor] = None, mm_features: Optional[list[MultiModalFeatureSpec]] = None, lora_request: Optional["LoRARequest"] = None, structured_output_request: Optional["StructuredOutputRequest"] = None, @@ -79,9 +83,13 @@ class Request: "sampling_params and pooling_params can't both be unset") self.prompt_token_ids = prompt_token_ids - self.num_prompt_tokens = len(self.prompt_token_ids) + self.prompt_embeds = prompt_embeds + self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + prompt_token_ids, prompt_embeds) self._output_token_ids: list[int] = [] - self._all_token_ids: list[int] = self.prompt_token_ids.copy() + self._all_token_ids: list[int] = self.prompt_token_ids.copy( + ) if self.prompt_token_ids is not None else [0 + ] * self.num_prompt_tokens self.num_output_placeholders = 0 # Used in async scheduling. 
self.spec_token_ids: list[int] = [] self.num_computed_tokens = 0 @@ -123,6 +131,7 @@ class Request: request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, + prompt_embeds=request.prompt_embeds, mm_features=request.mm_features, sampling_params=request.sampling_params, pooling_params=request.pooling_params, diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index df944873bcaf3..10cad5b530716 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -243,7 +243,7 @@ class AdapterLogitsProcessor(LogitsProcessor): def _new_state( self, params: SamplingParams, - prompt_ids: list[int], + prompt_ids: Optional[list[int]], output_ids: list[int], ) -> Optional[partial[torch.Tensor]]: """Return state representation for new request diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index 60f9c0bdb6313..fc655d993cb4c 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -187,7 +187,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): @staticmethod def add_request( - params: SamplingParams, _: list[int], output_tok_ids: list[int] + params: SamplingParams, _: Optional[list[int]], + output_tok_ids: list[int] ) -> Optional[tuple[int, Sequence[int], set[int]]]: min_tokens = params.min_tokens if not min_tokens or len(output_tok_ids) >= min_tokens: @@ -234,7 +235,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): def process_dict_updates( req_entries: dict[int, T], batch_update: Optional[BatchUpdate], - new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]] + new_state: Callable[[SamplingParams, Optional[list[int]], list[int]], + Optional[T]] ) -> bool: """Utility function to update dict state for sparse LogitsProcessors.""" diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 04027359909a6..a84afc2f347a0 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -26,7 +26,7 @@ RemovedRequest = int # (index, params, prompt_tok_ids, output_tok_ids) tuples for new # requests added to the batch. -AddedRequest = tuple[int, SamplingParams, list[int], list[int]] +AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]] # (index 1, index 2, directionality) tuples representing # one-way moves or two-way swaps of requests in batch diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index c8375d6f15517..50c1470c67edc 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -174,7 +174,7 @@ class MsgpackEncoder: ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None # view the tensor as a contiguous 1D array of bytes - arr = obj.flatten().contiguous().view(torch.uint8).numpy() + arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. 
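        # The .cpu() added above is what makes prompt-embeds tensors
        # serializable here: viewing a tensor's raw bytes through numpy
        # requires host memory, and embeds may arrive on an accelerator
        # device. Inline encoding below applies to small payloads: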
data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 6717622efb801..79a392337574f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType -from vllm.utils import swap_dict_values +from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, @@ -29,7 +29,7 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable class CachedRequestState: req_id: str - prompt_token_ids: list[int] + prompt_token_ids: Optional[list[int]] mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] @@ -43,9 +43,11 @@ class CachedRequestState: mrope_position_delta: Optional[int] = None lora_request: Optional[LoRARequest] = None + prompt_embeds: Optional[torch.Tensor] = None def __post_init__(self): - self.num_prompt_tokens = len(self.prompt_token_ids) + self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + self.prompt_token_ids, self.prompt_embeds) @property def num_tokens(self) -> int: @@ -63,6 +65,10 @@ class CachedRequestState: def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: + if self.prompt_token_ids is None: + raise ValueError( + f"Tried to access token index {idx}, but that token was " + "provided via prompt_embeds, and its ID is unknown.") return self.prompt_token_ids[idx] elif idx - self.num_prompt_tokens < len(self.output_token_ids): return self.output_token_ids[idx - self.num_prompt_tokens] @@ -109,6 +115,14 @@ class InputBatch: pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() + self.is_token_ids = torch.zeros((max_num_reqs, max_model_len), + device="cpu", + dtype=bool, + pin_memory=False) + # Store prompt embeddings per request to avoid OOM from large upfront + # allocation if max_model_len is big. + # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size) + self.req_prompt_embeds: dict[int, torch.Tensor] = {} self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) @@ -310,15 +324,23 @@ class InputBatch: self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. 
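        # Rough motivation for the dict-of-tensors design above, with purely
        # illustrative numbers: a dense (max_num_reqs, max_model_len,
        # hidden_size) buffer at 1024 x 8192 x 4096 in fp16 would pin
        # 1024 * 8192 * 4096 * 2 bytes = 64 GiB of host memory up front,
        # while the dict holds only the embeddings each request supplies.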
- num_prompt_tokens = len(request.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + request.prompt_token_ids, request.prompt_embeds) self.num_prompt_tokens[req_index] = num_prompt_tokens - self.token_ids_cpu[ - req_index, :num_prompt_tokens] = request.prompt_token_ids start_idx = num_prompt_tokens end_idx = start_idx + len(request.output_token_ids) + if request.prompt_token_ids is not None: + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + self.is_token_ids[req_index, :num_prompt_tokens] = True + else: + self.is_token_ids[req_index, :num_prompt_tokens] = False + if request.prompt_embeds is not None: + self.req_prompt_embeds[req_index] = request.prompt_embeds self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids - # Number of token ids in token_ids_cpu. + self.is_token_ids[req_index, start_idx:end_idx] = True + # Number of token ids in prompt (token_ids_cpu or prompt_embeds). # NOTE(woosuk): This may include spec decode tokens. self.num_tokens[req_index] = request.num_tokens # Number of tokens without spec decode tokens. @@ -503,6 +525,20 @@ class InputBatch: self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] self.token_ids_cpu[i2, ...] = tmp + self.is_token_ids[[i1, i2], ...] = self.is_token_ids[[i2, i1], ...] + + # Swap prompt embeddings if they exist + embeds_i1 = self.req_prompt_embeds.get(i1) + embeds_i2 = self.req_prompt_embeds.get(i2) + if embeds_i1 is not None: + self.req_prompt_embeds[i2] = embeds_i1 + else: + self.req_prompt_embeds.pop(i2, None) + if embeds_i2 is not None: + self.req_prompt_embeds[i1] = embeds_i2 + else: + self.req_prompt_embeds.pop(i1, None) + self.block_table.swap_row(i1, i2) self.request_lora_mapping[i1], self.request_lora_mapping[i2] = \ @@ -592,6 +628,11 @@ class InputBatch: num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens] + self.is_token_ids[empty_index, :num_tokens] = self.is_token_ids[ + last_req_index, :num_tokens] + if last_req_index in self.req_prompt_embeds: + self.req_prompt_embeds[ + empty_index] = self.req_prompt_embeds.pop(last_req_index) self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053e8f0537ed9..3ee2160a42ffe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -56,7 +56,9 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, check_use_alibi, get_dtype_size, - is_pin_memory_available, round_up, supports_dynamo) + is_pin_memory_available, + length_from_prompt_token_ids_or_embeds, round_up, + supports_dynamo) from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( @@ -197,6 +199,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cache_config.cache_dtype] self.is_pooling_model = (model_config.runner_type == 'pooling') + self.enable_prompt_embeds = model_config.enable_prompt_embeds self.is_multimodal_raw_input_only_model = ( model_config.is_multimodal_raw_input_only_model) @@ -342,6 +345,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, 
KVConnectorModelRunnerMixin): self.hidden_size, dtype=self.dtype, numpy=False) + self.is_token_ids = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) self.discard_request_indices = self._make_buffer(self.max_num_reqs, dtype=torch.int64) self.num_discarded_requests = 0 @@ -574,6 +579,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + prompt_embeds=new_req_data.prompt_embeds, mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=pooling_params, @@ -819,6 +825,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if self.input_batch.prev_sampled_token_ids is None: # Normal scheduling case self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) return # Async scheduling case, where some decode requests from the previous @@ -844,6 +852,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # If not all requests are decodes from the last iteration, # We need to copy the input_ids_cpu to the GPU first. self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) if num_commmon_tokens == 0: # No requests in common with the previous iteration # So input_ids_cpu will have all the input ids. @@ -857,6 +867,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0], non_blocking=True) + self.is_token_ids.gpu[:num_commmon_tokens] = True return # Upload the index tensors asynchronously # so the scatter can be non-blocking. @@ -947,14 +958,60 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # where M is the max_model_len. token_indices = (positions_np + req_indices * self.input_batch.token_ids_cpu.shape[1]) + token_indices_tensor = torch.from_numpy(token_indices) # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(), 0, - torch.from_numpy(token_indices), + token_indices_tensor, out=self.input_ids.cpu[:total_num_scheduled_tokens]) + is_token_ids = self.input_batch.is_token_ids.flatten() + torch.index_select( + is_token_ids, + 0, + token_indices_tensor, + out=self.is_token_ids.cpu[:total_num_scheduled_tokens]) + + # Because we did not pre-allocate a massive prompt_embeds CPU tensor on + # the InputBatch, we need to fill in the prompt embeds into the expected + # spots in the GpuModelRunner's pre-allocated prompt_embeds tensor. 
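        # In sketch form, the copy below is a per-request gather into the
        # flat token stream (names as in the surrounding code): for each
        # request r in batch order, the slice
        # req_prompt_embeds[r][start:start + num_sched], with
        # start = num_computed_tokens_cpu[r], lands at
        # inputs_embeds.cpu[output_idx:output_idx + num_sched], clamped at
        # the end of the stored prompt because positions past the prompt are
        # generated token ids, not embeddings.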
+ if self.input_batch.req_prompt_embeds: + output_idx = 0 + for req_idx in range(num_reqs): + num_sched = num_scheduled_tokens[req_idx] + + # Skip if this request doesn't have embeddings + if req_idx not in self.input_batch.req_prompt_embeds: + output_idx += num_sched + continue + + # Skip if no tokens scheduled + if num_sched <= 0: + output_idx += num_sched + continue + + req_embeds = self.input_batch.req_prompt_embeds[req_idx] + start_pos = self.input_batch.num_computed_tokens_cpu[req_idx] + + # Skip if trying to read beyond available embeddings + if start_pos >= req_embeds.shape[0]: + output_idx += num_sched + continue + + # Copy available embeddings + end_pos = start_pos + num_sched + actual_end = min(end_pos, req_embeds.shape[0]) + actual_num_sched = actual_end - start_pos + + if actual_num_sched > 0: + self.inputs_embeds.cpu[output_idx:output_idx + + actual_num_sched].copy_( + req_embeds[start_pos:actual_end] + ) + + output_idx += num_sched self.input_batch.block_table.compute_slot_mapping( req_indices, positions_np) @@ -1279,7 +1336,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.num_computed_tokens_cpu[index] num_scheduled_tokens = \ scheduler_output.num_scheduled_tokens[req_id] - num_prompt_tokens = len(req.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + req.prompt_token_ids, req.prompt_embeds) if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: prompt_part_len = max(0, @@ -1845,6 +1903,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } + elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + # Get the input embeddings for the tokens that are not input embeds, + # then put them into the appropriate positions. + # TODO(qthequartermasterman): Since even when prompt embeds are + # enabled, (a) not all requests will use prompt embeds, and (b) + # after the initial prompt is processed, the rest of the generated + # tokens will be token ids, it is not desirable to have the + # embedding layer outside of the CUDA graph all the time. The v0 + # engine avoids this by "double compiling" the CUDA graph, once + # with input_ids and again with inputs_embeds, for all num_tokens. + # If a batch only has token ids, then including the embedding layer + # in the CUDA graph will be more performant (like in the else case + # below). + token_ids_idx = self.is_token_ids.gpu[:num_scheduled_tokens] \ + .nonzero(as_tuple=False) \ + .squeeze(1) + # Some tokens ids may need to become embeds + if token_ids_idx.numel() > 0: + token_ids = self.input_ids.gpu[token_ids_idx] + tokens_to_embeds = self.model.get_input_embeddings( + input_ids=token_ids) + self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds + + inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + model_kwargs = self._init_model_kwargs(num_input_tokens) + input_ids = None else: # For text-only models, we use token ids as input. 
# While it is possible to use embeddings as input just like the @@ -2023,6 +2107,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids + self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True self.input_batch.num_tokens_no_spec[req_idx] = end_idx self.input_batch.num_tokens[req_idx] = end_idx @@ -2570,6 +2655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Get metadata for this request. request = self.requests[req_id] + if request.prompt_token_ids is None: + # Prompt logprobs is incompatible with prompt embeddings + continue + num_prompt_tokens = len(request.prompt_token_ids) prompt_token_ids = torch.tensor(request.prompt_token_ids).to( self.device, non_blocking=True) @@ -2922,6 +3011,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **model_kwargs, **self._dummy_mm_kwargs(num_reqs), } + elif self.enable_prompt_embeds: + input_ids = None + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + model_kwargs = self._init_model_kwargs(num_tokens) else: input_ids = self.input_ids.gpu[:num_tokens] inputs_embeds = None diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index dfa54d0ad83b6..4cd0ac352de0f 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -9,7 +9,7 @@ import torch from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingType -from vllm.utils import swap_dict_values +from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.worker.block_table import MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState @@ -213,7 +213,9 @@ class InputBatch: self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. 
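        # Unlike the GPU path, the assignment below still indexes
        # request.prompt_token_ids directly (None for embeds-only requests),
        # so prompt embeds are not yet usable on TPU; the TODO in this hunk
        # tracks copying prompt_embeds.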
- num_prompt_tokens = len(request.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + request.prompt_token_ids, request.prompt_embeds) + # TODO: copy prompt_embeds self.num_prompt_tokens[req_index] = num_prompt_tokens self.token_ids_cpu[ req_index, :num_prompt_tokens] = request.prompt_token_ids diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 43f12912707f1..01a8e5c3f0dba 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -387,6 +387,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + prompt_embeds=new_req_data.prompt_embeds, mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=None, From 9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26 Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Fri, 19 Sep 2025 03:20:51 +0300 Subject: [PATCH 449/584] [KV offload][2/N] Introduce LRU-based CPU offloading management (#20075) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- tests/v1/kv_offload/test_cpu.py | 175 +++++++++++++++++++++++++++++ vllm/v1/kv_offload/backend.py | 96 ++++++++++++++++ vllm/v1/kv_offload/backends/cpu.py | 61 ++++++++++ vllm/v1/kv_offload/lru_manager.py | 132 ++++++++++++++++++++++ 4 files changed, 464 insertions(+) create mode 100644 tests/v1/kv_offload/test_cpu.py create mode 100644 vllm/v1/kv_offload/backend.py create mode 100644 vllm/v1/kv_offload/backends/cpu.py create mode 100644 vllm/v1/kv_offload/lru_manager.py diff --git a/tests/v1/kv_offload/test_cpu.py b/tests/v1/kv_offload/test_cpu.py new file mode 100644 index 0000000000000..cdee7811d85b3 --- /dev/null +++ b/tests/v1/kv_offload/test_cpu.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Optional + +import numpy as np + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + PrepareStoreOutput) +from vllm.v1.kv_offload.backends.cpu import CPUBackend +from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec + + +@dataclass +class ExpectedPrepareStoreOutput: + block_hashes_to_store: list[int] + store_block_ids: list[int] + block_hashes_evicted: list[int] + + +def to_hashes(int_hashes: list[int]) -> list[BlockHash]: + return [BlockHash(str(i).encode()) for i in int_hashes] + + +def verify_store_output( + prepare_store_output: Optional[PrepareStoreOutput], + expected_prepare_store_output: ExpectedPrepareStoreOutput): + assert prepare_store_output is not None + assert (prepare_store_output.block_hashes_to_store == to_hashes( + expected_prepare_store_output.block_hashes_to_store)) + assert (prepare_store_output.block_hashes_evicted == to_hashes( + expected_prepare_store_output.block_hashes_evicted)) + store_spec = prepare_store_output.store_spec + assert isinstance(store_spec, CPULoadStoreSpec) + expected_array = np.array(expected_prepare_store_output.store_block_ids, + dtype=np.int64) + assert np.array_equal(expected_array, store_spec.block_ids) + + +def verify_load_output(prepare_load_output: LoadStoreSpec, + expected_prepare_load_output: list[int]): + assert isinstance(prepare_load_output, CPULoadStoreSpec) + expected_array = 
np.array(expected_prepare_load_output, dtype=np.int64) + assert np.array_equal(expected_array, prepare_load_output.block_ids) + + +def verify_events(events: Iterable[OffloadingEvent], + block_size: int, + expected_stores: tuple[set[int], ...] = (), + expected_evictions: tuple[set[int], ...] = ()): + stores: list[set[BlockHash]] = [] + evictions: list[set[BlockHash]] = [] + for event in events: + assert event.medium == CPULoadStoreSpec.medium() + assert event.block_size == block_size + if event.removed: + evictions.append(set(event.block_hashes)) + else: + stores.append(set(event.block_hashes)) + + def to_hash_sets( + int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]: + return tuple([set(to_hashes(list(int_set))) for int_set in int_sets]) + + assert tuple(evictions) == to_hash_sets(expected_evictions) + assert tuple(stores) == to_hash_sets(expected_stores) + + +def test_cpu_manager(): + """ + Tests LRUOffloadingManager with a CPUBackend. + """ + # initialize a CPU backend with a capacity of 4 blocks + block_size = 256 + cpu_backend = CPUBackend(block_size=block_size, num_blocks=4) + cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True) + + # prepare store [1, 2] + prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[1, 2], + store_block_ids=[0, 1], + block_hashes_evicted=[], + )) + + # lookup [1, 2] -> not ready + assert cpu_manager.lookup(to_hashes([1, 2])) == 0 + + # no events so far + assert list(cpu_manager.take_events()) == [] + + # complete store [1, 2] + cpu_manager.complete_store(to_hashes([1, 2])) + verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_stores=({1, 2}, )) + + # lookup [1, 2] + assert cpu_manager.lookup(to_hashes([1])) == 1 + assert cpu_manager.lookup(to_hashes([1, 2])) == 2 + assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2 + + # prepare store [2, 3, 4, 5] -> evicts [1] + prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[3, 4, 5], + store_block_ids=[2, 3, 0], + block_hashes_evicted=[1], + )) + + # verify eviction event + verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_evictions=({1}, )) + + # prepare store with no space + assert cpu_manager.prepare_store(to_hashes([1, 6])) is None + + # complete store [2, 3, 4, 5] + cpu_manager.complete_store(to_hashes([2, 3, 4, 5])) + + # prepare load [2, 3] + prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3])) + verify_load_output(prepare_load_output, [1, 2]) + + # prepare store with no space ([2, 3] is being loaded) + assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None + + # complete load [2, 3] + cpu_manager.complete_load(to_hashes([2, 3])) + + # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest) + prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[6, 7, 8], + store_block_ids=[3, 2, 1], + block_hashes_evicted=[2, 3, 4], + )) + + # complete store [6, 7, 8] + cpu_manager.complete_store(to_hashes([6, 7, 8])) + + # touch [5, 6, 7] (move to end of LRU order) + cpu_manager.touch(to_hashes([5, 6, 7])) + + # prepare store [7, 9] -> evicts [8] (oldest following previous touch) + prepare_store_output = cpu_manager.prepare_store(to_hashes([9])) + verify_store_output( + 
prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[9], + store_block_ids=[1], + block_hashes_evicted=[8], + )) + + # complete store [7, 9] with failure + cpu_manager.complete_store(to_hashes([7, 9]), success=False) + + # assert [7] is still stored, but [9] is not + assert cpu_manager.lookup(to_hashes([7])) == 1 + assert cpu_manager.lookup(to_hashes([9])) == 0 + + verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_stores=({3, 4, 5}, {6, 7, 8}), + expected_evictions=({2, 3, 4}, {8})) diff --git a/vllm/v1/kv_offload/backend.py b/vllm/v1/kv_offload/backend.py new file mode 100644 index 0000000000000..87a74200116bb --- /dev/null +++ b/vllm/v1/kv_offload/backend.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ctypes +from abc import ABC, abstractmethod +from collections.abc import Iterable + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import LoadStoreSpec + + +class BlockStatus(ctypes.Structure): + """ + Offloading status for a single block of KV data. + Holds the following information: + + ref_cnt - the current number of transfers using this block as a source. + A value of -1 indicates the block is not yet ready to be read. + load_store_spec - backend-specific information on how to actually + read/write the block. + """ + _fields_ = [("ref_cnt", ctypes.c_int32)] + + def __init__(self): + super().__init__() + # initialize block as "not ready" (ref_cnt = -1) + self.ref_cnt = -1 + + @property + def is_ready(self) -> bool: + """ + Returns whether the block is ready to be read. + """ + return self.ref_cnt >= 0 + + +class Backend(ABC): + """ + An abstract class for allocating and returning specs for writing + KV blocks to some backend. + """ + + def __init__(self, block_size: int, medium: str): + self.block_size = block_size + self.medium = medium + + @abstractmethod + def get_num_free_blocks(self): + """ + Returns the number of current number of blocks that can be allocated. + """ + pass + + @abstractmethod + def allocate_blocks(self, + block_hashes: list[BlockHash]) -> list[BlockStatus]: + """ + Allocate space for writing blocks. + This method assumes there is enough space for allocation. + It is unsafe to use without checking get_num_free_blocks beforehand. + + Args: + block_hashes: the hashes identifying the blocks to be written. + + Returns: + A list of BlockStatus for the allocated blocks. + The ref_cnt of each returned item will be -1, meaning the block + is not yet ready to be read. + """ + pass + + @abstractmethod + def free(self, block: BlockStatus): + """ + Free a previously allocated block. + You should only call this function with blocks returned by + allocate_blocks, and only once per each block. + + Args: + block: The block to be freed. + """ + pass + + def get_load_store_spec(self, block_hashes: Iterable[BlockHash], + blocks: Iterable[BlockStatus]) -> LoadStoreSpec: + """ + Get backend-specific information on how to read/write blocks. + + Args: + block_hashes: the list of block hashes identifying the blocks. + blocks: the list of blocks. + + Returns: + A LoadStoreSpec that can be used by a worker + to read/write the blocks. 
+ """ + raise NotImplementedError diff --git a/vllm/v1/kv_offload/backends/cpu.py b/vllm/v1/kv_offload/backends/cpu.py new file mode 100644 index 0000000000000..eb1123d1d83ac --- /dev/null +++ b/vllm/v1/kv_offload/backends/cpu.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ctypes +from collections.abc import Iterable + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import LoadStoreSpec +from vllm.v1.kv_offload.backend import Backend, BlockStatus +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec + + +class CPUBlockStatus(BlockStatus): + _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64) + ] # type: ignore + + def __init__(self, block_id: int): + super().__init__() + self.block_id = block_id + + +class CPUBackend(Backend): + + def __init__(self, block_size: int, num_blocks: int): + super().__init__(block_size=block_size, + medium=CPULoadStoreSpec.medium()) + + self.num_blocks: int = num_blocks + self.num_allocated_blocks: int = 0 + self.allocated_blocks_free_list: list[int] = [] + + def get_num_free_blocks(self): + return (len(self.allocated_blocks_free_list) + self.num_blocks - + self.num_allocated_blocks) + + def allocate_blocks(self, + block_hashes: list[BlockHash]) -> list[BlockStatus]: + num_fresh_blocks = min(len(block_hashes), + self.num_blocks - self.num_allocated_blocks) + num_reused_blocks = len(block_hashes) - num_fresh_blocks + assert len(self.allocated_blocks_free_list) >= num_reused_blocks + + # allocate fresh blocks + blocks: list[BlockStatus] = [] + for _ in range(num_fresh_blocks): + blocks.append(CPUBlockStatus(self.num_allocated_blocks)) + self.num_allocated_blocks += 1 + + # allocate reused blocks + for _ in range(num_reused_blocks): + block_id = self.allocated_blocks_free_list.pop() + blocks.append(CPUBlockStatus(block_id)) + + return blocks + + def free(self, block: BlockStatus): + assert isinstance(block, CPUBlockStatus) + self.allocated_blocks_free_list.append(block.block_id) + + def get_load_store_spec(self, block_hashes: Iterable[BlockHash], + blocks: Iterable[BlockStatus]) -> LoadStoreSpec: + return CPULoadStoreSpec([block.block_id for block in blocks]) diff --git a/vllm/v1/kv_offload/lru_manager.py b/vllm/v1/kv_offload/lru_manager.py new file mode 100644 index 0000000000000..18d3b1d637b32 --- /dev/null +++ b/vllm/v1/kv_offload/lru_manager.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections import OrderedDict +from collections.abc import Iterable +from typing import Optional + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + OffloadingManager, PrepareStoreOutput) +from vllm.v1.kv_offload.backend import Backend, BlockStatus + + +class LRUOffloadingManager(OffloadingManager): + """ + An OffloadingManager with a pluggable backend, which evicts blocks by LRU. 
+ """ + + def __init__(self, backend: Backend, enable_events: bool = False): + self.backend: Backend = backend + # block_hash -> BlockStatus + self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict() + self.events: Optional[list[OffloadingEvent]] = \ + [] if enable_events else None + + def lookup(self, block_hashes: Iterable[BlockHash]) -> int: + hit_count = 0 + for block_hash in block_hashes: + block = self.blocks.get(block_hash) + if block is None or not block.is_ready: + break + hit_count += 1 + return hit_count + + def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec: + blocks = [] + for block_hash in block_hashes: + block = self.blocks[block_hash] + assert block.is_ready + block.ref_cnt += 1 + blocks.append(block) + + return self.backend.get_load_store_spec(block_hashes, blocks) + + def touch(self, block_hashes: Iterable[BlockHash]): + for block_hash in reversed(list(block_hashes)): + if self.blocks.get(block_hash): + self.blocks.move_to_end(block_hash) + + def complete_load(self, block_hashes: Iterable[BlockHash]): + for block_hash in block_hashes: + block = self.blocks[block_hash] + assert block.ref_cnt > 0 + block.ref_cnt -= 1 + + def prepare_store( + self, + block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]: + # filter out blocks that are already stored + block_hashes_to_store = [ + block_hash for block_hash in block_hashes + if block_hash not in self.blocks + ] + + num_blocks_to_evict = (len(block_hashes_to_store) - + self.backend.get_num_free_blocks()) + + # build list of blocks to evict + to_evict = [] + if num_blocks_to_evict > 0: + for block_hash, block in self.blocks.items(): + if block.ref_cnt == 0: + to_evict.append(block_hash) + num_blocks_to_evict -= 1 + if num_blocks_to_evict == 0: + break + else: + # we could not evict enough blocks + return None + + # evict blocks + for block_hash in to_evict: + self.backend.free(self.blocks.pop(block_hash)) + + if to_evict and self.events is not None: + self.events.append( + OffloadingEvent(block_hashes=to_evict, + block_size=self.backend.block_size, + medium=self.backend.medium, + removed=True)) + + blocks = self.backend.allocate_blocks(block_hashes_to_store) + assert len(blocks) == len(block_hashes_to_store) + + for block_hash, block in zip(block_hashes_to_store, blocks): + self.blocks[block_hash] = block + + # build store specs for allocated blocks + store_spec = self.backend.get_load_store_spec(block_hashes_to_store, + blocks) + + return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store, + store_spec=store_spec, + block_hashes_evicted=to_evict) + + def complete_store(self, + block_hashes: Iterable[BlockHash], + success: bool = True): + stored_block_hashes: list[BlockHash] = [] + if success: + for block_hash in block_hashes: + block = self.blocks[block_hash] + if not block.is_ready: + block.ref_cnt = 0 + stored_block_hashes.append(block_hash) + else: + for block_hash in block_hashes: + block = self.blocks[block_hash] + if not block.is_ready: + self.backend.free(block) + del self.blocks[block_hash] + + if stored_block_hashes and self.events is not None: + self.events.append( + OffloadingEvent(block_hashes=stored_block_hashes, + block_size=self.backend.block_size, + medium=self.backend.medium, + removed=False)) + + def take_events(self) -> Iterable[OffloadingEvent]: + if self.events is not None: + yield from self.events + self.events.clear() From 6d8246aaffff3ebec84767e373212a7b8da328e2 Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Thu, 18 Sep 2025 
19:11:59 -0700 Subject: [PATCH 450/584] [gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938) Signed-off-by: Andrew Xia <axia@meta.com> --- .../openai/test_response_api_with_harmony.py | 56 +++++++++++- vllm/entrypoints/openai/protocol.py | 88 ++++++++++++++----- vllm/entrypoints/openai/serving_responses.py | 32 ++++--- 3 files changed, 143 insertions(+), 33 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index eceaff672112f..8d974d56b4450 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): assert response3.status == "completed" +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_streaming_types(client: OpenAI, model_name: str): + prompts = [ + "tell me a story about a cat in 20 words", + ] + + # this links the "done" type with the "start" type + # so every "done" type should have a corresponding "start" type + # and every open block should be closed by the end of the stream + pairs_of_event_types = { + "response.completed": "response.created", + "response.output_item.done": "response.output_item.added", + "response.content_part.done": "response.content_part.added", + "response.output_text.done": "response.output_text.delta", + "response.web_search_call.done": "response.web_search_call.added", + "response.reasoning_text.done": "response.reasoning_text.delta", + "response.reasoning_part.done": "response.reasoning_part.added", + } + + for prompt in prompts: + response = await client.responses.create( + model=model_name, + input=prompt, + reasoning={"effort": "low"}, + tools=[], + stream=True, + background=False, + ) + + stack_of_event_types = [] + async for event in response: + if event.type == 'response.created': + stack_of_event_types.append(event.type) + elif event.type == 'response.completed': + assert stack_of_event_types[-1] == pairs_of_event_types[ + event.type] + stack_of_event_types.pop() + if event.type.endswith("added"): + stack_of_event_types.append(event.type) + elif event.type.endswith("delta"): + if stack_of_event_types[-1] == event.type: + continue + stack_of_event_types.append(event.type) + elif event.type.endswith("done"): + assert stack_of_event_types[-1] == pairs_of_event_types[ + event.type] + stack_of_event_types.pop() + assert len(stack_of_event_types) == 0 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("background", [True, False]) @@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool): assert event.item_id == current_item_id # verify content_index_id is correct - if event.type == "response.content_part.added": + if event.type in [ + "response.content_part.added", + "response.reasoning_part.added" + ]: assert event.content_index != current_content_index current_content_index = event.content_index elif event.type in [ diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6b54511a66f33..05d5d6d964dd3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -31,6 +31,8 @@ from openai.types.responses import ( ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, ResponseStatus, ResponseWebSearchCallCompletedEvent, 
ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) # Backward compatibility for OpenAI client versions try: # For older openai versions (< 1.100.0) @@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, ResponseReasoningItem, ResponseFunctionToolCall] -StreamingResponsesResponse: TypeAlias = Union[ - ResponseCreatedEvent, - ResponseInProgressEvent, - ResponseCompletedEvent, - ResponseOutputItemAddedEvent, - ResponseOutputItemDoneEvent, - ResponseContentPartAddedEvent, - ResponseContentPartDoneEvent, - ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent, - ResponseCodeInterpreterCallInProgressEvent, - ResponseCodeInterpreterCallCodeDeltaEvent, - ResponseWebSearchCallInProgressEvent, - ResponseWebSearchCallSearchingEvent, - ResponseWebSearchCallCompletedEvent, - ResponseCodeInterpreterCallCodeDoneEvent, - ResponseCodeInterpreterCallInterpretingEvent, - ResponseCodeInterpreterCallCompletedEvent, -] - class ResponsesRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel): ) +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartDoneEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that is done.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that is done.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.done"] + """The type of the event. Always `response.reasoning_part.done`.""" + + +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartAddedEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that is done.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that is done.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.added"] + """The type of the event. 
Always `response.reasoning_part.added`.""" + + +StreamingResponsesResponse: TypeAlias = Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, + ResponseWebSearchCallCompletedEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseCodeInterpreterCallCompletedEvent, +] + BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest, RerankRequest] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 469d74272b0e6..4894623aeac28 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse, InputTokensDetails, OutputTokensDetails, RequestResponseMetadata, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, ResponsesRequest, ResponsesResponse, ResponseUsage, StreamingResponsesResponse) @@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing): # Deal with tool call here pass elif previous_item.channel == "analysis": + content = ResponseReasoningTextContent( + text=previous_item.content[0].text, + type="reasoning_text", + ) reasoning_item = ResponseReasoningItem( type="reasoning", - content=[ - ResponseReasoningTextContent( - text=previous_item.content[0].text, - type="reasoning_text", - ), - ], + content=[content], status="completed", id=current_item_id, summary=[], @@ -1301,6 +1302,15 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) + yield _increment_sequence_number_and_return( + ResponseReasoningPartDoneEvent( + type="response.reasoning_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=content, + )) yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", @@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing): )) current_content_index += 1 yield _increment_sequence_number_and_return( - ResponseContentPartAddedEvent( - type="response.content_part.added", + ResponseReasoningPartAddedEvent( + type="response.reasoning_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=ResponseOutputText( - type="output_text", + part=ResponseReasoningTextContent( text="", - annotations=[], - logprobs=[], + type="reasoning_text", ), )) yield _increment_sequence_number_and_return( From 1a0a04dae94b7a768c0d59b4f687bcf5e12d3127 Mon Sep 17 00:00:00 2001 From: Chen Ding <dingchen.mail@gmail.com> Date: Fri, 19 Sep 2025 11:31:16 +0800 Subject: [PATCH 451/584] [Perf] Optimize memory peak during EAGLE model loading. 
(#24585) Signed-off-by: Chen Ding <candy.dc@alibaba-inc.com> --- vllm/model_executor/models/deepseek_eagle.py | 15 ++++++------- vllm/model_executor/models/llama4_eagle.py | 22 +++++++++----------- vllm/model_executor/models/llama_eagle.py | 15 ++++++------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index b1d7f24c2f18b..2770ddebc48ab 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -229,14 +229,15 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + def transform(inputs): + name, loaded_weight = inputs + if "lm_head" not in name: + name = "model." + name + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=None, ) - - model_weights = {} - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." + name - model_weights[name] = loaded_weight - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights)) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index ece490ff2f2a8..a203af53205cd 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -205,23 +205,21 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: + + def transform(inputs): + name, loaded_weight = inputs + name, weight = self.permute_qk_weight_for_rotary( + name, loaded_weight) + if "lm_head" not in name: + name = "model." + name + return name, weight + loader = AutoWeightsLoader( self, # lm_head is tied with target model (Llama4ForCausalLM) skip_prefixes=(["lm_head."]), ) - - model_weights = {} - weights = [ - self.permute_qk_weight_for_rotary(name, loaded_weight) - for name, loaded_weight in weights - ] - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." + name - model_weights[name] = loaded_weight - - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights)) def get_input_embeddings( self, diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index a4933b77e3a53..dfae3c3ea5437 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -158,14 +158,15 @@ class EagleLlamaForCausalLM(LlamaForCausalLM): return self.model(input_ids, positions, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + def transform(inputs): + name, loaded_weight = inputs + if "lm_head" not in name: + name = "model." + name + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=None, ) - - model_weights = {} - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." 
+ name - model_weights[name] = loaded_weight - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights)) From 31a8a2a7bccb29612bc58c9a69252bfb78f5abe4 Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Thu, 18 Sep 2025 21:46:57 -0700 Subject: [PATCH 452/584] [Misc] Clean up MM profiling warnings (#25222) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/multimodal/profiling.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index fbbc55d3524ca..9b463e212bb49 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -234,19 +234,6 @@ class MultiModalProfiler(Generic[_I]): prompt_token_ids = mm_inputs["prompt_token_ids"] total_len = len(prompt_token_ids) - # V0 does not support chunked prefill. - if total_len > seq_len and not envs.VLLM_USE_V1: - # `max_num_batched_tokens` is defined by `SchedulerConfig` - logger.warning_once( - "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) " # noqa: E501 - "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). " # noqa: E501 - "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. " # noqa: E501 - "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.", # noqa: E501 - seq_len, - total_len, - str(self._get_mm_num_tokens(mm_inputs)), - ) - if total_len < seq_len: prompt_token_ids.extend([0] * (seq_len - total_len)) @@ -270,22 +257,6 @@ class MultiModalProfiler(Generic[_I]): mm_counts=mm_counts, ) if max_tokens_per_item is not None: - if mm_counts is None: - total_mm_tokens = sum(max_tokens_per_item.values()) - else: - total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k] - for k in max_tokens_per_item.keys() - & mm_counts.keys()) - if total_mm_tokens > seq_len: - logger.warning_once( - "The sequence length (%d) is smaller than the pre-defined" - " worst-case total number of multimodal tokens (%d). " - "This may cause certain multi-modal inputs to fail during " - "inference. To avoid this, you should increase " - "`max_model_len` or reduce `mm_counts`.", - seq_len, - total_mm_tokens, - ) return max_tokens_per_item mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) From 6c8a3c099bc12bb00fa3899753e41183f05fe9fc Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Fri, 19 Sep 2025 13:10:44 +0800 Subject: [PATCH 453/584] [Docs] Fix griffe warnings in vllm/multimodal (#25216) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- vllm/multimodal/inputs.py | 6 +++--- vllm/multimodal/utils.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 240e34e139cfe..e00c10fb66eeb 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -569,8 +569,8 @@ class MultiModalFieldConfig: Args: modality: The modality of the multi-modal item that uses this keyword argument. - slices: For each multi-modal item, the size of the slice that - is used to extract the data corresponding to it. + size_per_item: For each multi-modal item, the size of the slice + that is used to extract the data corresponding to it. dim: The dimension to slice, default to 0. 
Example: @@ -590,7 +590,7 @@ class MultiModalFieldConfig: ``` Given: - slices: [3, 4, 2] + size_per_item: [3, 4, 2] dim: 1 Input: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index b308366fca282..f4e2ed72e2d7d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -395,7 +395,9 @@ def group_mm_kwargs_by_modality( modality together into the same `MultiModalKwargs` instance. Args: - mm_inputs: List of `MultiModalKwargsItem`. + mm_kwargs: List of `MultiModalKwargsItem`. + device: The device to place the grouped tensors on. + pin_memory: Whether to pin memory for faster host-to-device transfer. Yields: A tuple `(modality, num_items, grouped_kwargs)`. From a6149aa587d6582545b7878a2dffed3a2419605d Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" <chendi.xue@intel.com> Date: Fri, 19 Sep 2025 00:41:53 -0500 Subject: [PATCH 454/584] [OOT] Support sync_model_loading for OOT (#25126) Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> --- vllm/model_executor/parameter.py | 6 +++--- vllm/model_executor/utils.py | 17 +++-------------- vllm/platforms/interface.py | 23 +++++++++++++++++++++++ vllm/platforms/tpu.py | 4 ++++ 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 221712ba9a338..03e5e5809b678 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -12,7 +12,6 @@ from torch.nn import Parameter from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.logger import init_logger -from vllm.model_executor.utils import _make_synced_weight_loader __all__ = [ "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", @@ -53,8 +52,9 @@ class BasevLLMParameter(Parameter): # This sometimes causes OOM errors during model loading. To avoid this, # we sync the param tensor after its weight loader is called. from vllm.platforms import current_platform - if current_platform.is_tpu(): - weight_loader = _make_synced_weight_loader(weight_loader) + if current_platform.use_sync_weight_loader(): + weight_loader = current_platform.make_synced_weight_loader( + weight_loader) self._weight_loader = weight_loader self.tp_rank = get_tensor_model_parallel_rank() diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 65436786f82ac..543918418953b 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -44,23 +44,12 @@ def set_weight_attrs( # TODO(woosuk): Remove this hack once we have a better solution. from vllm.platforms import current_platform - if current_platform.is_tpu() and key == "weight_loader": - value = _make_synced_weight_loader(value) + if current_platform.use_sync_weight_loader( + ) and key == "weight_loader": + value = current_platform.make_synced_weight_loader(value) setattr(weight, key, value) -def _make_synced_weight_loader(original_weight_loader): - - def _synced_weight_loader(param, *args, **kwargs): - out = original_weight_loader(param, *args, **kwargs) - # torch._sync doesn't support, is not needed for CPU tensors. 
- if param.device != torch.device("cpu"): - torch._sync(param) - return out - - return _synced_weight_loader - - def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: parent_map = getattr(model, "packed_modules_mapping", None) parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 054d08c3a85be..53fc762dce540 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -594,6 +594,29 @@ class Platform: """ return False + @classmethod + def use_sync_weight_loader(cls) -> bool: + """ + Returns if the current platform needs to sync weight loader. + """ + return False + + @classmethod + def make_synced_weight_loader(cls, original_weight_loader): + """ + Wrap the original weight loader to make it synced. + """ + if not cls.use_sync_weight_loader(): + return original_weight_loader + + def _synced_weight_loader(param, *args, **kwargs): + out = original_weight_loader(param, *args, **kwargs) + if param.device != torch.device("cpu"): + torch._sync(param) + return out + + return _synced_weight_loader + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 6a061956d8141..4e4db116abca0 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -226,6 +226,10 @@ class TpuPlatform(Platform): torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True) dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu() + @classmethod + def use_sync_weight_loader(cls) -> bool: + return True + try: from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform From 486c5599e3ab7d721c94dd01e89c87742c01e1ac Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Fri, 19 Sep 2025 02:27:17 -0400 Subject: [PATCH 455/584] [Build] Update Xgrammar to 0.1.24 to get a CVE fix (#25188) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index b8665104bd09a..7973da080c37d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" +xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 8c1d4acbfe70a1dffdb9a3db57b4d12329350295 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Fri, 19 Sep 2025 15:27:22 +0800 Subject: [PATCH 456/584] [CPU] Disable oneDNN linear on non-x86 platforms (#25166) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/model_executor/layers/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index d2b135c1e4d4e..a1675ffbaa950 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -7,7 +7,7 @@ import torch from vllm import _custom_ops as ops from vllm import envs -from vllm.platforms import current_platform +from vllm.platforms import CpuArchEnum, current_platform from vllm.utils import 
direct_register_custom_op @@ -167,7 +167,8 @@ def dispatch_cpu_unquantized_gemm( if remove_weight: layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) - elif ops._supports_onednn: + elif (ops._supports_onednn + and current_platform.get_cpu_architecture() == CpuArchEnum.X86): origin_weight = layer.weight if remove_weight: layer.weight = torch.nn.Parameter(torch.empty(0), From 825fdb11add30237e7f592f1a132d3913cd632ec Mon Sep 17 00:00:00 2001 From: "Li, Jiang" <jiang1.li@intel.com> Date: Fri, 19 Sep 2025 15:41:12 +0800 Subject: [PATCH 457/584] [Bugfix][CPU] Add placeholder to avoid import errors when using fused_moe ops on platforms without triton (#25137) Signed-off-by: jiang1.li <jiang1.li@intel.com> --- vllm/model_executor/layers/fused_moe/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 6730f051e3d71..75f56cd01a4ea 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -78,3 +78,12 @@ if HAS_TRITON: "TritonOrDeepGemmExperts", "BatchedTritonOrDeepGemmExperts", ] +else: + # Some model classes directly use the custom ops. Add placeholders + # to avoid import errors. + def _raise_exception(method: str): + raise NotImplementedError( + f"{method} is not implemented as lack of triton.") + + fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk") + fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts") From f2718d2948e83319d83dbbade1883fef2302357e Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Fri, 19 Sep 2025 15:44:56 +0800 Subject: [PATCH 458/584] [Misc] Cleanup test conftest for deprecated encoder-decoder models (#25231) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/conftest.py | 138 ---------------------------------------------- 1 file changed, 138 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0440e859fe02d..9d433dedbf479 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,39 +244,6 @@ class DecoderPromptType(Enum): EMPTY_STR = 3 -@pytest.fixture -def example_encoder_decoder_prompts( -) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]: - ''' - Returns an encoder prompt list and a decoder prompt list, wherein each pair - of same-index entries in both lists corresponds to an (encoder prompt, - decoder prompt) tuple. 
- - Returns: - - * Encoder prompt list - * Decoder prompt list (reverse of encoder prompt list) - ''' - - encoder_prompts = [] - for filename in _TEST_PROMPTS: - encoder_prompts += _read_prompts(filename) - - custom_decoder_prompts = encoder_prompts[::-1] - empty_str_decoder_prompts = [""] * len(encoder_prompts) - none_decoder_prompts = [None] * len(encoder_prompts) - - # NONE decoder prompt type - return { - DecoderPromptType.NONE: - zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts), - DecoderPromptType.EMPTY_STR: - zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts), - DecoderPromptType.CUSTOM: - zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts), - } - - @pytest.fixture def example_long_prompts() -> list[str]: prompts = [] @@ -690,68 +657,6 @@ class HfRunner: return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def generate_encoder_decoder_greedy_logprobs_limit( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - max_tokens: int, - num_logprobs: Optional[int], - images: Optional[PromptImageInput] = None, - **kwargs: Any, - ) -> list[TokensTextLogprobs]: - ''' - Greedy logprobs generation for vLLM encoder/decoder models - ''' - - all_logprobs: list[list[dict[int, float]]] = [] - all_output_ids: list[list[int]] = [] - all_output_strs: list[str] = [] - - for i, (encoder_prompt, decoder_prompt) in enumerate( - to_enc_dec_tuple_list(encoder_decoder_prompts)): - processor_kwargs: dict[str, Any] = { - "text": encoder_prompt, - "return_tensors": "pt", - } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - - encoder_inputs = self.processor(**processor_kwargs) - encoder_inputs = self.wrap_device(encoder_inputs) - - if decoder_prompt is None: - decoder_input_ids = None - else: - decoder_inputs = self.tokenizer(decoder_prompt, - return_tensors="pt") - decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) - - output = self.model.generate( - decoder_input_ids=decoder_input_ids, - use_cache=True, - do_sample=False, - max_new_tokens=max_tokens, - output_hidden_states=True, - return_dict_in_generate=True, - **encoder_inputs, - **kwargs, - ) - - ( - seq_logprobs_lst, - output_len, - ) = self._hidden_states_to_logprobs(output.decoder_hidden_states, - num_logprobs) - - all_logprobs.append(seq_logprobs_lst) - seq_ids = output.sequences[0] - output_ids = seq_ids[-output_len:] - all_output_ids.append(output_ids.tolist()) - all_output_strs.append(self.tokenizer.decode(output_ids)) - - outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]: return self.model.encode(prompts, *args, **kwargs) @@ -940,26 +845,6 @@ class VllmRunner: if sampling_params.prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) - def generate_encoder_decoder_w_logprobs( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - sampling_params: SamplingParams, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - ''' - Logprobs generation for vLLM encoder/decoder models - ''' - - assert sampling_params.logprobs is not None - req_outputs = self.llm.generate(encoder_decoder_prompts, - sampling_params=sampling_params) - toks_str_logsprobs_prompt_logprobs = ( - self._final_steps_generate_w_logprobs(req_outputs)) - # 
Omit prompt logprobs if not required by sampling params - return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else - toks_str_logsprobs_prompt_logprobs) - def generate_greedy( self, prompts: Union[list[str], list[torch.Tensor]], @@ -1037,29 +922,6 @@ class VllmRunner: return perplexities - def generate_encoder_decoder_greedy_logprobs( - self, - encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - max_tokens: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int] = None, - skip_special_tokens: bool = True, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - greedy_logprobs_params = SamplingParams( - temperature=0.0, - max_tokens=max_tokens, - logprobs=num_logprobs, - prompt_logprobs=(num_prompt_logprobs), - skip_special_tokens=skip_special_tokens, - ) - ''' - Greedy logprobs generation for vLLM encoder/decoder models - ''' - - return self.generate_encoder_decoder_w_logprobs( - encoder_decoder_prompts, greedy_logprobs_params) - def generate_beam_search( self, prompts: list[str], From a684c0124cb8ac04984b6fd621d99e1463016eac Mon Sep 17 00:00:00 2001 From: Yan Ma <yan.ma@intel.com> Date: Fri, 19 Sep 2025 16:45:06 +0800 Subject: [PATCH 459/584] [bugfix] fix MHA for models like OpenGVLab/InternVL3_5-38B (#25146) Signed-off-by: Yan Ma <yan.ma@intel.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/attention/layer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 15c0ce33e9659..8d5ebd93e063d 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -430,9 +430,11 @@ class MultiHeadAttention(nn.Module): key: torch.Tensor, value: torch.Tensor, ) -> torch.Tensor: - """Input shape: batch_size x seq_len x hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA3 - bsz, q_len, _ = query.size() + """Input shape: + (batch_size x seq_len x hidden_size) or + (batch_size x seq_len x num_heads x head_size) + """ + bsz, q_len = query.size()[:2] kv_len = key.size(1) query = query.view(bsz, q_len, self.num_heads, self.head_size) From cea91a32f2364d19d5e708026e84ce21a450c53d Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Fri, 19 Sep 2025 18:27:49 +0800 Subject: [PATCH 460/584] [Kernel][Performance] Add Triton kernel for Qwen3-VL interleaved MRoPE (#25055) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/kernels/core/test_mrope.py | 92 +++++++++++++------ .../layers/rotary_embedding/mrope.py | 36 +++++--- 2 files changed, 85 insertions(+), 43 deletions(-) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 3f2f330f6dc3b..5a903438f5e99 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import NamedTuple import pytest import torch +from packaging.version import Version from transformers import AutoConfig +from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform @@ -15,6 +18,7 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int, head_size: int, max_position_embeddings: int, dtype: torch.dtype, device: torch.device): """Generate test data for given configuration.""" + 
current_platform.seed_everything(42) # Create 2D positions (3, num_tokens) for multimodal case positions = torch.randint(0, max_position_embeddings // 4, (3, num_tokens), @@ -33,22 +37,37 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int, return positions, query, key -def unroll_model_tp_dict(model_tp_dict): - return [(model_name, tp_size) - for model_name, tp_sizes in model_tp_dict.items() - for tp_size in tp_sizes] +class MRoPETestInfo(NamedTuple): + model_name: str + # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 + atol: float = 1e-2 + rtol: float = 1.6e-2 + marks: list[pytest.MarkDecorator] = [] -model_tp_dict = { - "Qwen/Qwen2-VL-7B-Instruct": [1, 2], - "Qwen/Qwen2-VL-72B-Instruct": [1, 2], - "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2], - "zai-org/GLM-4.1V-9B-Thinking": [1, 2], -} +TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version -# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 -dtype_atol_rtol_list = [ - [torch.bfloat16, 1e-2, 1.6e-2], +MODELS_TO_TEST = [ + MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"), + MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"), + MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"), + MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"), + MRoPETestInfo( + model_name="Qwen/Qwen3-VL-4B-Instruct", + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), + reason="Qwen3-VL only available after Transformers v4.57", + ) + ]), + MRoPETestInfo( + model_name="Qwen/Qwen3-VL-30B-A3B-Instruct", + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), + reason="Qwen3-VL only available after Transformers v4.57", + ) + ]), ] num_tokens_list = [11, 8192] @@ -56,20 +75,29 @@ num_tokens_list = [11, 8192] @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests.") -@pytest.mark.parametrize("model_name, tp_size", - unroll_model_tp_dict(model_tp_dict)) -@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("model_info, model_name", [ + pytest.param(test_config, test_config.model_name, marks=test_config.marks) + for test_config in MODELS_TO_TEST +]) +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("num_tokens", num_tokens_list) -def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): +def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int, + dtype: torch.dtype, num_tokens: int): + + atol = model_info.atol + rtol = model_info.rtol config = AutoConfig.from_pretrained(model_name) + config = config.get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads total_num_heads = config.num_attention_heads num_heads = total_num_heads // tp_size num_kv_heads = max(1, total_num_kv_heads // tp_size) - head_dim = config.hidden_size // total_num_heads + head_dim = (config.head_dim if hasattr(config, "head_dim") else + config.hidden_size // total_num_heads) is_neox_style = True rope_theta = config.rope_theta @@ -111,24 +139,30 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests.") -@pytest.mark.parametrize( - "model_name, tp_size", - unroll_model_tp_dict({ - "Qwen/Qwen2-VL-7B-Instruct": [1, 2], - "zai-org/GLM-4.1V-9B-Thinking": [1, 2] - })) -@pytest.mark.parametrize("dtype, 
atol, rtol", dtype_atol_rtol_list) -@pytest.mark.parametrize("num_tokens", [4]) -def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, - num_tokens): +@pytest.mark.parametrize("model_info, model_name", [ + pytest.param(test_config, test_config.model_name, marks=test_config.marks) + for test_config in MODELS_TO_TEST +]) +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("num_tokens", num_tokens_list) +def test_mrope_torch_compile_tracing(model_name: str, + model_info: MRoPETestInfo, tp_size: int, + dtype: torch.dtype, num_tokens: int): + + atol = model_info.atol + rtol = model_info.rtol + config = AutoConfig.from_pretrained(model_name) + config = config.get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads total_num_heads = config.num_attention_heads num_heads = total_num_heads // tp_size num_kv_heads = max(1, total_num_kv_heads // tp_size) - head_dim = config.hidden_size // total_num_heads + head_dim = (config.head_dim if hasattr(config, "head_dim") else + config.hidden_size // total_num_heads) is_neox_style = True rope_theta = config.rope_theta max_position = config.max_position_embeddings diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index ef61dbc1a5ab1..ccc59bbbe233f 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -15,7 +15,7 @@ from .common import apply_rotary_emb_dispatch @triton.jit -def _triton_qwen2vl_mrope_forward( +def _triton_mrope_forward( q_ptr, k_ptr, cos, @@ -30,12 +30,14 @@ def _triton_qwen2vl_mrope_forward( pad_hd: tl.constexpr, mrope_section_t: tl.constexpr, mrope_section_h: tl.constexpr, + mrope_section_w: tl.constexpr, + is_interleaved: tl.constexpr, ): # Adapted from # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py # This version supports flatten input tensors from vllm # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) - # instead of (3, bsz, seq_len, head_dim) + # instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary pid = tl.program_id(0) # locate start address q_ptr = q_ptr + pid * (n_qh * hd) @@ -47,9 +49,6 @@ def _triton_qwen2vl_mrope_forward( # #################################################################### # Note: cos and sin now have shape (3, num_tokens, head_dim // 2) - t_end = mrope_section_t - h_end = t_end + mrope_section_h - # Updated stride calculation for half head_dim half_rd = rd // 2 t_cos = cos + pid * half_rd @@ -61,9 +60,18 @@ def _triton_qwen2vl_mrope_forward( # Updated offsets for half head_dim cos_offsets = tl.arange(0, pad_hd // 2) - t_mask = cos_offsets < t_end - h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) - w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) + if is_interleaved: + h_mask = (((cos_offsets % 3) == 1) & + (cos_offsets <= 3 * mrope_section_h)) + w_mask = (((cos_offsets % 3) == 2) & + (cos_offsets <= 3 * mrope_section_w)) + t_mask = ~(h_mask | w_mask) + else: + t_end = mrope_section_t + h_end = t_end + mrope_section_h + t_mask = cos_offsets < mrope_section_t + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) @@ -131,6 +139,7 @@ def triton_mrope( 
mrope_section: list[int], head_size: int, rotary_dim: int, + mrope_interleaved: bool, ) -> tuple[torch.Tensor, torch.Tensor]: """Qwen2VL mrope kernel. @@ -158,7 +167,7 @@ def triton_mrope( cos = cos.contiguous() sin = sin.contiguous() - _triton_qwen2vl_mrope_forward[(n_row, )]( + _triton_mrope_forward[(n_row, )]( q, k, cos, @@ -173,6 +182,8 @@ def triton_mrope( pad_hd, mrope_section[0], mrope_section[1], + mrope_section[2], + mrope_interleaved, ) return q, k @@ -201,7 +212,7 @@ class MRotaryEmbedding(RotaryEmbedding): is_neox_style: bool, dtype: torch.dtype, mrope_section: Optional[list[int]] = None, - mrope_interleaved: Optional[bool] = False, + mrope_interleaved: bool = False, ) -> None: # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get @@ -282,10 +293,6 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None - if self.mrope_interleaved: - # TODO: add triton implementation to support mrope-interleaved - return self.forward_native(positions, query, key) - num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) @@ -302,6 +309,7 @@ class MRotaryEmbedding(RotaryEmbedding): self.mrope_section, self.head_size, self.rotary_dim, + self.mrope_interleaved, ) return q.reshape(query_shape), k.reshape(key_shape) From 1dfea5f4a95df8d14b46433a479a28d56e60494c Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Fri, 19 Sep 2025 03:46:16 -0700 Subject: [PATCH 461/584] [Bugfix][Perf] Misc fixes for Qwen3 VL (#25238) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/qwen3_vl.py | 23 ++++++++++------------ vllm/model_executor/models/qwen3_vl_moe.py | 2 ++ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 2c36dfbce7f67..c224b78e2c27c 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1075,6 +1075,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, config.text_config.hidden_size) for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: @@ -1313,12 +1315,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ] multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) - visual_dim = multimodal_embeddings_cat.shape[-1] // ( - self.deepstack_num_level + 1) - - main_dim, multi_dim = visual_dim, visual_dim * self.deepstack_num_level multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 - multimodal_embeddings_cat, [main_dim, multi_dim], + multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim], dim=-1) multimodal_embeddings = torch.split(multimodal_embeddings_main, @@ -1340,10 +1338,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ], ) deepstack_input_embeds = deepstack_input_embeds.view( - inputs_embeds.shape[0], self.deepstack_num_level, - visual_dim).contiguous() - deepstack_input_embeds = deepstack_input_embeds.permute( - 1, 0, 2).contiguous() + inputs_embeds.shape[0], self.deepstack_num_level, self.visual_dim) + deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2) return 
deepstack_input_embeds, multimodal_embeddings def get_input_embeddings( @@ -1353,9 +1349,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: deepstack_input_embeds = None inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None and self.use_deepstack: - deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 - input_ids, inputs_embeds, multimodal_embeddings) + if multimodal_embeddings is not None: + if self.use_deepstack: + deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds( # noqa:E501 + input_ids, inputs_embeds, multimodal_embeddings) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [self.config.image_token_id, self.config.video_token_id]) @@ -1531,4 +1528,4 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, language_model="language_model", connector="model.visual.merger", tower_model="model.visual.", - ) + ) \ No newline at end of file diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index d25bc71dcb59b..625f94cf7ad77 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -344,3 +344,5 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): config.text_config.hidden_size) for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None + self.visual_dim = config.vision_config.out_hidden_size + self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file From 058525b9973cabfe27b7ab34dad6dbcbb6859f74 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:02:55 +0100 Subject: [PATCH 462/584] Move `PoolerConfig` from `config/__init__.py` to `config/pooler.py` (#25181) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/pooling_models.md | 4 +- docs/models/supported_models.md | 4 +- .../openai_embedding_long_text/README.md | 2 +- .../openai_embedding_long_text/client.py | 4 +- .../openai_embedding_long_text/service.sh | 2 +- .../openai/test_embedding_long_text.py | 2 +- .../models/language/pooling/test_embedding.py | 2 +- .../pooling/test_mm_classifier_conversion.py | 3 +- ...y => test_pooler_config_init_behaviour.py} | 62 ++++--- tests/test_config.py | 18 +- vllm/config/__init__.py | 155 ++++-------------- vllm/config/pooler.py | 97 +++++++++++ vllm/engine/arg_utils.py | 7 +- vllm/entrypoints/llm.py | 10 +- 14 files changed, 193 insertions(+), 179 deletions(-) rename tests/models/language/pooling/{test_override_pooler_config.py => test_pooler_config_init_behaviour.py} (74%) create mode 100644 vllm/config/pooler.py diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 0521a22c07029..50982d3d0d0f3 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -59,7 +59,7 @@ enabling the corresponding APIs: #### Predefined models If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`, -you can override some of its attributes via the `--override-pooler-config` option. +you can override some of its attributes via the `--pooler-config` option. 
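+
+For example, to force mean pooling without normalization:
+
+```bash
+vllm serve ssmits/Qwen2-7B-Instruct-embed-base \
+    --pooler-config '{"pooling_type": "MEAN", "normalize": false}'
+```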
#### Converted models @@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default: When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. -You can further customize this via the `--override-pooler-config` option, +You can further customize this via the `--pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b67ebcbe3c81a..3a6738a27be09 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -457,7 +457,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A !!! note `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. + You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`. !!! note For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. @@ -552,7 +552,7 @@ If your model is not in the above list, we will try to automatically convert the !!! important For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. #### Token Classification diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/online_serving/openai_embedding_long_text/README.md index 04edc4680ea0b..00d3ded3e41c1 100644 --- a/examples/online_serving/openai_embedding_long_text/README.md +++ b/examples/online_serving/openai_embedding_long_text/README.md @@ -42,7 +42,7 @@ python client.py ### Server Configuration -The key parameters for chunked processing are in the `--override-pooler-config`: +The key parameters for chunked processing are in the `--pooler-config`: ```json { diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/online_serving/openai_embedding_long_text/client.py index 6e9838ac6d8db..4a3674bb3f2a8 100644 --- a/examples/online_serving/openai_embedding_long_text/client.py +++ b/examples/online_serving/openai_embedding_long_text/client.py @@ -13,7 +13,7 @@ Prerequisites: # MEAN pooling (processes all chunks, recommended for complete coverage) vllm serve intfloat/multilingual-e5-large \ - --override-pooler-config \ + --pooler-config \ '{"pooling_type": "MEAN", "normalize": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ --served-model-name multilingual-e5-large \ @@ -23,7 +23,7 @@ Prerequisites: # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) vllm serve BAAI/bge-large-en-v1.5 \ - --override-pooler-config \ + --pooler-config \ '{"pooling_type": "CLS", "normalize": true, ' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ --served-model-name bge-large-en-v1.5 \ diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh index 56888c8aa0e4c..1577de85f7ff2 100644 --- 
a/examples/online_serving/openai_embedding_long_text/service.sh +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab vllm serve "$MODEL_NAME" \ --tensor-parallel-size "$GPU_COUNT" \ --enforce-eager \ - --override-pooler-config "$POOLER_CONFIG" \ + --pooler-config "$POOLER_CONFIG" \ --served-model-name ${MODEL_CODE} \ --api-key "$API_KEY" \ --trust-remote-code \ diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index 2d3da238d245e..ab5f765c28ed6 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -216,7 +216,7 @@ def server_with_chunked_processing(): "--enforce-eager", "--max-model-len", "512", # Set smaller max_model_len to trigger chunking mechanism - '--override-pooler-config', + '--pooler-config', ('{"pooling_type": "MEAN", "normalize": true, ' '"enable_chunked_processing": true, "max_embed_len": 10000}'), "--gpu-memory-utilization", diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index d61ac08475e3c..17513d1bb20d7 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -58,7 +58,7 @@ def test_models( vllm_extra_kwargs = {} if model == "ssmits/Qwen2-7B-Instruct-embed-base": - vllm_extra_kwargs["override_pooler_config"] = \ + vllm_extra_kwargs["pooler_config"] = \ PoolerConfig(pooling_type="MEAN", normalize=False) max_model_len: Optional[int] = 512 diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index 166b953de43e7..9814cad48a807 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.config.pooler import PoolerConfig from vllm.platforms import current_platform @@ -99,7 +100,7 @@ def test_gemma_multimodal( convert="classify", load_format="auto", hf_overrides=update_config, - override_pooler_config={"pooling_type": "LAST"}, + pooler_config=PoolerConfig(pooling_type="LAST"), max_model_len=512, enforce_eager=True, tensor_parallel_size=1, diff --git a/tests/models/language/pooling/test_override_pooler_config.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py similarity index 74% rename from tests/models/language/pooling/test_override_pooler_config.py rename to tests/models/language/pooling/test_pooler_config_init_behaviour.py index 2b1c74652e76f..9b3fbd6a6cd09 100644 --- a/tests/models/language/pooling/test_override_pooler_config.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,18 +24,18 @@ def test_classify_models_using_activation( dtype: str, ) -> None: - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - activation=False)) as vllm_model: + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(activation=False)) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - activation=True)) as vllm_model: + with 
vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(activation=True)) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) for wo_activation, w_activation in zip(wo_activation_out, @@ -43,9 +43,8 @@ def test_classify_models_using_activation( wo_activation = torch.tensor(wo_activation) w_activation = torch.tensor(w_activation) - assert not torch.allclose( - wo_activation, w_activation, - atol=1e-2), "override_pooler_config is not working" + assert not torch.allclose(wo_activation, w_activation, + atol=1e-2), "pooler_config is not working" assert torch.allclose(softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2) @@ -65,23 +64,22 @@ def test_embed_models_using_normalize( dtype: str, ) -> None: - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig( - normalize=False)) as vllm_model: - wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) - with vllm_runner( model, max_model_len=512, dtype=dtype, - override_pooler_config=PoolerConfig(normalize=True)) as vllm_model: + pooler_config=PoolerConfig(normalize=False)) as vllm_model: + wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=True)) as vllm_model: w_normalize = torch.tensor(vllm_model.embed(example_prompts)) assert not torch.allclose( wo_normalize, w_normalize, - atol=1e-2), "override_pooler_config normalize is not working" + atol=1e-2), "pooler_config normalize is not working" assert torch.allclose( F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2), "w_normal should be close to normal(wo_normal)." @@ -102,18 +100,16 @@ def test_reward_models_using_softmax( dtype: str, ) -> None: - with vllm_runner( - model, - max_model_len=1024, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=False)) as vllm_model: + with vllm_runner(model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(softmax=False)) as vllm_model: wo_softmax = vllm_model.encode(example_prompts) - with vllm_runner( - model, - max_model_len=1024, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=True)) as vllm_model: + with vllm_runner(model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(softmax=True)) as vllm_model: w_softmax = vllm_model.encode(example_prompts) for wo, w in zip(wo_softmax, w_softmax): @@ -121,7 +117,7 @@ def test_reward_models_using_softmax( w = torch.tensor(w) assert not torch.allclose( - wo, w, atol=1e-2), "override_pooler_config softmax is not working" + wo, w, atol=1e-2), "pooler_config softmax is not working" assert torch.allclose( softmax(wo), w, atol=1e-2), "w_softmax should be close to softmax(wo_softmax)." 
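The migration above is mechanical: every former `override_pooler_config` call site now passes the same `PoolerConfig` through the new `pooler_config` argument. A minimal offline sketch of the new-style argument (model choice illustrative, taken from the tests in this commit):

```python
from vllm import LLM
from vllm.config.pooler import PoolerConfig

# Embedding model with mean pooling and L2-normalized outputs.
llm = LLM(
    model="sentence-transformers/all-MiniLM-L12-v2",
    pooler_config=PoolerConfig(pooling_type="MEAN", normalize=True),
)
vectors = llm.embed(["vLLM supports pooling models."])
```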
diff --git a/tests/test_config.py b/tests/test_config.py index 6e37bdbee59eb..0796447c079b6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -207,25 +207,19 @@ def test_get_pooling_config(): model_id = "sentence-transformers/all-MiniLM-L12-v2" model_config = ModelConfig(model_id) - pooling_config = model_config._init_pooler_config() - assert pooling_config is not None - - assert pooling_config.normalize - assert pooling_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config is not None + assert model_config.pooler_config.normalize + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config_from_args(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - model_config = ModelConfig(model_id) + pooler_config = PoolerConfig(pooling_type="CLS", normalize=True) + model_config = ModelConfig(model_id, pooler_config=pooler_config) - override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True) - model_config.override_pooler_config = override_pooler_config - - pooling_config = model_config._init_pooler_config() - assert pooling_config is not None - assert asdict(pooling_config) == asdict(override_pooler_config) + assert asdict(model_config.pooler_config) == asdict(pooler_config) @pytest.mark.parametrize( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 25daca00c02d9..45504e010d688 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -40,6 +40,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) +from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig from vllm.config.structured_outputs import StructuredOutputsConfig @@ -406,13 +407,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.""" - pooler_config: Optional["PoolerConfig"] = field(init=False) - """Pooler config which controls the behaviour of output pooling in pooling - models.""" - override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None - """Initialize non-default pooling config or override default pooling config - for the pooling model. e.g. `{"pooling_type": "mean", "normalize": false}`. - """ logits_processor_pattern: Optional[str] = None """Optional regex pattern specifying valid logits processor qualified names that can be passed with the `logits_processors` extra completion argument. @@ -448,6 +442,14 @@ class ModelConfig: io_processor_plugin: Optional[str] = None """IOProcessor plugin name to load at model startup""" + # Pooler config + pooler_config: Optional[PoolerConfig] = None + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: Optional[Union[dict, PoolerConfig]] = None + """[DEPRECATED] Use `pooler_config` instead. This field will be removed in + v0.12.0 or v1.0.0, whichever is sooner.""" + # Multimodal config and init vars multimodal_config: Optional[MultiModalConfig] = None """Configuration for multimodal model. 
If `None`, this will be inferred @@ -709,7 +711,33 @@ class ModelConfig: self._architecture = arch logger.info("Resolved architecture: %s", arch) - self.pooler_config = self._init_pooler_config() + # Init pooler config if needed + if self.runner_type == "pooling": + if self.override_pooler_config is not None: + logger.warning_once( + "`override_pooler_config` is deprecated and will be " + "removed in v0.12.0 or v1.0.0, whichever is sooner. " + "Please use `pooler_config` instead.") + + if isinstance(self.override_pooler_config, dict): + self.pooler_config = PoolerConfig( + **self.override_pooler_config) + else: + self.pooler_config = self.override_pooler_config + + if self.pooler_config is None: + self.pooler_config = PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(self.pooler_config, k) is None: + setattr(self.pooler_config, k, v) + + default_pooling_type = self._model_info.default_pooling_type + if self.pooler_config.pooling_type is None: + self.pooler_config.pooling_type = default_pooling_type self.dtype: torch.dtype = _get_and_verify_dtype( self.model, @@ -869,29 +897,6 @@ class ModelConfig: return get_sentence_transformer_tokenizer_config( self.model, self.revision) - def _init_pooler_config(self) -> Optional["PoolerConfig"]: - if self.runner_type == "pooling": - if isinstance(self.override_pooler_config, dict): - self.override_pooler_config = PoolerConfig( - **self.override_pooler_config) - - pooler_config = self.override_pooler_config or PoolerConfig() - - base_config = get_pooling_config(self.model, self.revision) - if base_config is not None: - # Only set values that are not overridden by the user - for k, v in base_config.items(): - if getattr(pooler_config, k) is None: - setattr(pooler_config, k, v) - - default_pooling_type = self._model_info.default_pooling_type - if pooler_config.pooling_type is None: - pooler_config.pooling_type = default_pooling_type - - return pooler_config - - return None - def _verify_tokenizer_mode(self) -> None: tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) if tokenizer_mode not in get_args(TokenizerMode): @@ -1833,94 +1838,6 @@ class DeviceConfig: self.device = torch.device(self.device_type) -@config -@dataclass -class PoolerConfig: - """Controls the behavior of output pooling in pooling models.""" - - pooling_type: Optional[str] = None - """ - The pooling method of the pooling model. This should be a key in - [`vllm.model_executor.layers.pooler.PoolingType`][]. - """ - - ## for embeddings models - normalize: Optional[bool] = None - """ - Whether to normalize the embeddings outputs. Defaults to True. - """ - dimensions: Optional[int] = None - """ - Reduce the dimensions of embeddings if model - support matryoshka representation. Defaults to None. - """ - enable_chunked_processing: Optional[bool] = None - """ - Whether to enable chunked processing for long inputs that exceed the model's - maximum position embeddings. When enabled, long inputs will be split into - chunks, processed separately, and then aggregated using weighted averaging. - This allows embedding models to handle arbitrarily long text without CUDA - errors. Defaults to False. - """ - max_embed_len: Optional[int] = None - """ - Maximum input length allowed for embedding generation. When set, allows - inputs longer than max_embed_len to be accepted for embedding models. 
- When an input exceeds max_embed_len, it will be handled according to - the original max_model_len validation logic. - Defaults to None (i.e. set to max_model_len). - """ - - ## for classification models - activation: Optional[bool] = None - """ - Whether to apply activation function to the classification outputs. - Defaults to True. - """ - logit_bias: Optional[float] = None - """ - If provided, apply classification logit biases. Defaults to None. - """ - - ## for reward models - softmax: Optional[bool] = None - """ - Whether to apply softmax to the reward outputs. - Defaults to True. - """ - step_tag_id: Optional[int] = None - """ - If set, only the score corresponding to the ``step_tag_id`` in the - generated sentence should be returned. Otherwise, the scores for all tokens - are returned. - """ - returned_token_ids: Optional[list[int]] = None - """ - A list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of ``good_token`` and ``bad_token`` in the - ``math-shepherd-mistral-7b-prm`` model. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.float16, "float16": torch.float16, diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py new file mode 100644 index 0000000000000..85b5a1ace85f6 --- /dev/null +++ b/vllm/config/pooler.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Optional + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + + +@config +@dataclass +class PoolerConfig: + """Controls the behavior of output pooling in pooling models.""" + + pooling_type: Optional[str] = None + """ + The pooling method of the pooling model. This should be a key in + [`vllm.model_executor.layers.pooler.PoolingType`][]. + """ + + ## for embeddings models + normalize: Optional[bool] = None + """ + Whether to normalize the embeddings outputs. Defaults to True. + """ + dimensions: Optional[int] = None + """ + Reduce the dimensions of embeddings if model + support matryoshka representation. Defaults to None. + """ + enable_chunked_processing: Optional[bool] = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + max_embed_len: Optional[int] = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + When an input exceeds max_embed_len, it will be handled according to + the original max_model_len validation logic. + Defaults to None (i.e. set to max_model_len). 
+ """ + + ## for classification models + activation: Optional[bool] = None + """ + Whether to apply activation function to the classification outputs. + Defaults to True. + """ + logit_bias: Optional[float] = None + """ + If provided, apply classification logit biases. Defaults to None. + """ + + ## for reward models + softmax: Optional[bool] = None + """ + Whether to apply softmax to the reward outputs. + Defaults to True. + """ + step_tag_id: Optional[int] = None + """ + If set, only the score corresponding to the ``step_tag_id`` in the + generated sentence should be returned. Otherwise, the scores for all tokens + are returned. + """ + returned_token_ids: Optional[list[int]] = None + """ + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the + ``math-shepherd-mistral-7b-prm`` model. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 63282c4253509..27462b8fa0dad 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -441,6 +441,7 @@ class EngineArgs: scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls + pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config compilation_config: CompilationConfig = \ @@ -579,8 +580,11 @@ class EngineArgs: help=model_kwargs["hf_token"]["help"]) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) + model_group.add_argument("--pooler-config", + **model_kwargs["pooler_config"]) model_group.add_argument("--override-pooler-config", - **model_kwargs["override_pooler_config"]) + **model_kwargs["override_pooler_config"], + deprecated=True) model_group.add_argument("--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]) model_group.add_argument("--generation-config", @@ -1031,6 +1035,7 @@ class EngineArgs: mm_shm_cache_max_object_size_mb=self. mm_shm_cache_max_object_size_mb, mm_encoder_tp_mode=self.mm_encoder_tp_mode, + pooler_config=self.pooler_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index df6b16c73d6e7..e21bfce0ab085 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -151,9 +151,11 @@ class LLM: multi-modal processor obtained from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. - override_pooler_config: Initialize non-default pooling config or - override default pooling config for the pooling model. - e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. 
+ pooler_config: Initialize non-default pooling config for the pooling + model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. + override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This + argument is deprecated and will be removed in v0.12.0 or v1.0.0, + whichever is sooner. compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. @@ -191,6 +193,7 @@ class LLM: hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, + pooler_config: Optional[PoolerConfig] = None, override_pooler_config: Optional[PoolerConfig] = None, structured_outputs_config: Optional[Union[dict[ str, Any], StructuredOutputsConfig]] = None, @@ -288,6 +291,7 @@ class LLM: hf_token=hf_token, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, + pooler_config=pooler_config, override_pooler_config=override_pooler_config, structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, From a3d087adecadd4f6f83b72181ade40ec2de92aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Fri, 19 Sep 2025 13:09:14 +0200 Subject: [PATCH 463/584] [P/D][Nixl] Introduce `KVTransferMetrics` and aggregation strategy (#22188) Signed-off-by: NickLucche <nlucches@redhat.com> --- .../kv_connector/unit/test_nixl_connector.py | 211 +++++++++++++++++- .../kv_transfer/kv_connector/utils.py | 21 +- .../kv_transfer/kv_connector/v1/base.py | 22 +- .../kv_transfer/kv_connector/v1/metrics.py | 100 +++++++++ .../kv_connector/v1/multi_connector.py | 68 +++++- .../kv_connector/v1/nixl_connector.py | 68 +++++- vllm/v1/core/sched/scheduler.py | 27 ++- vllm/v1/metrics/loggers.py | 8 +- vllm/v1/metrics/stats.py | 3 +- vllm/v1/outputs.py | 11 +- .../worker/kv_connector_model_runner_mixin.py | 11 +- 11 files changed, 525 insertions(+), 25 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/metrics.py diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 040b44dc5d2ca..6e58d158c3f4b 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -18,12 +18,18 @@ import torch from vllm import LLM from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) +from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import ( + MultiKVConnectorStats) from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, - NixlConnectorWorker) + NixlConnectorWorker, NixlKVConnectorStats) from vllm.forward_context import ForwardContext from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from .utils import create_request, create_scheduler, create_vllm_config @@ -475,6 +481,209 @@ class TestNixlHandshake: # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then # the rest of the tests. 
+@patch(
+    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+    FakeNixlWrapper)
+def test_kv_connector_stats(dist_init):
+    """Test that KV transfer stats are properly recorded and retrieved."""
+    vllm_config = create_vllm_config()
+
+    # Test worker role in decode server.
+    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+    connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
+                                                         connector.engine_id,
+                                                         hand_shake_latency=0)
+
+    # Verify that xfer_stats starts empty
+    initial_stats = connector.get_kv_connector_stats()
+    assert initial_stats is None
+
+    # Create transfer metadata
+    request_id = "test_req_for_stats"
+    metadata = NixlConnectorMetadata()
+    metadata.add_new_req(request_id=request_id,
+                         local_block_ids=[1, 2, 3],
+                         kv_transfer_params={
+                             "remote_block_ids": [4, 5, 6],
+                             "remote_engine_id":
+                             FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                             "remote_host": "localhost",
+                             "remote_port": 1234,
+                             "remote_tp_size": 1,
+                         })
+    connector.bind_connector_metadata(metadata)
+
+    # Start the transfer
+    dummy_ctx = ForwardContext(
+        no_compile_layers={},
+        attn_metadata={},
+        virtual_engine=0,
+    )
+    connector.start_load_kv(dummy_ctx)
+
+    # Verify stats are recorded after transfer is complete
+    max_iterations = 2
+    # Clear metadata before start_load_kv to prevent reprocessing same request
+    connector.bind_connector_metadata(NixlConnectorMetadata())
+    for _ in range(max_iterations):
+        # Need to call start_load_kv to process completed handshakes
+        connector.start_load_kv(dummy_ctx)
+        _, done_recving = connector.get_finished(finished_req_ids=set())
+        if len(done_recving) > 0 and request_id in done_recving:
+            break
+        time.sleep(
+            0.1)  # Small delay to allow background handshake to complete
+    else:
+        raise AssertionError(
+            "Transfer did not complete within expected iterations")
+
+    # Now check that stats were recorded
+    stats_after_transfer = connector.get_kv_connector_stats()
+    assert isinstance(stats_after_transfer, NixlKVConnectorStats)
+
+    # Verify stats values are recorded
+    assert not stats_after_transfer.is_empty()
+    assert stats_after_transfer.data["num_successful_transfers"] == 1
+
+    # Verify stats are reset after retrieval
+    stats_after_reset = connector.get_kv_connector_stats()
+    assert stats_after_reset is None
+
+
+def test_kv_connector_stats_aggregation():
+    """
+    Test KV transfer stats aggregation across TP ranks using
+    KVOutputAggregator (used by MultiprocExecutor).
+ """ + + # Create KVOutputAggregator for 3 workers (simulating TP=3), same thing + # done in MultiprocExecutor.execute_model + aggregator = KVOutputAggregator(world_size=3) + + # Create stats for multiple workers with different transfer patterns + worker1_stats = NixlKVConnectorStats() + worker2_stats = NixlKVConnectorStats() + worker3_stats = NixlKVConnectorStats() + + # Record different transfers on each worker + # Worker 1: 2 transfers + worker1_stats.record_transfer() + worker1_stats.record_transfer() + + # Worker 2: 1 transfer + worker2_stats.record_transfer() + + # Worker 3: 3 transfers + worker3_stats.record_transfer() + worker3_stats.record_transfer() + worker3_stats.record_transfer() + + # Create ModelRunnerOutput instances for each worker + worker_outputs = [] + for i, worker_stats in enumerate( + [worker1_stats, worker2_stats, worker3_stats]): + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], # dummy token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) + if i < 2 else None, # Workers 0,1 finished sending + finished_recving=set([f"req_{i}_recv"]) + if i > 0 else None, # Workers 1,2 finished receiving + kv_connector_stats=worker_stats, + )) + worker_outputs.append(output) + + # Use the real aggregation mechanism (like MultiprocExecutor.execute_model) + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_connector_stats = \ + aggregated_output.kv_connector_output.kv_connector_stats + assert isinstance(kv_connector_stats, NixlKVConnectorStats) + # Number of total transfers across all workers. + assert kv_connector_stats.data["num_successful_transfers"] == 6 + + +def test_multi_kv_connector_stats_aggregation(): + """ + Test MultiKVConnectorStats aggregation across TP ranks using + KVOutputAggregator (used by MultiprocExecutor). 
+ """ + + aggregator = KVOutputAggregator(world_size=3) + + from dataclasses import dataclass + + @dataclass + class FooKVConnectorStats(KVConnectorStats): + + def reset(self): + self.data = {"num_foo_transfers": 0} + + def record_transfer(self): + if "num_foo_transfers" not in self.data: + self.data["num_foo_transfers"] = 0 + self.data["num_foo_transfers"] += 1 + + def is_empty(self) -> bool: + return self.data["num_foo_transfers"] == 0 + + def aggregate(self, + other: "FooKVConnectorStats") -> "FooKVConnectorStats": + if not other.is_empty(): + self.data["num_foo_transfers"] += other.data[ + "num_foo_transfers"] + return self + + def make_multi_stats(nixl_count: int, + foo_count: int) -> MultiKVConnectorStats: + data: dict[str, KVConnectorStats] = {} + if nixl_count > 0: + nixl_stats = NixlKVConnectorStats() + for _ in range(nixl_count): + nixl_stats.record_transfer() + data["NixlConnector"] = nixl_stats + if foo_count > 0: + foo_stats = FooKVConnectorStats() + for _ in range(foo_count): + foo_stats.record_transfer() + data["FooConnector"] = foo_stats + return MultiKVConnectorStats(data=data) + + # Create heterogeneous stats across 3 workers + worker_patterns = [(2, 1), (3, 0), (0, 5)] # (Nixl, Foo) + + worker_outputs: list[ModelRunnerOutput] = [] + for i, (nixl, foo) in enumerate(worker_patterns): + stats = make_multi_stats(nixl, foo) + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) if i < 2 else None, + finished_recving=set([f"req_{i}_recv"]) if i > 0 else None, + kv_connector_stats=stats, + ), + ) + worker_outputs.append(output) + + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_connector_stats = \ + aggregated_output.kv_connector_output.kv_connector_stats + assert isinstance(kv_connector_stats, MultiKVConnectorStats) + + # Validate per-connector totals across workers + assert kv_connector_stats["NixlConnector"].data[ + "num_successful_transfers"] == 5 + assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6 + + @pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index f4dc248a12794..911d77ba36fa0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -129,7 +129,7 @@ class KVOutputAggregator: def aggregate(self, outputs: list[ModelRunnerOutput], output_rank: int = 0) -> ModelRunnerOutput: - # aggregate kv_connector_output from all workers + # Aggregate kv_connector_output from all workers def update_finished_set(req_ids: Optional[set[str]], remaining_count_dict: dict[str, int], @@ -142,8 +142,9 @@ class KVOutputAggregator: finished_sending = set[str]() finished_recving = set[str]() - for output in outputs: - output = output.kv_connector_output + aggregated_kv_connector_stats = None + for model_runner_output in outputs: + output = model_runner_output.kv_connector_output if not output: continue update_finished_set(output.finished_sending, @@ -151,12 +152,26 @@ class KVOutputAggregator: update_finished_set(output.finished_recving, self._recv_remaining_count, finished_recving) + # Aggregate kv_connector_stats from all workers. 
+ if aggregated_kv_connector_stats is None: + # Use the first worker's kv_connector_stats as accumulator. + aggregated_kv_connector_stats = output.kv_connector_stats + elif kv_connector_stats := output.kv_connector_stats: + if aggregated_kv_connector_stats is None: + aggregated_kv_connector_stats = kv_connector_stats + else: + assert isinstance(aggregated_kv_connector_stats, + type(kv_connector_stats)) + aggregated_kv_connector_stats = \ + aggregated_kv_connector_stats.aggregate(kv_connector_stats) + # select output of the worker specified by output_rank output = outputs[output_rank] output.kv_connector_output = KVConnectorOutput( finished_sending=finished_sending or None, finished_recving=finished_recving or None, + kv_connector_stats=aggregated_kv_connector_stats or None, ) return output diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 70c07eac6304b..184d0a62f2c30 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -49,6 +49,8 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request @@ -235,6 +237,12 @@ class KVConnectorBase_V1(ABC): """ return None + def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]: + """ + Get the KV connector stats collected during the last interval. + """ + return None + # ============================== # Scheduler-side methods # ============================== @@ -365,4 +373,16 @@ class KVConnectorBase_V1(ABC): int: expected sending or receiving completion count. """ - return None \ No newline at end of file + return None + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional["KVConnectorStats"]: + """ + KVConnectorStats resolution method. This method allows dynamically + registered connectors to return their own KVConnectorStats object, + which can implement custom aggregation logic on the data dict. + """ + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py new file mode 100644 index 0000000000000..e40007230ba45 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +from vllm.config.kv_transfer import KVTransferConfig +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_transfer_state import ( + has_kv_transfer_group) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass +class KVConnectorStats: + """ + Base class for KV Connector Stats, a container for transfer performance + metrics or otherwise important telemetry from the connector. + All sub-classes need to be serializable as stats are sent from worker to + logger process. 
+ """ + data: dict[str, Any] = field(default_factory=dict) + + def reset(self): + """Reset the stats, clear the state.""" + raise NotImplementedError + + def aggregate(self, other: "KVConnectorStats") -> "KVConnectorStats": + """ + Aggregate stats with another `KVConnectorStats` object. + """ + raise NotImplementedError + + def reduce(self) -> dict[str, Union[int, float]]: + """ + Reduce the observations collected during a time interval to one or + more representative values (eg avg/median/sum of the series). + This is meant to be called by the logger to produce a summary of the + stats for the last time interval. + """ + raise NotImplementedError + + def is_empty(self) -> bool: + """Return True if the stats are empty.""" + raise NotImplementedError + + +class KVConnectorLogging: + + def __init__(self, kv_tranfer_config: KVTransferConfig): + # This should be called on frontend process. + assert not has_kv_transfer_group() + # Instantiate the connector's stats class. + if kv_tranfer_config and kv_tranfer_config.kv_connector: + self.connector_cls = KVConnectorFactory.get_connector_class( + kv_tranfer_config) + self.reset() + + def reset(self): + self.transfer_stats_accumulator: Optional[KVConnectorStats] = None + + def observe(self, transfer_stats_data: dict[str, Any]): + # Should not be called when a KVConnector is not configured. + assert self.connector_cls is not None + # Called periodically when connector syncs with the scheduler. + # Note that this is not the same as the logging interval. + # We expect transfer_stats_data to be aggregated across all workers and + # consist of observations from a single connector or a MultiConnector. + transfer_stats = self.connector_cls.build_kv_connector_stats( + transfer_stats_data) + if transfer_stats is None: + logger.warning_once( + "The connector %s is collecting stats but " + "does not implement the " + "`build_kv_connector_stats` method. " + "Stats will not be logged.", self.connector_cls) + return + + if self.transfer_stats_accumulator is None: + self.transfer_stats_accumulator = transfer_stats + else: + # Accumulate last interval stats. + self.transfer_stats_accumulator = \ + self.transfer_stats_accumulator.aggregate(transfer_stats) + + def log(self, log_fn=logger.info): + """Log transfer metrics periodically, similar to throughput logging""" + if (self.transfer_stats_accumulator + and not self.transfer_stats_accumulator.is_empty()): + # Produce a single cumulative stats object for the last time + # interval from the recorded observations. 
+ xfer_metrics = self.transfer_stats_accumulator.reduce() + xfer_metrics_str = ", ".join(f"{k}={v}" + for k, v in xfer_metrics.items()) + log_fn("KV Transfer metrics: %s", xfer_metrics_str) + + # Reset metrics for next interval + self.reset() \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 616d158d67670..6836a71e58d62 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -9,19 +9,21 @@ import torch from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig -from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.logger import init_logger -from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata + from vllm.distributed.kv_events import KVCacheEvent from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request logger = init_logger(__name__) @@ -33,6 +35,43 @@ class MultiKVConnectorMetadata(KVConnectorMetadata): extra_async_saves: Optional[dict[str, int]] = None +@dataclass +class MultiKVConnectorStats(KVConnectorStats): + """ + Maintain a dict of KVConnectorStats objects, one for each connector. + This is used to aggregate the stats from all connectors separately. + """ + + def aggregate(self, other: KVConnectorStats) -> KVConnectorStats: + for connector_id, stats in other.data.items(): + if connector_id not in self.data: + self[connector_id] = stats + else: + assert isinstance(stats, type(self.data[connector_id])) + self[connector_id] = self[connector_id].aggregate(stats) + return self + + def reset(self): + for stats in self.data.values(): + stats.reset() + + def reduce(self) -> dict[str, Any]: + # TODO (NickLucche) Adjust for logging on separate lines + return { + connector_id: stats.reduce() + for connector_id, stats in self.data.items() + } + + def is_empty(self) -> bool: + return all(stats.is_empty() for stats in self.data.values()) + + def __getitem__(self, connector_id: str) -> KVConnectorStats: + return self.data[connector_id] + + def __setitem__(self, connector_id: str, stats: KVConnectorStats): + self.data[connector_id] = stats + + class MultiConnector(KVConnectorBase_V1): """ A wrapper for using multiple KVConnectors at the same time. 
@@ -46,6 +85,7 @@ class MultiConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._connectors: list[KVConnectorBase_V1] = [] + self._ktc_kv_transfer_config = [] ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( "connectors") assert ktcs is not None @@ -57,6 +97,7 @@ class MultiConnector(KVConnectorBase_V1): **ktc, engine_id=engine_id) self._connectors.append( KVConnectorFactory.create_connector(temp_config, role)) + self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config) # A mapping from request id to the index of the connector chosen to # load the request from (if any). @@ -227,7 +268,7 @@ class MultiConnector(KVConnectorBase_V1): return async_saves > 0, kv_txfer_params - def take_events(self) -> Iterable[KVCacheEvent]: + def take_events(self) -> Iterable["KVCacheEvent"]: for c in self._connectors: yield from c.take_events() @@ -264,3 +305,24 @@ class MultiConnector(KVConnectorBase_V1): f"({', '.join(layouts) })." f"All connectors must use the same layout.") return next(iter(layouts), None) + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional[KVConnectorStats]: + return MultiKVConnectorStats(data=data) if data is not None \ + else MultiKVConnectorStats() + + def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]: + # Group connector stats by connector type. + stats_by_connector: Optional[MultiKVConnectorStats] = None + for c in self._connectors: + stats = c.get_kv_connector_stats() + if stats is None: + continue + if stats_by_connector is None: + # Lazy init to allow optional return value. + stats_by_connector = MultiKVConnectorStats() + stats_by_connector[c.__class__.__name__] = stats + return stats_by_connector diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 1ff1407aeb99b..ff62f60e5a42c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import copy import logging import math import queue @@ -11,7 +12,7 @@ from collections import defaultdict from collections.abc import Iterator from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import msgspec import numpy as np @@ -23,6 +24,8 @@ from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) @@ -33,7 +36,6 @@ from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.request import RequestStatus if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -206,6 
+208,18 @@ class NixlConnector(KVConnectorBase_V1): assert self.connector_worker is not None return self.connector_worker.get_finished() + def get_kv_connector_stats(self) -> Optional[KVConnectorStats]: + assert self.connector_worker is not None + return self.connector_worker.get_kv_connector_stats() + + @classmethod + def build_kv_connector_stats( + cls, + data: Optional[dict[str, + Any]] = None) -> Optional[KVConnectorStats]: + return NixlKVConnectorStats(data=data) if data is not None \ + else NixlKVConnectorStats() + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None @@ -377,6 +391,7 @@ class NixlConnectorScheduler: Once a request is finished, determine whether request blocks should be freed now or will be sent asynchronously and freed later. """ + from vllm.v1.request import RequestStatus params = request.kv_transfer_params logger.debug( @@ -550,6 +565,7 @@ class NixlConnectorWorker: # With heterogeneous TP, P must wait for all assigned D TP workers to # finish reading before safely freeing the blocks. self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) + self.xfer_stats = NixlKVConnectorStats() def __del__(self): """Cleanup background threads on destruction.""" @@ -1097,6 +1113,8 @@ class NixlConnectorWorker: xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": self.nixl_wrapper.release_xfer_handle(handle) + # TODO (NickLucche) Get from NIXL telemetry once integrated + self.xfer_stats.record_transfer() elif xfer_state == "PROC": in_progress = True continue @@ -1248,7 +1266,6 @@ class NixlConnectorWorker: self.nixl_wrapper.transfer(handle) # Use handle to check completion in future step(). - # TODO (NickLucche) surface xfer elapsed time self._recving_transfers[request_id].append( (handle, time.perf_counter())) @@ -1300,6 +1317,15 @@ class NixlConnectorWorker: block_len = self.block_len return block_len + def get_kv_connector_stats(self) -> Optional[KVConnectorStats]: + """ + Get the KV transfer stats for the connector. 
+ """ + # Clear stats for next iteration + if not self.xfer_stats.is_empty(): + return self.xfer_stats.clone_and_reset() + return None + @contextlib.contextmanager def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: @@ -1318,3 +1344,39 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: finally: if ctx is not None: ctx.destroy(linger=0) + + +@dataclass +class NixlKVConnectorStats(KVConnectorStats): + """Container for transfer performance metrics""" + + def __post_init__(self): + if "num_successful_transfers" not in self.data: + self.data["num_successful_transfers"] = 0 + + def reset(self): + self.data = {"num_successful_transfers": 0} + + def record_transfer(self): + # TODO: record actual transfer stats when available + self.data["num_successful_transfers"] += 1 + + def clone_and_reset(self) -> "NixlKVConnectorStats": + old = copy.copy(self) + self.reset() + return old + + def is_empty(self) -> bool: + return self.data["num_successful_transfers"] == 0 + + def aggregate(self, other: KVConnectorStats) -> KVConnectorStats: + if not other.is_empty(): + self.data["num_successful_transfers"] += other.data[ + "num_successful_transfers"] + return self + + def reduce(self) -> dict[str, Union[int, float]]: + # TODO: reduce stats to a single value, calculate latency/throughput + return { + "num_successful_transfers": self.data["num_successful_transfers"] + } \ No newline at end of file diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 85ca858ad7bd6..b08898d253cab 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -15,6 +15,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, @@ -869,9 +871,12 @@ class Scheduler(SchedulerInterface): num_scheduled_tokens = scheduler_output.num_scheduled_tokens pooler_outputs = model_runner_output.pooler_output num_nans_in_logits = model_runner_output.num_nans_in_logits + kv_connector_output = model_runner_output.kv_connector_output outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: Optional[SpecDecodingStats] = None + kv_connector_stats = (kv_connector_output.kv_connector_stats + if kv_connector_output else None) # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. We should do our best @@ -1007,7 +1012,8 @@ class Scheduler(SchedulerInterface): finished_requests=finished_set) finished_req_ids.clear() - if (stats := self.make_stats(spec_decoding_stats)) is not None: + if (stats := self.make_stats(spec_decoding_stats, + kv_connector_stats)) is not None: # Return stats to only one of the front-ends. 
if (eco := next(iter(engine_core_outputs.values()), None)) is None: # We must return the stats even if there are no request @@ -1172,20 +1178,21 @@ class Scheduler(SchedulerInterface): def make_stats( self, spec_decoding_stats: Optional[SpecDecodingStats] = None, + kv_connector_stats: Optional[KVConnectorStats] = None, ) -> Optional[SchedulerStats]: if not self.log_stats: return None prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats() assert prefix_cache_stats is not None - return SchedulerStats( - num_running_reqs=len(self.running), - num_waiting_reqs=len(self.waiting), - kv_cache_usage=self.kv_cache_manager.usage, - prefix_cache_stats=prefix_cache_stats, - spec_decoding_stats=spec_decoding_stats, - num_corrupted_reqs=sum(req.is_output_corrupted - for req in self.running), - ) + return SchedulerStats(num_running_reqs=len(self.running), + num_waiting_reqs=len(self.waiting), + kv_cache_usage=self.kv_cache_manager.usage, + prefix_cache_stats=prefix_cache_stats, + spec_decoding_stats=spec_decoding_stats, + num_corrupted_reqs=sum(req.is_output_corrupted + for req in self.running), + kv_connector_stats=kv_connector_stats.data + if kv_connector_stats else None) def make_spec_decoding_stats( self, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b30036a6f8e80..f0076b2d81dbf 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,6 +9,8 @@ from typing import Callable, Optional, Union import prometheus_client from vllm.config import SupportsMetricsInfo, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorLogging) from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason @@ -59,6 +61,8 @@ class LoggingStatLogger(StatLoggerBase): # TODO: Make the interval configurable. 
self.prefix_caching_metrics = PrefixCachingMetrics() self.spec_decoding_logging = SpecDecodingLogging() + kv_tranfer_config = self.vllm_config.kv_transfer_config + self.kv_transfer_logging = KVConnectorLogging(kv_tranfer_config) self.last_prompt_throughput: float = 0.0 self.last_generation_throughput: float = 0.0 @@ -97,7 +101,8 @@ class LoggingStatLogger(StatLoggerBase): if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_logging.observe( scheduler_stats.spec_decoding_stats) - + if kv_connector_stats := scheduler_stats.kv_connector_stats: + self.kv_transfer_logging.observe(kv_connector_stats) self.last_scheduler_stats = scheduler_stats def log(self): @@ -136,6 +141,7 @@ class LoggingStatLogger(StatLoggerBase): self.prefix_caching_metrics.hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) + self.kv_transfer_logging.log(log_fn=log_fn) def log_engine_initialized(self): if self.vllm_config.cache_config.num_gpu_blocks: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index e6c344d193df2..0eff557336bc0 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -3,7 +3,7 @@ import time from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional from vllm.v1.spec_decode.metrics import SpecDecodingStats @@ -43,6 +43,7 @@ class SchedulerStats: default_factory=PrefixCacheStats) spec_decoding_stats: Optional[SpecDecodingStats] = None + kv_connector_stats: Optional[dict[str, Any]] = None num_corrupted_reqs: int = 0 diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 1b2da8addb19e..e6cc6019b1728 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -3,10 +3,14 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import NamedTuple, Optional +from typing import TYPE_CHECKING, NamedTuple, Optional import torch +if TYPE_CHECKING: + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) + class LogprobsLists(NamedTuple): @@ -77,6 +81,11 @@ class KVConnectorOutput: # [req_ids] finished_sending: Optional[set[str]] = None finished_recving: Optional[set[str]] = None + kv_connector_stats: Optional["KVConnectorStats"] = None + + def is_empty(self): + return (not self.finished_sending and not self.finished_recving + and not self.kv_connector_stats) # ModelRunnerOutput is serialized and sent to the scheduler process. 
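The pieces above close the telemetry loop introduced by this commit: a worker-side connector fills a `KVConnectorStats` container, `KVOutputAggregator` folds one container per TP rank into a single object, the scheduler attaches its `data` dict to `SchedulerStats`, and `KVConnectorLogging` rebuilds and accumulates it in the frontend until the next log interval. A minimal sketch of a third-party stats class satisfying that contract; the byte counter is illustrative (modeled on the `FooKVConnectorStats` helper in the tests above), not part of this patch:

    from dataclasses import dataclass
    from typing import Union

    from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
        KVConnectorStats)

    @dataclass
    class ByteCountKVConnectorStats(KVConnectorStats):
        """Hypothetical container counting transferred bytes."""

        def reset(self):
            self.data = {"num_bytes": 0}

        def record_transfer(self, nbytes: int):
            # Called by the worker-side connector after each transfer.
            self.data.setdefault("num_bytes", 0)
            self.data["num_bytes"] += nbytes

        def is_empty(self) -> bool:
            return not self.data.get("num_bytes")

        def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
            # Invoked once per extra TP rank by KVOutputAggregator.aggregate().
            if not other.is_empty():
                self.record_transfer(other.data["num_bytes"])
            return self

        def reduce(self) -> dict[str, Union[int, float]]:
            # Summary emitted by KVConnectorLogging.log() each interval.
            return {"num_bytes": self.data.get("num_bytes", 0)}

For the logger process to rebuild such an object from its serialized `data` dict, the owning connector would also override `get_kv_connector_stats()` on its worker and the `build_kv_connector_stats()` classmethod, as `NixlConnector` does above.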
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 3eb9f26e9f5b6..016a90c196ba3 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -13,6 +13,8 @@ from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group) from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorStats) from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, @@ -119,4 +121,11 @@ class KVConnectorModelRunnerMixin: output.finished_sending, output.finished_recving = ( kv_connector.get_finished(scheduler_output.finished_req_ids)) - kv_connector.clear_connector_metadata() + output.kv_connector_stats = KVConnectorModelRunnerMixin.\ + get_kv_connector_stats() + + @staticmethod + def get_kv_connector_stats() -> Optional[KVConnectorStats]: + if has_kv_transfer_group(): + return get_kv_transfer_group().get_kv_connector_stats() + return None From 5089fd749cbe4233a29f29ce706d56c47464c117 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Fri, 19 Sep 2025 19:10:52 +0800 Subject: [PATCH 464/584] [V0 Deprecation] Remove V0 logic from `get_input_embeddings` interface (#25242) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../models/hyperclovax_vision.py | 43 +++++++------------ vllm/model_executor/models/interfaces.py | 24 ----------- vllm/model_executor/models/ultravox.py | 19 ++------ vllm/model_executor/models/utils.py | 18 +------- 4 files changed, 21 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 53f0585541b1c..870addd0dcbca 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -46,7 +46,8 @@ from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel -from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) from .vision import get_vision_encoder_info EOT = "<|endofturn|>" @@ -740,33 +741,20 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - **kwargs, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if (kwargs.get("pixel_values_images") is not None - or kwargs.get("pixel_values_videos") - is not None): # v0 compatibility - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - if multimodal_embeddings is not None: - multimodal_embeddings = torch.cat(multimodal_embeddings, dim=0) - _mask_image = input_ids == self.config.image_token_id - _mask_video = input_ids == self.config.video_token_id - assert _mask_image.sum() + _mask_video.sum() == len( - multimodal_embeddings) + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + placeholder_token_id=[ + 
self.config.image_token_id, + self.config.video_token_id, + ], + ) - if multimodal_embeddings.dtype != inputs_embeds.dtype: - multimodal_embeddings = multimodal_embeddings.to( - dtype=inputs_embeds.dtype) - if multimodal_embeddings.device != inputs_embeds.device: - multimodal_embeddings = multimodal_embeddings.to( - device=inputs_embeds.device) - - if _mask_image.sum() > 0: - inputs_embeds[ - _mask_image] = multimodal_embeddings[:sum(_mask_image)] - if _mask_video.sum() > 0: - inputs_embeds[_mask_video] = multimodal_embeddings[ - -sum(_mask_video):] return inputs_embeds def forward( @@ -783,8 +771,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: - inputs_embeds = self.get_input_embeddings(input_ids=input_ids, - **kwargs) + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index e9c600e36cfa7..6be70c4b3b214 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -23,7 +23,6 @@ from vllm.utils import supports_kw from .interfaces_base import is_pooling_model if TYPE_CHECKING: - from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors @@ -97,33 +96,10 @@ class SupportsMultiModal(Protocol): """ ... - # Only for models that support v0 chunked prefill - # TODO(ywang96): Remove this overload once v0 is deprecated - @overload def get_input_embeddings( self, input_ids: Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - attn_metadata: Optional["AttentionMetadata"] = None, - ) -> Tensor: - ... - - # TODO: Remove this overload once v0 is deprecated - @overload - def get_input_embeddings( - self, - input_ids: Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> Tensor: - ... 
- - def get_input_embeddings( - self, - input_ids: Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - # Only necessary so that the v0 overload is valid - # TODO: Remove attn_metadata once v0 is deprecated - attn_metadata: Optional["AttentionMetadata"] = None, ) -> Tensor: """ Returns the input embeddings merged from the text embeddings from diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 371ca817d5f92..f1f11c5fe8f00 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -13,9 +13,7 @@ from transformers import BatchFeature, ProcessorMixin from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper.modeling_whisper import WhisperEncoder -from vllm import envs from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader @@ -37,8 +35,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings, - merge_multimodal_embeddings_from_map) + merge_multimodal_embeddings) _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 @@ -568,17 +565,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): safe_input_ids) if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: - - # TODO(ywang96): remove this block after v0 is deprecated. - if not envs.VLLM_USE_V1: - attn_metadata = get_forward_context().attn_metadata - merge_multimodal_embeddings_from_map( - inputs_embeds, multimodal_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) - else: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.audio_token_index) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.audio_token_index) return inputs_embeds def forward(self, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e716ec582baab..83e381b3b1578 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -15,7 +15,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors +from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available, is_uva_available) @@ -389,22 +389,6 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) -def merge_multimodal_embeddings_from_map( - inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor: - """ - Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided - placeholder map . - - Note: - This updates ``inputs_embeds`` in place. 
- """ - flattened_embeddings = _flatten_embeddings(multimodal_embeddings) - inputs_embeds[placeholder_map.dest] = flattened_embeddings[ - placeholder_map.src].to(dtype=inputs_embeds.dtype) - return inputs_embeds - - def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, From 838d7116ba59db528647b29f0d000742f4af9d4b Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Fri, 19 Sep 2025 20:25:12 +0800 Subject: [PATCH 465/584] [Qwen] Remove cuda hard-code in qwen3 next (#25243) Signed-off-by: Icey <1790571317@qq.com> --- vllm/model_executor/models/qwen3_next.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 0c974ee44eee2..98749c160ba4d 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -306,7 +306,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): eps=self.layer_norm_epsilon, group_size=None, norm_before_gate=True, - device=torch.cuda.current_device(), + device=current_platform.current_device(), dtype=config.torch_dtype, ) From cf278ff3b231b4fca0232db2d1183427dbc200bb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:12:55 +0100 Subject: [PATCH 466/584] Update CODEOWNERS (#25269) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 08717cdde643a..323675993467a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -66,18 +66,25 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/models/test_transformers.py @hmellor # Docs -/docs @hmellor +/docs/mkdocs @hmellor +/docs/**/*.yml @hmellor +/requirements/docs.txt @hmellor +.readthedocs.yaml @hmellor mkdocs.yaml @hmellor +# Linting +.markdownlint.yaml @hmellor +.pre-commit-config.yaml @hmellor + # CPU -/vllm/v1/worker/^cpu @bigPYJ1151 +/vllm/v1/worker/cpu* @bigPYJ1151 /csrc/cpu @bigPYJ1151 /vllm/platforms/cpu.py @bigPYJ1151 /cmake/cpu_extension.cmake @bigPYJ1151 /docker/Dockerfile.cpu @bigPYJ1151 # Intel GPU -/vllm/v1/worker/^xpu @jikunshang +/vllm/v1/worker/xpu* @jikunshang /vllm/platforms/xpu.py @jikunshang /docker/Dockerfile.xpu @jikunshang From aed16879a9191a58adc5b8ac3973454dddefe018 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:22:33 +0100 Subject: [PATCH 467/584] Move `ModelConfig` from `config/__init__.py` to `config/model.py` (#25252) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/conftest.py | 3 +- tests/distributed/test_pipeline_parallel.py | 2 +- tests/models/test_initialization.py | 5 +- tests/v1/sample/test_logprobs.py | 9 +- vllm/config/__init__.py | 2108 +------------------ vllm/config/model.py | 2006 ++++++++++++++++++ vllm/config/scheduler.py | 8 +- vllm/config/utils.py | 100 +- vllm/engine/arg_utils.py | 15 +- vllm/model_executor/model_loader/utils.py | 7 +- vllm/model_executor/models/registry.py | 22 +- vllm/v1/sample/ops/topk_topp_sampler.py | 13 +- vllm/v1/sample/sampler.py | 11 +- 13 files changed, 2160 insertions(+), 2149 deletions(-) create mode 100644 vllm/config/model.py diff --git a/tests/conftest.py b/tests/conftest.py index 9d433dedbf479..3cd93f4ad3289 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,7 +39,8 @@ from vllm import LLM, SamplingParams from 
vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype +from vllm.config.model import (ConvertOption, RunnerOption, + _get_and_verify_dtype) from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index fcd09844c0951..073b362b64749 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional import pytest -from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption +from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption from vllm.logger import init_logger from vllm.transformers_utils.config import get_config diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 56b5d32d16536..9281579b71e74 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,6 @@ from unittest.mock import patch import pytest from vllm import LLM -from vllm.config import ModelImpl from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import get_kv_cache_configs @@ -111,8 +110,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # these tests seem to produce leftover memory gpu_memory_utilization=0.80, load_format="dummy", - model_impl=ModelImpl.TRANSFORMERS - if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, + model_impl="transformers" + if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm", hf_overrides=hf_overrides_fn, max_num_seqs=model_info.max_num_seqs) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 570e330208a39..71aa9e3d379cf 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -3,6 +3,7 @@ import itertools from collections.abc import Generator +from typing import get_args import pytest import torch @@ -464,7 +465,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(prompt_logprob) == vocab_size -@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) +@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
@@ -493,14 +494,12 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, - LogprobsMode.PROCESSED_LOGPROBS): + if logprobs_mode in ("raw_logprobs", "processed_logprobs"): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if logprobs_mode in (LogprobsMode.RAW_LOGITS, - LogprobsMode.PROCESSED_LOGITS): + if logprobs_mode in ("raw_logits", "processed_logits"): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 45504e010d688..ddd8de4324f6f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,27 +4,22 @@ # ruff: noqa: F401 import ast import copy -import enum import hashlib import inspect import json import os import textwrap -import warnings from contextlib import contextmanager -from dataclasses import InitVar, field, fields, is_dataclass, replace +from dataclasses import field, fields, is_dataclass, replace from functools import cached_property, lru_cache -from importlib.util import find_spec -from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol, - TypeVar, Union, cast, get_args) +from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar, + Union, cast) import regex as re import torch -from pydantic import (ConfigDict, SkipValidation, field_validator, - model_validator) +from pydantic import ConfigDict, SkipValidation from pydantic.dataclasses import dataclass -from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from typing_extensions import assert_never, runtime_checkable +from typing_extensions import runtime_checkable import vllm.envs as envs from vllm import version @@ -36,45 +31,31 @@ from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig +from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode, + ModelConfig, ModelDType, ModelImpl, + RunnerOption, TaskOption, TokenizerMode, + iter_architecture_defaults, + try_match_architecture_defaults) from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.pooler import PoolerConfig -from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy +from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig from vllm.config.structured_outputs import StructuredOutputsConfig -from vllm.config.utils import ConfigType, config +from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.platforms import current_platform -from vllm.transformers_utils.config import ( - ConfigFormat, get_config, get_hf_image_processor_config, - get_hf_text_config, get_pooling_config, - get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, maybe_override_with_speculators_target_model, - try_get_generation_config, try_get_safetensors_metadata, - try_get_tokenizer_config, uses_mrope) -from 
vllm.transformers_utils.runai_utils import (ObjectStorageModel, - is_runai_obj_uri) -from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, - STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, - LazyLoader, common_broadcastable_dtype, random_uuid) +from vllm.transformers_utils.runai_utils import is_runai_obj_uri +from vllm.utils import random_uuid if TYPE_CHECKING: from _typeshed import DataclassInstance from transformers.configuration_utils import PretrainedConfig - import vllm.model_executor.layers.quantization as me_quant - import vllm.model_executor.models as me_models - from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) - from vllm.v1.sample.logits_processor import LogitsProcessor - - HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any PretrainedConfig = Any @@ -82,83 +63,10 @@ else: QuantizationMethods = Any BaseModelLoader = Any LogitsProcessor = Any - HfOverrides = Union[dict[str, Any], Callable[[type], type]] - - me_quant = LazyLoader("model_executor", globals(), - "vllm.model_executor.layers.quantization") - me_models = LazyLoader("model_executor", globals(), - "vllm.model_executor.models") logger = init_logger(__name__) DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance) -TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", - "score", "reward", "transcription", "draft"] - -_ResolvedTask = Literal["generate", "transcription", "encode", "embed", - "classify", "reward", "draft"] - -RunnerOption = Literal["auto", "generate", "pooling", "draft"] - -RunnerType = Literal["generate", "pooling", "draft"] - -ConvertOption = Literal["auto", "none", "embed", "classify", "reward"] - -ConvertType = Literal["none", "embed", "classify", "reward"] - -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - -_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { - "generate": [], - "pooling": ["embed", "classify", "reward"], - "draft": [], -} - -# Some model suffixes are based on auto classes from Transformers: -# https://huggingface.co/docs/transformers/en/model_doc/auto -# NOTE: Items higher on this list take priority over lower ones -_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ - ("ForCausalLM", ("generate", "none")), - ("ForConditionalGeneration", ("generate", "none")), - ("ChatModel", ("generate", "none")), - ("LMHeadModel", ("generate", "none")), - ("ForTextEncoding", ("pooling", "embed")), - ("EmbeddingModel", ("pooling", "embed")), - ("ForSequenceClassification", ("pooling", "classify")), - ("ForAudioClassification", ("pooling", "classify")), - ("ForImageClassification", ("pooling", "classify")), - ("ForVideoClassification", ("pooling", "classify")), - ("ClassificationModel", ("pooling", "classify")), - ("ForRewardModeling", ("pooling", "reward")), - ("RewardModel", ("pooling", "reward")), - # Let other `*Model`s take priority - ("Model", ("pooling", "embed")), -] - - -def iter_architecture_defaults(): - yield from _SUFFIX_TO_DEFAULTS - - -def try_match_architecture_defaults( - architecture: str, - *, - runner_type: Optional[RunnerType] = None, - convert_type: Optional[ConvertType] = None, -) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]: - for suffix, (default_runner_type, -
default_convert_type) in iter_architecture_defaults(): - if ((runner_type is None or runner_type == default_runner_type) and - (convert_type is None or convert_type == default_convert_type) - and architecture.endswith(suffix)): - return suffix, (default_runner_type, default_convert_type) - - return None - @runtime_checkable class SupportsHash(Protocol): @@ -173,1608 +81,6 @@ class SupportsMetricsInfo(Protocol): ... -class ModelImpl(str, enum.Enum): - AUTO = "auto" - VLLM = "vllm" - TRANSFORMERS = "transformers" - TERRATORCH = "terratorch" - - -def get_attr_docs(cls: type[Any]) -> dict[str, str]: - """ - Get any docstrings placed after attribute assignments in a class body. - - https://davidism.com/mit-license/ - """ - - def pairwise(iterable): - """ - Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise - - Can be removed when Python 3.9 support is dropped. - """ - iterator = iter(iterable) - a = next(iterator, None) - - for b in iterator: - yield a, b - a = b - - try: - cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] - except (OSError, KeyError, TypeError): - # HACK: Python 3.13+ workaround - set missing __firstlineno__ - # Workaround can be removed after we upgrade to pydantic==2.12.0 - with open(inspect.getfile(cls)) as f: - for i, line in enumerate(f): - if f"class {cls.__name__}" in line and ":" in line: - cls.__firstlineno__ = i + 1 - break - cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] - - if not isinstance(cls_node, ast.ClassDef): - raise TypeError("Given object was not a class.") - - out = {} - - # Consider each pair of nodes. - for a, b in pairwise(cls_node.body): - # Must be an assignment then a constant string. - if (not isinstance(a, (ast.Assign, ast.AnnAssign)) - or not isinstance(b, ast.Expr) - or not isinstance(b.value, ast.Constant) - or not isinstance(b.value.value, str)): - continue - - doc = inspect.cleandoc(b.value.value) - - # An assignment can have multiple targets (a = b = v), but an - # annotated assignment only has one target. - targets = a.targets if isinstance(a, ast.Assign) else [a.target] - - for target in targets: - # Must be assigning to a plain name. - if not isinstance(target, ast.Name): - continue - - out[target.id] = doc - - return out - - -def is_init_field(cls: ConfigType, name: str) -> bool: - return next(f for f in fields(cls) if f.name == name).init - - -TokenizerMode = Literal["auto", "slow", "mistral", "custom"] -ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] - - -class LogprobsMode(enum.Enum): - RAW_LOGITS = "raw_logits" - RAW_LOGPROBS = "raw_logprobs" - PROCESSED_LOGITS = "processed_logits" - PROCESSED_LOGPROBS = "processed_logprobs" - - -@config -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class ModelConfig: - """Configuration for the model.""" - - model: str = "Qwen/Qwen3-0.6B" - """Name or path of the Hugging Face model to use. It is also used as the - content for `model_name` tag in metrics output when `served_model_name` is - not specified.""" - runner: RunnerOption = "auto" - """The type of model runner to use. Each vLLM instance only supports one - model runner, even if the same model can be used for multiple types.""" - convert: ConvertOption = "auto" - """Convert the model using adapters defined in - [vllm.model_executor.models.adapters][]. 
The most common use case is to - adapt a text generation model to be used for pooling tasks.""" - task: Optional[TaskOption] = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ - tokenizer: SkipValidation[str] = None # type: ignore - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode = "auto" - """Tokenizer mode:\n - - "auto" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "custom" will use --tokenizer to select the preregistered tokenizer.""" - trust_remote_code: bool = False - """Trust remote code (e.g., from HuggingFace) when downloading the model - and tokenizer.""" - dtype: Union[ModelDType, torch.dtype] = "auto" - """Data type for model weights and activations:\n - - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 - precision for BF16 models.\n - - "half" for FP16. Recommended for AWQ quantization.\n - - "float16" is the same as "half".\n - - "bfloat16" for a balance between precision and range.\n - - "float" is shorthand for FP32 precision.\n - - "float32" for FP32 precision.""" - seed: Optional[int] = None - """Random seed for reproducibility. Initialized to None in V0, but - initialized to 0 in V1.""" - hf_config_path: Optional[str] = None - """Name or path of the Hugging Face config to use. If unspecified, model - name or path will be used.""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - revision: Optional[str] = None - """The specific model version to use. It can be a branch name, a tag name, - or a commit id. If unspecified, will use the default version.""" - code_revision: Optional[str] = None - """The specific revision to use for the model code on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" - rope_scaling: dict[str, Any] = field(default_factory=dict) - """RoPE scaling configuration. For example, - `{"rope_type":"dynamic","factor":2.0}`.""" - rope_theta: Optional[float] = None - """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE - theta improves the performance of the scaled model.""" - tokenizer_revision: Optional[str] = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" - max_model_len: SkipValidation[int] = None # type: ignore - """Model context length (prompt and output). If unspecified, will be - automatically derived from the model config. - - When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable - format. Examples:\n - - 1k -> 1000\n - - 1K -> 1024\n - - 25.6k -> 25,600""" - spec_target_max_model_len: Optional[int] = None - """Specify the maximum length for spec decoding draft models.""" - quantization: SkipValidation[Optional[QuantizationMethods]] = None - """Method used to quantize the weights. If `None`, we first check the - `quantization_config` attribute in the model config file. 
If that is - `None`, we assume the model weights are not quantized and use `dtype` to - determine the data type of the weights.""" - enforce_eager: bool = False - """Whether to always use eager-mode PyTorch. If True, we will disable CUDA - graph and always execute the model in eager mode. If False, we will use - CUDA graph and eager execution in hybrid for maximal performance and - flexibility.""" - max_seq_len_to_capture: int = 8192 - """Maximum sequence len covered by CUDA graphs. When a sequence has context - length larger than this, we fall back to eager mode. Additionally for - encoder-decoder models, if the sequence length of the encoder input is - larger than this, we fall back to the eager mode.""" - max_logprobs: int = 20 - """Maximum number of log probabilities to return when `logprobs` is - specified in `SamplingParams`. The default value comes from the default for - the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * - vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS - """Indicates the content returned in the logprobs and prompt_logprobs. - Supported mode: - 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. - Raw means the values before applying any logit processors, like bad words. - Processed means the values after applying all processors, including - temperature and top_k/top_p. - """ - disable_sliding_window: bool = False - """Whether to disable sliding window. If True, we will disable the sliding - window functionality of the model, capping to sliding window size. If the - model does not support sliding window, this argument is ignored.""" - disable_cascade_attn: bool = False - """Disable cascade attention for V1. While cascade attention does not - change the mathematical correctness, disabling it could be useful for - preventing potential numerical issues. Note that even if this is set to - False, cascade attention will be only used when the heuristic tells that - it's beneficial.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" - enable_prompt_embeds: bool = False - """If `True`, enables passing text embeddings as inputs via the - `prompt_embeds` key. Note that enabling this will double the time required - for graph compilation.""" - served_model_name: Optional[Union[str, list[str]]] = None - """The model name(s) used in the API. If multiple names are provided, the - server will respond to any of the provided names. The model name in the - model field of a response will be the first name in this list. If not - specified, the model name will be the same as the `--model` argument. Note - that this name(s) will also be used in `model_name` tag content of - prometheus metrics; if multiple names are provided, the metrics tag will take the - first one.""" - use_async_output_proc: bool = True - """Whether to use async output processor.""" - config_format: Union[str, ConfigFormat] = "auto" - """The format of the model config to load:\n - - "auto" will try to load the config in hf format if available else it - will try to load in mistral format.\n - - "hf" will load the config in hf format.\n - - "mistral" will load the config in mistral format.""" - hf_token: Optional[Union[bool, str]] = None - """The token to use as HTTP bearer authorization for remote files. 
If - `True`, will use the token generated when running `huggingface-cli login` - (stored in `~/.huggingface`).""" - hf_overrides: HfOverrides = field(default_factory=dict) - """If a dictionary, contains arguments to be forwarded to the Hugging Face - config. If a callable, it is called to update the HuggingFace config.""" - logits_processor_pattern: Optional[str] = None - """Optional regex pattern specifying valid logits processor qualified names - that can be passed with the `logits_processors` extra completion argument. - Defaults to `None`, which allows no processors.""" - generation_config: str = "auto" - """The folder path to the generation config. Defaults to `"auto"`, the - generation config will be loaded from model path. If set to `"vllm"`, no - generation config is loaded, vLLM defaults will be used. If set to a folder - path, the generation config will be loaded from the specified folder path. - If `max_new_tokens` is specified in generation config, then it sets a - server-wide limit on the number of output tokens for all requests.""" - override_generation_config: dict[str, Any] = field(default_factory=dict) - """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If - used with `--generation-config auto`, the override parameters will be - merged with the default config from the model. If used with - `--generation-config vllm`, only the override parameters are used.""" - enable_sleep_mode: bool = False - """Enable sleep mode for the engine (only cuda platform is supported).""" - model_impl: Union[str, ModelImpl] = ModelImpl.AUTO.value - """Which implementation of the model to use:\n - - "auto" will try to use the vLLM implementation, if it exists, and fall - back to the Transformers implementation if no vLLM implementation is - available.\n - - "vllm" will use the vLLM model implementation.\n - - "transformers" will use the Transformers model implementation.\n - - "terratorch" will use the TerraTorch model implementation. - """ - override_attention_dtype: Optional[str] = None - """Override dtype for attention""" - logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None - """One or more logits processors' fully-qualified class names or class - definitions""" - io_processor_plugin: Optional[str] = None - """IOProcessor plugin name to load at model startup""" - - # Pooler config - pooler_config: Optional[PoolerConfig] = None - """Pooler config which controls the behaviour of output pooling in pooling - models.""" - override_pooler_config: Optional[Union[dict, PoolerConfig]] = None - """[DEPRECATED] Use `pooler_config` instead. This field will be removed in - v0.12.0 or v1.0.0, whichever is sooner.""" - - # Multimodal config and init vars - multimodal_config: Optional[MultiModalConfig] = None - """Configuration for multimodal model. 
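# The `hf_overrides` field above accepts either a dict or a callable. A
# brief usage sketch (the model name is an arbitrary example):
from vllm import LLM

# dict form: keys are merged into the Hugging Face config
llm = LLM(model="Qwen/Qwen3-0.6B",
          hf_overrides={"rope_theta": 1000000.0})

# callable form: receives the HF config and returns the updated one
def double_context(hf_config):
    hf_config.max_position_embeddings *= 2
    return hf_config

llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=double_context)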
If `None`, this will be inferred - from the architecture of `self.model`.""" - limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None - media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None - mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None - mm_processor_cache_gb: InitVar[Optional[float]] = None - mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None - mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None - mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None - interleave_mm_strings: InitVar[Optional[bool]] = None - skip_mm_profiling: InitVar[Optional[bool]] = None - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.model) - factors.append(self.dtype) - factors.append(self.quantization) - factors.append(self.revision) - factors.append(self.code_revision) - factors.append(self.max_model_len) - factors.append(self.max_logprobs) - factors.append(self.disable_sliding_window) - factors.append(self.trust_remote_code) - factors.append(self.generation_config) - factors.append(self.model_impl) - factors.append(self.override_generation_config) - factors.append(self.rope_scaling) - factors.append(self.rope_theta) - # hf_config can control how the model looks! - factors.append(self.hf_config.to_json_string()) - str_factors = str(factors) - assert_hashable(str_factors) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __post_init__( - self, - # Multimodal config init vars - limit_mm_per_prompt: Optional[dict[str, int]], - media_io_kwargs: Optional[dict[str, dict[str, Any]]], - mm_processor_kwargs: Optional[dict[str, Any]], - mm_processor_cache_gb: Optional[float], - mm_processor_cache_type: Optional[MMCacheType], - mm_shm_cache_max_object_size_mb: Optional[int], - mm_encoder_tp_mode: Optional[MMEncoderTPMode], - interleave_mm_strings: Optional[bool], - skip_mm_profiling: Optional[bool]) -> None: - # Set the default seed to 0 in V1. - # NOTE(woosuk): In V0, we set the default seed to None because the - # driver worker shares the same process as the user process, and thus - # setting a seed affects the user process as well. - # In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. However, without a consistent seed, - # different tensor parallel workers would sample different tokens, - # leading to inconsistent results. - if envs.VLLM_USE_V1 and self.seed is None: - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", self.seed) - - # Keep set served_model_name before maybe_model_redirect(self.model) - self.served_model_name = get_served_model_name(self.model, - self.served_model_name) - self.model = maybe_model_redirect(self.model) - # The tokenizer is consistent with the model by default. 
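# The `compute_hash` method above boils down to: collect every
# graph-affecting factor, stringify the list, and take a SHA-256. A
# minimal sketch under that assumption (`config_hash` is a hypothetical
# helper for illustration, not part of the vLLM API):
import hashlib
from typing import Any, Optional

def config_hash(model: str, dtype: str, revision: Optional[str],
                max_model_len: int) -> str:
    # Order matters: the same factors appended in a different order
    # would produce a different digest.
    factors: list[Any] = [model, dtype, revision, max_model_len]
    return hashlib.sha256(str(factors).encode()).hexdigest()

# Usage: config_hash("Qwen/Qwen3-0.6B", "bfloat16", None, 4096)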
- if self.tokenizer is None: - self.tokenizer = self.model - if self.tokenizer_revision is None: - self.tokenizer_revision = self.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) - - if isinstance(self.hf_config_path, str): - self.hf_config_path = maybe_model_redirect(self.hf_config_path) - - if callable(self.hf_overrides): - hf_overrides_kw = {} - hf_overrides_fn = self.hf_overrides - else: - hf_overrides_kw = self.hf_overrides - hf_overrides_fn = None - - if self.rope_scaling: - hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling} - hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides_kw) - msg = ( - "`--rope-scaling` will be removed in a future release. " - f"Please instead use `--hf-overrides '{hf_overrides_str}'`") - warnings.warn(DeprecationWarning(msg), stacklevel=2) - if self.rope_theta is not None: - hf_override = {"rope_theta": self.rope_theta} - hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides_kw) - msg = ( - "`--rope-theta` will be removed in a future release. " - f"Please instead use `--hf-overrides '{hf_overrides_str}'`") - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - - if (backend := envs.VLLM_ATTENTION_BACKEND - ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: - raise ValueError( - "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " - "module was not found. 
See " - "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 - "for instructions on how to install it.") - - from vllm.platforms import current_platform - - if (self.override_attention_dtype is not None - and not current_platform.is_rocm()): - warnings.warn( - "override-attention-dtype is set but not using ROCm platform", - stacklevel=2) - - if (self.enable_sleep_mode - and not current_platform.is_sleep_mode_available()): - raise ValueError( - "Sleep mode is not supported on current platform.") - - hf_config = get_config(self.hf_config_path or self.model, - self.trust_remote_code, - self.revision, - self.code_revision, - self.config_format, - hf_overrides_kw=hf_overrides_kw, - hf_overrides_fn=hf_overrides_fn) - - self.hf_config = hf_config - self.hf_text_config = get_hf_text_config(self.hf_config) - self.attention_chunk_size = getattr(self.hf_text_config, - "attention_chunk_size", None) - self.encoder_config = self._get_encoder_config() - self.hf_image_processor_config = get_hf_image_processor_config( - self.model, hf_token=self.hf_token, revision=self.revision) - - architectures = self.architectures - registry = self.registry - is_generative_model = registry.is_text_generation_model( - architectures, self) - is_pooling_model = registry.is_pooling_model(architectures, self) - - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - return "reward" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ("The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first.") - msg_hint = "Please remove this option." 
- - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ("Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model.") - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ("Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model.") - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ("Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model.") - else: # task == "auto" - pass - else: - raise AssertionError("The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}.") - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - - self.runner_type = self._get_runner_type(architectures, self.runner) - self.convert_type = self._get_convert_type(architectures, - self.runner_type, - self.convert) - - if self.runner_type == "generate" and not is_generative_model: - generate_converts = _RUNNER_CONVERTS["generate"] - if self.convert_type not in generate_converts: - # Currently we don't have any converters for generative models - raise ValueError( - "This model does not support `--runner generate`.") - if self.runner_type == "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model.") - - self.supported_tasks = self._get_supported_tasks( - architectures, self.runner_type, self.convert_type) - - # Note: Initialize these attributes early because transformers fallback - # may fail to load dynamic modules in child processes - model_info, arch = registry.inspect_model_cls(architectures, self) - self._model_info = model_info - self._architecture = arch - logger.info("Resolved architecture: %s", arch) - - # Init pooler config if needed - if self.runner_type == "pooling": - if self.override_pooler_config is not None: - logger.warning_once( - "`override_pooler_config` is deprecated and will be " - "removed in v0.12.0 or v1.0.0, whichever is sooner. 
" - "Please use `pooler_config` instead.") - - if isinstance(self.override_pooler_config, dict): - self.pooler_config = PoolerConfig( - **self.override_pooler_config) - else: - self.pooler_config = self.override_pooler_config - - if self.pooler_config is None: - self.pooler_config = PoolerConfig() - - base_config = get_pooling_config(self.model, self.revision) - if base_config is not None: - # Only set values that are not overridden by the user - for k, v in base_config.items(): - if getattr(self.pooler_config, k) is None: - setattr(self.pooler_config, k, v) - - default_pooling_type = self._model_info.default_pooling_type - if self.pooler_config.pooling_type is None: - self.pooler_config.pooling_type = default_pooling_type - - self.dtype: torch.dtype = _get_and_verify_dtype( - self.model, - self.hf_config, - self.dtype, - is_pooling_model=self.runner_type == "pooling", - revision=self.revision, - ) - - # Interleaved attention is not supported by some backends in V0 - if (not self.disable_sliding_window - and is_interleaved(self.hf_text_config) - and not envs.VLLM_USE_V1 - and (backend := envs.VLLM_ATTENTION_BACKEND) - in ("XFORMERS", "FLASHINFER")): - logger.warning_once( - "%s has interleaved attention, which is currently not " - "supported by the %s backend. Disabling sliding window and " - "capping the max length to the sliding window size (%d).", - self.hf_text_config.model_type, - backend, - self.hf_text_config.sliding_window, - ) - self.disable_sliding_window = True - - self.original_max_model_len = self.max_model_len - self.max_model_len = self.get_and_verify_max_len(self.max_model_len) - # Init multimodal config if needed - if self._model_info.supports_multimodal: - if (mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. 
" - "Falling back to `--mm-encoder-tp-mode weights`.") - mm_encoder_tp_mode = "weights" - - mm_config_kwargs = dict( - limit_per_prompt=limit_mm_per_prompt, - media_io_kwargs=media_io_kwargs, - mm_processor_kwargs=mm_processor_kwargs, - mm_processor_cache_gb=mm_processor_cache_gb, - mm_processor_cache_type=mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=mm_encoder_tp_mode, - interleave_mm_strings=interleave_mm_strings, - skip_mm_profiling=skip_mm_profiling, - ) - - mm_config_kwargs = { - k: v - for k, v in mm_config_kwargs.items() if v is not None - } - - self.multimodal_config = MultiModalConfig(**mm_config_kwargs) - - if self.disable_sliding_window: - # Set after get_and_verify_max_len to ensure that max_model_len - # can be correctly capped to sliding window size - self.hf_text_config.sliding_window = None - - if not self.skip_tokenizer_init: - self._verify_tokenizer_mode() - - # Avoid running try_verify_and_update_config multiple times - self.config_updated = False - - self._verify_quantization() - self._verify_cuda_graph() - self._verify_bnb_config() - - @field_validator("quantization", mode="before") - @classmethod - def validate_quantization_before(cls, value: Any) -> Any: - if isinstance(value, str): - return value.lower() - return value - - @model_validator(mode="after") - def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": - if not isinstance(self.tokenizer, str): - raise ValueError("tokenizer must be a string after __post_init__.") - if not isinstance(self.max_model_len, int): - raise ValueError( - "max_model_len must be an integer after __post_init__.") - return self - - def _get_transformers_backend_cls(self) -> str: - """Determine which Transformers backend class will be used if - `model_impl` is set to `transformers` or `auto`.""" - if getattr(self, "runner_type", self.runner) == "pooling": - return "TransformersModel" - if self.hf_config != self.hf_text_config: - # If 'hf_text_config' is the same as 'hf_config'. If not, it is - # probably a composite config, i.e. multimodal - return "TransformersForMultimodalLM" - return "TransformersForCausalLM" - - def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" - return self.architecture == self._get_transformers_backend_cls() - - @property - def registry(self): - return me_models.ModelRegistry - - @property - def architectures(self) -> list[str]: - return getattr(self.hf_config, "architectures", []) - - @property - def architecture(self) -> str: - """The architecture vllm actually used.""" - return self._architecture - - def maybe_pull_model_tokenizer_for_runai(self, model: str, - tokenizer: str) -> None: - """Pull model/tokenizer from Object Storage to temporary - directory when needed. 
- - Args: - model: Model name or path - tokenizer: Tokenizer name or path - """ - if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): - return - - if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel() - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"]) - self.model_weights = model - self.model = object_storage_model.dir - - # If tokenizer is same as model, download to same directory - if model == tokenizer: - object_storage_model.pull_files(model, - ignore_pattern=[ - "*.pt", "*.safetensors", - "*.bin", "*.tensors", - "*.pth" - ]) - self.tokenizer = object_storage_model.dir - return - - # Only download tokenizer if needed and not already handled - if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel() - object_storage_tokenizer.pull_files(model, - ignore_pattern=[ - "*.pt", "*.safetensors", - "*.bin", "*.tensors", - "*.pth" - ]) - self.tokenizer = object_storage_tokenizer.dir - - def _get_encoder_config(self): - return get_sentence_transformer_tokenizer_config( - self.model, self.revision) - - def _verify_tokenizer_mode(self) -> None: - tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) - if tokenizer_mode not in get_args(TokenizerMode): - raise ValueError( - f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - f"one of {get_args(TokenizerMode)}.") - self.tokenizer_mode = tokenizer_mode - - def _get_default_runner_type( - self, - architectures: list[str], - ) -> RunnerType: - registry = self.registry - - # Some Sentence Transformers models use *ForCausalLM archs - if get_pooling_config(self.model, self.revision): - return "pooling" - - for arch in architectures: - if arch in registry.get_supported_archs(): - if registry.is_pooling_model(architectures, self): - return "pooling" - if registry.is_text_generation_model(architectures, self): - return "generate" - - match = try_match_architecture_defaults(arch) - if match: - _, (runner_type, _) = match - return runner_type - - return "generate" - - def _get_runner_type( - self, - architectures: list[str], - runner: RunnerOption, - ) -> RunnerType: - if runner != "auto": - return runner - - runner_type = self._get_default_runner_type(architectures) - - # Don't log the most common case - if runner_type != "generate": - logger.info( - "Resolved `--runner auto` to `--runner %s`. 
" - "Pass the value explicitly to silence this message.", - runner_type) - - return runner_type - - def _get_default_convert_type( - self, - architectures: list[str], - runner_type: RunnerType, - ) -> ConvertType: - registry = self.registry - - for arch in architectures: - if arch in registry.get_supported_archs(): - if (runner_type == "generate" - and registry.is_text_generation_model( - architectures, self)): - return "none" - if (runner_type == "pooling" - and registry.is_pooling_model(architectures, self)): - return "none" - - match = try_match_architecture_defaults(arch, - runner_type=runner_type) - if match: - _, (_, convert_type) = match - return convert_type - - # This is to handle Sentence Transformers models that use *ForCausalLM - # and also multi-modal pooling models which are not defined as - # Sentence Transformers models - if runner_type == "pooling": - return "embed" - - return "none" - - def _get_convert_type( - self, - architectures: list[str], - runner_type: RunnerType, - convert: ConvertOption, - ) -> ConvertType: - if convert != "auto": - return convert - - convert_type = self._get_default_convert_type(architectures, - runner_type) - - # Don't log the most common case - if convert_type != "none": - logger.info( - "Resolved `--convert auto` to `--convert %s`. " - "Pass the value explicitly to silence this message.", - convert_type) - - return convert_type - - def _get_supported_generation_tasks( - self, - architectures: list[str], - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - registry = self.registry - - if registry.is_transcription_only_model(architectures, self): - return ["transcription"] - - # TODO: Use get_supported_generation_tasks once V0 is removed - supported_tasks = list[_ResolvedTask]() - if (registry.is_text_generation_model(architectures, self) - or convert_type in _RUNNER_CONVERTS["generate"]): - supported_tasks.append("generate") - - if registry.is_transcription_model(architectures, self): - supported_tasks.append("transcription") - - return supported_tasks - - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, - runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - - def _get_supported_pooling_tasks( - self, - architectures: list[str], - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - registry = self.registry - - # TODO: Use get_supported_pooling_tasks once V0 is removed - supported_tasks = list[_ResolvedTask]() - if (registry.is_pooling_model(architectures, self) - or convert_type in _RUNNER_CONVERTS["pooling"]): - supported_tasks.append("encode") - - extra_task = (self._get_default_pooling_task(architectures) - if convert_type == "none" else convert_type) - supported_tasks.append(extra_task) - - return supported_tasks - - def _get_supported_tasks( - self, - architectures: list[str], - runner_type: RunnerType, - convert_type: ConvertType, - ) -> list[_ResolvedTask]: - if runner_type == "generate": - return self._get_supported_generation_tasks( - architectures, convert_type) - if runner_type == "pooling": - return self._get_supported_pooling_tasks(architectures, - convert_type) - if runner_type == "draft": - return ["draft"] - - assert_never(runner_type) - - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): - 
quant_cfg = getattr(hf_config, "quantization_config", None) - if quant_cfg is None: - # compressed-tensors uses a "compression_config" key - quant_cfg = getattr(hf_config, "compression_config", None) - - else: - # Set quant_method for ModelOpt models. - producer_name = quant_cfg.get("producer", {}).get("name") - if producer_name == "modelopt": - quant_algo = quant_cfg.get("quantization", - {}).get("quant_algo") - if quant_algo == "FP8": - quant_cfg["quant_method"] = "modelopt" - elif quant_algo == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - elif quant_algo is not None: - raise ValueError( - f"Unknown ModelOpt quant algo: {quant_algo}") - - return quant_cfg - - def _verify_quantization(self) -> None: - supported_quantization = me_quant.QUANTIZATION_METHODS - if self.quantization is not None: - self.quantization = cast(me_quant.QuantizationMethods, - self.quantization) - - # Parse quantization method from the HF model config, if available. - quant_cfg = self._parse_quant_hf_config(self.hf_config) - if quant_cfg is None and (text_config := getattr( - self.hf_config, "text_config", None)): - # Check the text config as well for multi-modal models. - quant_cfg = self._parse_quant_hf_config(text_config) - - if quant_cfg is not None: - # Use the community standard 'quant_method' - quant_method = quant_cfg.get("quant_method", "").lower() - - # Normalize library names - quant_method = quant_method.replace("compressed_tensors", - "compressed-tensors") - - quant_cfg["quant_method"] = quant_method - - # Quantization methods which are overrides (i.e. they have a - # `override_quantization_method` method) must be checked in order - # of preference (this is particularly important for GPTQ). - overrides = [ - "bitblas", - "gptq_marlin_24", - "gptq_marlin", - "gptq_bitblas", - "awq_marlin", - "ipex", - "moe_wna16", - "modelopt", - "modelopt_fp4", - "petit_nvfp4", - ] - quantization_methods = [ - q for q in supported_quantization if q not in overrides - ] - # Any custom overrides will be in quantization_methods so we place - # them at the start of the list so custom overrides have preference - # over the built-in ones. - quantization_methods = quantization_methods + overrides - - # Detect which checkpoint it is - for name in quantization_methods: - method = me_quant.get_quantization_config(name) - quantization_override = method.override_quantization_method( - quant_cfg, self.quantization) - if quantization_override is not None: - # Raise error if the override is not custom (custom would - # be in QUANTIZATION_METHODS but not QuantizationMethods) - # and hasn't been added to the overrides list. - if (name in get_args(me_quant.QuantizationMethods) - and name not in overrides): - raise ValueError( - f"Quantization method {name} is an override but " - "has not been added to the `overrides` list " - "above. This is necessary to ensure that the " - "overrides are checked in order of preference.") - quant_method = quantization_override - self.quantization = quantization_override - break - - # Verify quantization configurations. 
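# The loop above asks each method, in preference order, whether it wants
# to claim the checkpoint. A self-contained sketch of that dispatch
# (`GPTQ`/`GPTQMarlin` here are illustrative stand-ins, not vLLM's real
# quantization classes):
from typing import Optional

class GPTQ:
    @staticmethod
    def override_quantization_method(cfg: dict) -> Optional[str]:
        return None  # the base method never overrides anything

class GPTQMarlin:
    @staticmethod
    def override_quantization_method(cfg: dict) -> Optional[str]:
        # Claim plain GPTQ checkpoints when the optimized kernel applies.
        return "gptq_marlin" if cfg.get("quant_method") == "gptq" else None

def resolve_quant_method(cfg: dict) -> Optional[str]:
    for method in (GPTQMarlin, GPTQ):  # overrides are checked first
        if (chosen := method.override_quantization_method(cfg)) is not None:
            return chosen
    return cfg.get("quant_method")

assert resolve_quant_method({"quant_method": "gptq"}) == "gptq_marlin"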
- if self.quantization is None: - self.quantization = quant_method - elif self.quantization != quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") - - if self.quantization is not None: - if self.quantization not in supported_quantization: - raise ValueError( - f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - from vllm.platforms import current_platform - current_platform.verify_quantization(self.quantization) - - def _verify_cuda_graph(self) -> None: - # The `max_seq_len_to_capture` was incorrectly - # based on the encoder's input length (448) - # but not the decoder's larger input length (1500). - # This change ensures the CUDA Graph captures the correct, - # larger sequence length, allowing it to work as intended. - effective_max_seq_len = self.max_model_len - if self.is_encoder_decoder: - effective_max_seq_len = max( - effective_max_seq_len, - getattr(self.hf_config, "max_source_positions", 0)) - self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, - effective_max_seq_len) - # CUDAGraph capture not supported for encoder-decoder models on ROCm - unsupported_rocm = self.is_encoder_decoder - - if (unsupported_rocm and not self.enforce_eager - and current_platform.is_rocm()): - logger.warning( - "CUDA graph is not supported for %s on ROCm yet, fallback " - "to eager mode.", self.hf_config.model_type) - self.enforce_eager = True - - def _verify_bnb_config(self) -> None: - """ - The current version of bitsandbytes (0.46.1) with 8-bit models does not - yet support CUDA graph. - # TODO Remove this when bitsandbytes supports. 
- """ - is_bitsandbytes = self.quantization == "bitsandbytes" - has_quantization_config = (getattr(self.hf_config, - "quantization_config", None) - is not None) - is_8bit = (self.hf_config.quantization_config.get( - "load_in_8bit", False) if has_quantization_config else False) - if all([ - is_bitsandbytes, - has_quantization_config, - is_8bit, - not self.enforce_eager, - ]): - logger.warning( - "CUDA graph is not supported on BitsAndBytes 8bit yet, " - "fallback to the eager mode.") - - self.enforce_eager = True - - def _verify_with_expert_parallelism(self) -> None: - num_expert_names = [ - "moe_num_experts", # Dbrx - "num_experts", # Jamba - "n_routed_experts", # DeepSeek - "num_local_experts", # Mixtral - ] - num_experts = 0 - for name in num_expert_names: - num_experts = getattr(self.hf_text_config, name, 0) - if num_experts > 0: - break - if num_experts < 1: - raise ValueError( - "Number of experts in the model must be greater than 0 " - "when expert parallelism is enabled.") - - def verify_dual_chunk_attention_config( - self, - load_config: "LoadConfig", - ) -> None: - if hasattr(self.hf_config, "dual_chunk_attention_config"): - # Try loading the sparse attention config - from vllm.model_executor.model_loader.weight_utils import ( - get_sparse_attention_config) - sparse_attn_config = get_sparse_attention_config(self, load_config) - if sparse_attn_config: - self.hf_config.dual_chunk_attention_config[ - "sparse_attention_config"] = sparse_attn_config - if "sparse_attention_enabled" not in \ - self.hf_config.dual_chunk_attention_config: - self.hf_config.dual_chunk_attention_config[ - "sparse_attention_enabled"] = True - - if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: - raise ValueError("please set VLLM_ATTENTION_BACKEND to " - f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - - def verify_async_output_proc(self, parallel_config, speculative_config, - device_config) -> None: - if not self.use_async_output_proc: - # Nothing to check - return - - if parallel_config.pipeline_parallel_size > 1: - self.use_async_output_proc = False - return - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - from vllm.platforms import current_platform - if not current_platform.is_async_output_supported(self.enforce_eager): - self.use_async_output_proc = False - return - - if envs.VLLM_USE_RAY_SPMD_WORKER: - self.use_async_output_proc = False - return - - # Async postprocessor is not necessary for pooling models - # since there is no token generation - if self.runner_type == "pooling": - self.use_async_output_proc = False - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if speculative_config: - self.use_async_output_proc = False - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - - if parallel_config.distributed_executor_backend == "external_launcher": - assert self.seed is not None, ( - "Seed must be set when using external launcher backend to " - "make sure sampling results are the same across workers.") - - total_num_attention_heads = getattr(self.hf_text_config, - "num_attention_heads", 0) - tensor_parallel_size = parallel_config.tensor_parallel_size - if total_num_attention_heads % tensor_parallel_size != 0: - raise ValueError( - f"Total number of attention heads ({total_num_attention_heads})" - " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") - - if parallel_config.enable_expert_parallel: - 
self._verify_with_expert_parallelism() - - pipeline_parallel_size = parallel_config.pipeline_parallel_size - if pipeline_parallel_size > 1: - if not self.registry.is_pp_supported_model(self.architectures, - self): - raise NotImplementedError( - "Pipeline parallelism is not supported for this model. " - "Supported models implement the `SupportsPP` interface.") - - if self.use_async_output_proc: - self.use_async_output_proc = False - - def get_sliding_window(self) -> Optional[int]: - """Get the sliding window size from the HF text config if present.""" - return getattr(self.hf_text_config, "sliding_window", None) - - def get_vocab_size(self) -> int: - return getattr(self.hf_text_config, "vocab_size", 0) - - def get_hidden_size(self) -> int: - return getattr(self.hf_text_config, "hidden_size", 0) - - @property - def is_deepseek_mla(self) -> bool: - if not hasattr(self.hf_text_config, "model_type"): - return False - elif self.hf_text_config.model_type in \ - ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'): - return self.hf_text_config.kv_lora_rank is not None - elif self.hf_text_config.model_type == 'eagle': - # if the model is an EAGLE module, check for the - # underlying architecture - return self.hf_text_config.model.model_type in \ - ('deepseek_v2', 'deepseek_v3') \ - and self.hf_text_config.kv_lora_rank is not None - return False - - def get_head_size(self) -> int: - # TODO remove hard code - if self.is_deepseek_mla: - qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", - 0) - if self.use_mla: - return self.hf_text_config.kv_lora_rank + qk_rope_head_dim - else: - qk_nope_head_dim = getattr(self.hf_text_config, - "qk_nope_head_dim", 0) - if qk_rope_head_dim and qk_nope_head_dim: - return qk_rope_head_dim + qk_nope_head_dim - - if hasattr(self.hf_text_config, - "model_type") and (self.hf_text_config.model_type - == "zamba2"): - return self.hf_text_config.attention_head_dim - - if self.is_attention_free: - return 0 - - # NOTE: Some configs may set head_dim=None in the config - if getattr(self.hf_text_config, "head_dim", None) is not None: - return self.hf_text_config.head_dim - - # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` - if getattr(self.hf_text_config, "hidden_size_per_head", - None) is not None: - return self.hf_text_config.hidden_size_per_head - - # FIXME(woosuk): This may not be true for all models. - return (self.hf_text_config.hidden_size // - self.hf_text_config.num_attention_heads) - - def get_total_num_kv_heads(self) -> int: - """Returns the total number of KV heads.""" - # For GPTBigCode & Falcon: - # NOTE: for falcon, when new_decoder_architecture is True, the - # multi_query flag is ignored and we use n_head_kv for the number of - # KV heads. - falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = ( - self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_text_config, - "multi_query", False): - # Multi-query attention, only one KV head. - # Currently, tensor parallelism is not supported in this case. 
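# Worked example of the KV-head accounting implemented in this method and
# in `get_num_kv_heads` below: heads are split across tensor-parallel
# ranks but replicated (never zero) when there are fewer heads than
# ranks, and MLA decode degenerates to MQA with a single KV head. A
# minimal sketch, not vLLM's actual helper:
def kv_heads_per_gpu(total_kv_heads: int, tp_size: int,
                     use_mla: bool = False) -> int:
    if use_mla:
        return 1  # MLA behaves as multi-query attention during decode
    return max(1, total_kv_heads // tp_size)

assert kv_heads_per_gpu(8, 4) == 2  # heads divided evenly across ranks
assert kv_heads_per_gpu(2, 8) == 1  # replicated: at least one per rank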
return 1 - - # For DBRX and MPT - if self.hf_config.model_type == "mpt": - if "kv_n_heads" in self.hf_config.attn_config: - return self.hf_config.attn_config["kv_n_heads"] - return self.hf_config.num_attention_heads - if self.hf_config.model_type == "dbrx": - return getattr(self.hf_config.attn_config, "kv_n_heads", - self.hf_config.num_attention_heads) - - if self.hf_config.model_type == "nemotron-nas": - for block in self.hf_config.block_configs: - if not block.attention.no_op: - return self.hf_config.num_attention_heads \ - // block.attention.n_heads_in_group - - raise RuntimeError("Couldn't determine number of kv heads") - - if self.is_attention_free: - return 0 - - attributes = [ - # For Falcon: - "n_head_kv", - "num_kv_heads", - # For LLaMA-2: - "num_key_value_heads", - # For ChatGLM: - "multi_query_group_num", - ] - for attr in attributes: - num_kv_heads = getattr(self.hf_text_config, attr, None) - if num_kv_heads is not None: - return num_kv_heads - - # For non-grouped-query attention models, the number of KV heads is - # equal to the number of attention heads. - return self.hf_text_config.num_attention_heads - - def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: - """Returns the number of KV heads per GPU.""" - if self.use_mla: - # When using MLA during decode it becomes MQA - return 1 - - total_num_kv_heads = self.get_total_num_kv_heads() - # If tensor parallelism is used, we divide the number of KV heads by - # the tensor parallel size. We will replicate the KV heads in the - # case where the number of KV heads is smaller than the tensor - # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) - - def get_num_attention_heads(self, - parallel_config: "ParallelConfig") -> int: - num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) - return num_heads // parallel_config.tensor_parallel_size - - def get_layers_start_end_indices( - self, parallel_config: "ParallelConfig") -> tuple[int, int]: - from vllm.distributed.utils import get_pp_indices - if (self.hf_text_config.model_type == "deepseek_mtp" - or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp" - or self.hf_config.model_type == "ernie_mtp" - or self.hf_config.model_type == "qwen3_next_mtp"): - total_num_hidden_layers = getattr(self.hf_text_config, - "num_nextn_predict_layers", 0) - else: - total_num_hidden_layers = getattr(self.hf_text_config, - "num_hidden_layers", 0) - # the layout order is: DP x PP x TP - pp_rank = (parallel_config.rank // parallel_config.tensor_parallel_size - ) % parallel_config.pipeline_parallel_size - pp_size = parallel_config.pipeline_parallel_size - start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) - return start, end - - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: - start, end = self.get_layers_start_end_indices(parallel_config) - return end - start - - def get_num_layers_by_block_type( - self, - parallel_config: "ParallelConfig", - block_type: LayerBlockType = LayerBlockType.attention, - ) -> int: - # This function relies on 'layers_block_type' in hf_config; - # for models without this attribute, we need workarounds like the ones below - attn_block_type = block_type == LayerBlockType.attention - is_transformer = not self.is_hybrid and \ - not self.has_noops and \ - not self.is_attention_free - start, end = self.get_layers_start_end_indices(parallel_config) - - if is_transformer: - # Handle the basic case first - return 
end - start if attn_block_type else 0 - elif self.is_attention_free: - # Attention free - # Note that this code assumes there - # is only one type of attention-free block type. - return 0 if attn_block_type else end - start - elif self.has_noops: - block_configs = self.hf_config.block_configs - return sum(not bc.attention.no_op - for bc in block_configs[start:end]) - else: - # Hybrid model Jamba - layers_block_type_value = getattr(self.hf_text_config, - "layers_block_type", None) - if layers_block_type_value is not None: - if hasattr(self.hf_text_config, - "model_type") and (self.hf_text_config.model_type - == "zamba2"): - if attn_block_type: - return sum(t == "hybrid" - for t in layers_block_type_value[start:end]) - else: - return self.get_num_layers(parallel_config) - return sum(t == block_type.value - for t in layers_block_type_value[start:end]) - - # Hybrid model Minimax - attn_type_list = getattr(self.hf_config, "attn_type_list", None) - if attn_type_list: - return sum(t == 1 for t in attn_type_list[start:end]) - - # Hybrid model Qwen3Next - layer_types_value = getattr(self.hf_config, "layer_types", None) - if layer_types_value is not None: - if getattr(block_type, "value", block_type) == "attention": - return sum(t == "full_attention" - for t in layer_types_value[start:end]) - elif getattr(block_type, "value", - block_type) == "linear_attention": - return sum(t == "linear_attention" - for t in layer_types_value[start:end]) - else: - return sum(t == getattr(block_type, "value", block_type) - for t in layer_types_value[start:end]) - - if (layers_block_type_value is None and attn_type_list is None - and layer_types_value is None): - raise ValueError( - "The model is a hybrid without a " - "layers_block_type or an attn_type_list, or a layer_types " - "in the hf_config, cannot determine the number of " - f"{block_type.value} layers") - - def get_mamba_chunk_size(self) -> Optional[int]: - """ - Returns the mamba chunk size if it exists - """ - # used by e.g. Bamba, FalconH1, Granite, PLaMo2 - chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None) - if chunk_size is None: - # used by e.g. Mamba2, NemotronH, Zamba - chunk_size = getattr(self.hf_text_config, "chunk_size", None) - return chunk_size - - def get_multimodal_config(self) -> "MultiModalConfig": - """ - Get the multimodal configuration of the model. - - Raises: - ValueError: If the model is not multimodal. - """ - if self.multimodal_config is None: - raise ValueError("The model is not multimodal.") - - return self.multimodal_config - - def try_get_generation_config(self) -> dict[str, Any]: - """ - This method attempts to retrieve the non-default values of the - generation config for this model. - - The generation config can contain information about special tokens, as - well as sampling parameters, which is why this method exists separately - from `get_diff_sampling_param`. - - Returns: - A dictionary containing the non-default generation config. - """ - if self.generation_config in {"auto", "vllm"}: - config = try_get_generation_config( - self.hf_config_path or self.model, - trust_remote_code=self.trust_remote_code, - revision=self.revision, - ) - else: - config = try_get_generation_config( - self.generation_config, - trust_remote_code=self.trust_remote_code, - ) - - if config is None: - return {} - - return config.to_diff_dict() - - def get_diff_sampling_param(self) -> dict[str, Any]: - """ - This method returns a dictionary containing the non-default sampling - parameters with `override_generation_config` applied. 
- - The default sampling parameters are: - - - vLLM's neutral defaults if `self.generation_config="vllm"` - - the model's defaults if `self.generation_config="auto"` - - as defined in `generation_config.json` if - `self.generation_config="path/to/generation_config/dir"` - - Returns: - A dictionary containing the non-default sampling parameters. - """ - if self.generation_config == "vllm": - config = {} - else: - config = self.try_get_generation_config() - - # Overriding with given generation config - config.update(self.override_generation_config) - - available_params = [ - "repetition_penalty", - "temperature", - "top_k", - "top_p", - "min_p", - "max_new_tokens", - ] - if any(p in config for p in available_params): - diff_sampling_param = { - p: config.get(p) - for p in available_params if config.get(p) is not None - } - # Huggingface definition of max_new_tokens is equivalent - # to vLLM's max_tokens - if "max_new_tokens" in diff_sampling_param: - diff_sampling_param["max_tokens"] = diff_sampling_param.pop( - "max_new_tokens") - else: - diff_sampling_param = {} - - if diff_sampling_param: - logger.warning_once( - "Default sampling parameters have been overridden by the " - "model's Hugging Face generation config recommended from the " - "model creator. If this is not intended, please relaunch " - "vLLM instance with `--generation-config vllm`.") - return diff_sampling_param - - @property - def is_encoder_decoder(self) -> bool: - """Extract the HF encoder/decoder model flag.""" - return is_encoder_decoder(self.hf_config) - - @property - def uses_mrope(self) -> bool: - return uses_mrope(self.hf_config) - - @property - def is_multimodal_model(self) -> bool: - return self.multimodal_config is not None - - @property - def is_multimodal_raw_input_only_model(self) -> bool: - return self._model_info.supports_multimodal_raw_input_only - - @property - def is_cross_encoder(self) -> bool: - return (self._model_info.supports_cross_encoding - or self.convert_type == "classify") - - @property - def is_pp_supported(self) -> bool: - return self._model_info.supports_pp - - @property - def is_attention_free(self) -> bool: - return self._model_info.is_attention_free - - @property - def is_hybrid(self) -> bool: - return self._model_info.is_hybrid - - @property - def has_noops(self) -> bool: - return self._model_info.has_noops - - @property - def has_inner_state(self): - return self._model_info.has_inner_state - - @property - def is_v1_compatible(self) -> bool: - return not self._model_info.supports_v0_only - - @property - def use_mla(self) -> bool: - return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE - - @property - def is_matryoshka(self) -> bool: - return (bool(getattr(self.hf_config, "matryoshka_dimensions", None)) - or getattr(self.hf_config, "is_matryoshka", False)) - - @property - def matryoshka_dimensions(self): - return getattr(self.hf_config, "matryoshka_dimensions", None) - - @property - def use_pad_token(self) -> bool: - # cross_encoder models defaults to using pad_token. - # `llm as reranker` models defaults to not using pad_token. - return getattr(self.hf_config, "use_pad_token", True) - - @property - def head_dtype(self) -> torch.dtype: - """ - "head" refers to the last Linear layer(s) of an LLM, - such as the lm_head in a generation model, - or the score or classifier in a classification model. - - `head_dtype` currently only supports pooling models.\n - - The pooling model defaults to using fp32 head, - you can use --hf-overrides '{"head_dtype": "model"}' to disable it. 
- """ - - head_dtype = _get_head_dtype(config=self.hf_config, - dtype=self.dtype, - runner_type=self.runner_type) - - if self.runner_type != "pooling" and head_dtype != self.dtype: - logger.warning_once( - "`head_dtype` currently only supports pooling models." - "fallback to model dtype [%s].", self.dtype) - return self.dtype - - if head_dtype not in current_platform.supported_dtypes: - logger.warning_once( - "The current platform does not support [%s] head dtype, " - "fallback to model dtype [%s].", head_dtype, self.dtype) - return self.dtype - - logger.debug_once("head dtype: %s", head_dtype) - return head_dtype - - def get_and_verify_max_len(self, max_model_len: int): - # Consider max_model_len in tokenizer_config only when - # pooling models use absolute position_embedding. - tokenizer_config = None - if (self.runner_type == "pooling" and getattr( - self.hf_config, "position_embedding_type", "") == "absolute"): - tokenizer_config = try_get_tokenizer_config( - self.tokenizer, - trust_remote_code=self.trust_remote_code, - revision=self.tokenizer_revision) - max_model_len = _get_and_verify_max_len( - hf_config=self.hf_text_config, - tokenizer_config=tokenizer_config, - max_model_len=max_model_len, - disable_sliding_window=self.disable_sliding_window, - sliding_window=self.get_sliding_window(), - spec_target_max_model_len=self.spec_target_max_model_len, - encoder_config=self.encoder_config) - logger.info("Using max model len %s", max_model_len) - return max_model_len - - Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @@ -1838,365 +144,6 @@ class DeviceConfig: self.device = torch.device(self.device_type) -_STR_DTYPE_TO_TORCH_DTYPE = { - "half": torch.float16, - "float16": torch.float16, - "float": torch.float32, - "float32": torch.float32, - "bfloat16": torch.bfloat16, -} - -# model_type -> reason -_FLOAT16_NOT_SUPPORTED_MODELS = { - "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", - "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", - "gemma3_text": - "Numerical instability. Please use bfloat16 or float32 instead.", - "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", - "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", -} - - -def _is_valid_dtype(model_type: str, dtype: torch.dtype): - if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 - return False - - return True - - -def _check_valid_dtype(model_type: str, dtype: torch.dtype): - if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: - reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] - raise ValueError(f"The model type {model_type!r} " - f"does not support float16. Reason: {reason}") - - return True - - -def _find_dtype( - model_id: str, - config: PretrainedConfig, - *, - revision: Optional[str], -): - # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct - # because config.torch_dtype can be None. 
- config_dtype = getattr(config, "torch_dtype", None) - - # Fallbacks for multi-modal models if the root config - # does not define torch_dtype - if config_dtype is None: - config_dtype = getattr(config.get_text_config(), "torch_dtype", None) - if config_dtype is None and hasattr(config, "vision_config"): - config_dtype = getattr(config.vision_config, "torch_dtype", None) - if config_dtype is None and hasattr(config, "encoder_config"): - config_dtype = getattr(config.encoder_config, "torch_dtype", None) - - # Try to read the dtype of the weights if they are in safetensors format - if config_dtype is None: - repo_mt = try_get_safetensors_metadata(model_id, revision=revision) - - if repo_mt and (files_mt := repo_mt.files_metadata): - param_dtypes: set[torch.dtype] = { - _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] - for file_mt in files_mt.values() - for dtype_str in file_mt.parameter_count - if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE - } - - if param_dtypes: - return common_broadcastable_dtype(param_dtypes) - - if config_dtype is None: - config_dtype = torch.float32 - - return config_dtype - - -def _resolve_auto_dtype( - model_type: str, - config_dtype: torch.dtype, - *, - is_pooling_model: bool, -): - from vllm.platforms import current_platform - - supported_dtypes = [ - dtype for dtype in current_platform.supported_dtypes - if _is_valid_dtype(model_type, dtype) - ] - - if is_pooling_model and torch.float16 in supported_dtypes: - preferred_dtype = torch.float16 - else: - preferred_dtype = supported_dtypes[0] - - # Downcast for float32 models - if config_dtype == torch.float32: - config_dtype = preferred_dtype - - if config_dtype in supported_dtypes: - return config_dtype - - # Ensure device compatibility - device_name = current_platform.get_device_name() - device_capability = current_platform.get_device_capability() - - if device_capability is None: - device_str = f"{device_name!r}" - else: - version_str = device_capability.as_version_str() - device_str = f"{device_name!r} (with compute capability {version_str})" - - logger.warning( - "Your device %s doesn't support %s. " - "Falling back to %s for compatibility.", - device_str, - config_dtype, - preferred_dtype, - ) - - return preferred_dtype - - -def _get_and_verify_dtype( - model_id: str, - config: PretrainedConfig, - dtype: Union[str, torch.dtype], - *, - is_pooling_model: bool, - revision: Optional[str] = None, -) -> torch.dtype: - config_dtype = _find_dtype(model_id, config, revision=revision) - model_type = config.model_type - - if isinstance(dtype, str): - dtype = dtype.lower() - if dtype == "auto": - # Set default dtype from model config - torch_dtype = _resolve_auto_dtype( - model_type, - config_dtype, - is_pooling_model=is_pooling_model, - ) - else: - if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {dtype!r}") - torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - elif isinstance(dtype, torch.dtype): - torch_dtype = dtype - else: - raise ValueError(f"Unknown dtype: {dtype}") - - _check_valid_dtype(model_type, torch_dtype) - - if torch_dtype != config_dtype: - if torch_dtype == torch.float32: - # Upcasting to float32 is allowed. - logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) - elif config_dtype == torch.float32: - # Downcasting from float32 to float16 or bfloat16 is allowed. - logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) - else: - # Casting between float16 and bfloat16 is allowed with a warning. 
- logger.warning("Casting %s to %s.", config_dtype, torch_dtype) - - return torch_dtype - - -def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype, - runner_type: str) -> torch.dtype: - head_dtype: Optional[Union[str, - torch.dtype]] = getattr(config, "head_dtype", - None) - - if head_dtype == "model": - return dtype - elif isinstance(head_dtype, str): - head_dtype = head_dtype.lower() - if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {head_dtype!r}") - return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] - elif isinstance(head_dtype, torch.dtype): - return head_dtype - elif head_dtype is None: - if torch.float32 not in current_platform.supported_dtypes: - return dtype - if runner_type == "pooling": - return torch.float32 - return dtype - else: - raise ValueError(f"Unknown dtype: {head_dtype}") - - -def _get_and_verify_max_len( - hf_config: PretrainedConfig, - tokenizer_config: Optional[dict], - max_model_len: Optional[int], - disable_sliding_window: bool, - sliding_window: Optional[int], - spec_target_max_model_len: Optional[int] = None, - encoder_config: Optional[Any] = None, -) -> int: - """Get and verify the model's maximum length.""" - derived_max_model_len = float("inf") - possible_keys = [ - # OPT - "max_position_embeddings", - # GPT-2 - "n_positions", - # MPT - "max_seq_len", - # ChatGLM2 - "seq_length", - # Command-R - "model_max_length", - # Whisper - "max_target_positions", - # Others - "max_sequence_length", - "max_seq_length", - "seq_len", - ] - # Choose the smallest "max_length" from the possible keys - max_len_key = None - for key in possible_keys: - max_len = getattr(hf_config, key, None) - if max_len is not None: - max_len_key = key if max_len < derived_max_model_len \ - else max_len_key - derived_max_model_len = min(derived_max_model_len, max_len) - # For Command-R / Cohere, Cohere2 / Aya Vision models - if tmp_max_len := getattr(hf_config, "model_max_length", None): - max_len_key = "model_max_length" - derived_max_model_len = tmp_max_len - - # If sliding window is manually disabled, max_length should be less - # than the sliding window length in the model config. - if (disable_sliding_window and sliding_window is not None - and sliding_window < derived_max_model_len): - max_len_key = "sliding_window" - derived_max_model_len = sliding_window - - # Consider model_max_length in tokenizer_config - if tokenizer_config: - tokenizer_model_max_length = tokenizer_config.get( - "model_max_length", derived_max_model_len) - derived_max_model_len = min(derived_max_model_len, - tokenizer_model_max_length) - - # If none of the keys were found in the config, use a default and - # log a warning. - if derived_max_model_len == float("inf"): - if max_model_len is not None: - # If max_model_len is specified, we use it. - return max_model_len - - if spec_target_max_model_len is not None: - # If this is a speculative draft model, we use the max model len - # from the target model. - return spec_target_max_model_len - - default_max_len = 2048 - logger.warning( - "The model's config.json does not contain any of the following " - "keys to determine the original maximum length of the model: " - "%s. Assuming the model's maximum length is %d.", possible_keys, - default_max_len) - derived_max_model_len = default_max_len - - rope_scaling = getattr(hf_config, "rope_scaling", None) - # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE - # scaling, so we skip applying the scaling factor again. 
-    if rope_scaling is not None and "gemma3" not in hf_config.model_type:
-        # No need to consider "type" key because of patch_rope_scaling when
-        # loading HF config
-        rope_type = rope_scaling["rope_type"]
-
-        if rope_type not in ("su", "longrope", "llama3"):
-            if disable_sliding_window:
-                # TODO(robertgshaw): Find a model that supports rope_scaling
-                # with sliding window to see if this case should be allowed.
-                raise NotImplementedError(
-                    "Disabling sliding window is not supported for models "
-                    "with rope_scaling. Please raise an issue so we can "
-                    "investigate.")
-
-            # NOTE: rope_type == "default" does not define factor
-            # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
-            scaling_factor = rope_scaling.get("factor", 1.0)
-
-            if rope_type == "yarn":
-                derived_max_model_len = rope_scaling[
-                    "original_max_position_embeddings"]
-            derived_max_model_len *= scaling_factor
-
-    if encoder_config and "max_seq_length" in encoder_config:
-        derived_max_model_len = encoder_config["max_seq_length"]
-
-    # If the user specified a max length, make sure it is smaller than the
-    # derived length from the HF model config.
-    if max_model_len is None:
-        max_model_len = int(derived_max_model_len)
-        if current_platform.is_tpu():
-            logger.warning(
-                "--max-model-len is not specified, "
-                "so it is currently using the model's default length %s, "
-                "which might be too large. "
-                "Please set --max-model-len based on your "
-                "request input and output lengths, to avoid "
-                "unnecessary degradation.", max_model_len)
-    elif max_model_len > derived_max_model_len:
-        # Some models might have a separate key for specifying model_max_length
-        # that will be bigger than derived_max_model_len. We compare user input
-        # with model_max_length and allow this override when it's smaller.
-        model_max_length = getattr(hf_config, "model_max_length", None)
-        if model_max_length is not None and max_model_len <= model_max_length:
-            if disable_sliding_window:
-                # TODO(robertgshaw): Find a model that has model_max_length
-                # with sliding window to see if this case should be allowed.
-                raise NotImplementedError(
-                    "Disabling sliding window is not supported for models "
-                    "with model_max_length in the config. Please raise an "
-                    "issue so we can investigate.")
-        else:
-            msg = (
-                f"User-specified max_model_len ({max_model_len}) is greater "
-                f"than the derived max_model_len ({max_len_key}="
-                f"{derived_max_model_len} or model_max_length="
-                f"{model_max_length} in model's config.json).")
-            warning = (
-                "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
-                "caution. If the model uses relative position encoding (RoPE), "
-                "positions exceeding derived_max_model_len lead to nan. If the "
-                "model uses absolute position encoding, positions exceeding "
-                "derived_max_model_len will cause a CUDA array out-of-bounds "
-                "error.")
-            if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-                logger.warning_once("%s %s", msg, warning)
-            else:
-                raise ValueError(
-                    f"{msg} To allow overriding this maximum, set "
-                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
-    return int(max_model_len)
-
-
-def get_served_model_name(model: str,
-                          served_model_name: Optional[Union[str, list[str]]]):
-    """
-    If the input is a non-empty list, the first model_name in
-    `served_model_name` is taken.
-    If the input is a non-empty string, it is used directly.
-    For cases where the input is either an empty string or an
-    empty list, the fallback is to use `self.model`. 
- """ - if not served_model_name: - return model - if isinstance(served_model_name, list): - return served_model_name[0] - return served_model_name - - DetailedTraceModules = Literal["model", "worker", "all"] @@ -3012,33 +959,6 @@ def get_current_model_prefix() -> str: return _current_prefix -def contains_object_print(text): - """ - Check if the text looks like a printed Python object, e.g. - contains any substring matching the pattern: "at 0xFFFFFFF>" - We match against 0x followed by 2-16 hex chars (there's - a max of 16 on a 64-bit system). - - Args: - text (str): The text to check - - Returns: - result (bool): `True` if a match is found, `False` otherwise. - """ - pattern = r'at 0x[a-fA-F0-9]{2,16}>' - match = re.search(pattern, text) - return match is not None - - -def assert_hashable(text): - if not contains_object_print(text): - return True - raise AssertionError( - f"vLLM tried to hash some configs that may have Python objects ids " - f"in them. This is a bug, please file an issue. " - f"Text being hashed: {text}") - - T = TypeVar("T") diff --git a/vllm/config/model.py b/vllm/config/model.py new file mode 100644 index 0000000000000..21457d3660a23 --- /dev/null +++ b/vllm/config/model.py @@ -0,0 +1,2006 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import json +import warnings +from dataclasses import InitVar, field +from importlib.util import find_spec +from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Union, + cast, get_args) + +import torch +from pydantic import (ConfigDict, SkipValidation, field_validator, + model_validator) +from pydantic.dataclasses import dataclass +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from typing_extensions import assert_never + +import vllm.envs as envs +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) +from vllm.config.pooler import PoolerConfig +from vllm.config.utils import assert_hashable, config +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_tokenizer_config, is_encoder_decoder, + is_interleaved, maybe_override_with_speculators_target_model, + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) +from vllm.transformers_utils.runai_utils import (ObjectStorageModel, + is_runai_obj_uri) +from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, + LazyLoader, common_broadcastable_dtype) + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + import vllm.model_executor.layers.quantization as me_quant + import vllm.model_executor.models as me_models + from vllm.config.load import LoadConfig + from vllm.config.parallel import ParallelConfig + from vllm.config.scheduler import RunnerType + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.v1.sample.logits_processor import LogitsProcessor +else: + PretrainedConfig = Any + + me_quant = LazyLoader("model_executor", globals(), + "vllm.model_executor.layers.quantization") + me_models = LazyLoader("model_executor", globals(), + "vllm.model_executor.models") + LoadConfig = Any + ParallelConfig = Any + RunnerType = Any + QuantizationMethods = Any + LogitsProcessor = Any + 
+logger = init_logger(__name__) + +RunnerOption = Literal["auto", "generate", "pooling", "draft"] +ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertOption = Literal["auto", ConvertType] +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward", "transcription", "draft"] +_ResolvedTask = Literal["generate", "transcription", "encode", "embed", + "classify", "reward", "draft"] +TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] +LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits", + "processed_logprobs"] +HfOverrides = Union[dict[str, Any], Callable[[type], type]] +ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] + +_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { + "generate": ["generate", "transcription"], + "pooling": ["embedding", "embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { + "generate": [], + "pooling": ["embed", "classify", "reward"], + "draft": [], +} + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class ModelConfig: + """Configuration for the model.""" + + model: str = "Qwen/Qwen3-0.6B" + """Name or path of the Hugging Face model to use. It is also used as the + content for `model_name` tag in metrics output when `served_model_name` is + not specified.""" + runner: RunnerOption = "auto" + """The type of model runner to use. Each vLLM instance only supports one + model runner, even if the same model can be used for multiple types.""" + convert: ConvertOption = "auto" + """Convert the model using adapters defined in + [vllm.model_executor.models.adapters][]. The most common use case is to + adapt a text generation model to be used for pooling tasks.""" + task: Optional[TaskOption] = None + """[DEPRECATED] The task to use the model for. If the model supports more + than one model runner, this is used to select which model runner to run. + + Note that the model may support other tasks using the same model runner. + """ + tokenizer: SkipValidation[str] = None # type: ignore + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode = "auto" + """Tokenizer mode:\n + - "auto" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "custom" will use --tokenizer to select the preregistered tokenizer.""" + trust_remote_code: bool = False + """Trust remote code (e.g., from HuggingFace) when downloading the model + and tokenizer.""" + dtype: Union[ModelDType, torch.dtype] = "auto" + """Data type for model weights and activations:\n + - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 + precision for BF16 models.\n + - "half" for FP16. Recommended for AWQ quantization.\n + - "float16" is the same as "half".\n + - "bfloat16" for a balance between precision and range.\n + - "float" is shorthand for FP32 precision.\n + - "float32" for FP32 precision.""" + seed: Optional[int] = None + """Random seed for reproducibility. Initialized to None in V0, but + initialized to 0 in V1.""" + hf_config_path: Optional[str] = None + """Name or path of the Hugging Face config to use. 
If unspecified, model
+    name or path will be used."""
+    allowed_local_media_path: str = ""
+    """Allow API requests to read local images or videos from directories
+    specified by the server file system. This is a security risk. Should only
+    be enabled in trusted environments."""
+    revision: Optional[str] = None
+    """The specific model version to use. It can be a branch name, a tag name,
+    or a commit id. If unspecified, will use the default version."""
+    code_revision: Optional[str] = None
+    """The specific revision to use for the model code on the Hugging Face Hub.
+    It can be a branch name, a tag name, or a commit id. If unspecified, will
+    use the default version."""
+    rope_scaling: dict[str, Any] = field(default_factory=dict)
+    """RoPE scaling configuration. For example,
+    `{"rope_type":"dynamic","factor":2.0}`."""
+    rope_theta: Optional[float] = None
+    """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
+    theta improves the performance of the scaled model."""
+    tokenizer_revision: Optional[str] = None
+    """The specific revision to use for the tokenizer on the Hugging Face Hub.
+    It can be a branch name, a tag name, or a commit id. If unspecified, will
+    use the default version."""
+    max_model_len: SkipValidation[int] = None  # type: ignore
+    """Model context length (prompt and output). If unspecified, will be
+    automatically derived from the model config.
+
+    When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
+    format. Examples:\n
+    - 1k -> 1000\n
+    - 1K -> 1024\n
+    - 25.6k -> 25,600"""
+    spec_target_max_model_len: Optional[int] = None
+    """Specify the maximum length for spec decoding draft models."""
+    quantization: SkipValidation[Optional[QuantizationMethods]] = None
+    """Method used to quantize the weights. If `None`, we first check the
+    `quantization_config` attribute in the model config file. If that is
+    `None`, we assume the model weights are not quantized and use `dtype` to
+    determine the data type of the weights."""
+    enforce_eager: bool = False
+    """Whether to always use eager-mode PyTorch. If True, we will disable CUDA
+    graph and always execute the model in eager mode. If False, we will use
+    CUDA graph and eager execution in hybrid for maximal performance and
+    flexibility."""
+    max_seq_len_to_capture: int = 8192
+    """Maximum sequence length covered by CUDA graphs. When a sequence has
+    context length larger than this, we fall back to eager mode. Additionally
+    for encoder-decoder models, if the sequence length of the encoder input is
+    larger than this, we fall back to eager mode."""
+    max_logprobs: int = 20
+    """Maximum number of log probabilities to return when `logprobs` is
+    specified in `SamplingParams`. The default value comes from the default
+    for the OpenAI Chat Completions API. -1 means no cap, i.e. all
+    (output_length * vocab_size) logprobs are allowed to be returned and it
+    may cause OOM."""
+    logprobs_mode: LogprobsMode = "raw_logprobs"
+    """Indicates the content returned in the logprobs and prompt_logprobs.
+    Supported modes:
+    1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits.
+    Raw means the values before applying any logit processors, like bad words.
+    Processed means the values after applying all processors, including
+    temperature and top_k/top_p.
+    """
+    disable_sliding_window: bool = False
+    """Whether to disable sliding window. If True, we will disable the sliding
+    window functionality of the model, capping the max length to the sliding
+    window size. If the model does not support sliding window, this argument
+    is ignored."""
+    disable_cascade_attn: bool = False
+    """Disable cascade attention for V1. While cascade attention does not
+    change the mathematical correctness, disabling it could be useful for
+    preventing potential numerical issues. Note that even if this is set to
+    False, cascade attention will only be used when the heuristic tells that
+    it's beneficial."""
+    skip_tokenizer_init: bool = False
+    """Skip initialization of tokenizer and detokenizer. Expects valid
+    `prompt_token_ids` and `None` for prompt from the input. The generated
+    output will contain token ids."""
+    enable_prompt_embeds: bool = False
+    """If `True`, enables passing text embeddings as inputs via the
+    `prompt_embeds` key. Note that enabling this will double the time required
+    for graph compilation."""
+    served_model_name: Optional[Union[str, list[str]]] = None
+    """The model name(s) used in the API. If multiple names are provided, the
+    server will respond to any of the provided names. The model name in the
+    model field of a response will be the first name in this list. If not
+    specified, the model name will be the same as the `--model` argument. Note
+    that this name(s) will also be used in the `model_name` tag of Prometheus
+    metrics; if multiple names are provided, the metrics tag will take the
+    first one."""
+    use_async_output_proc: bool = True
+    """Whether to use async output processor."""
+    config_format: Union[str, ConfigFormat] = "auto"
+    """The format of the model config to load:\n
+    - "auto" will try to load the config in hf format if available else it
+    will try to load in mistral format.\n
+    - "hf" will load the config in hf format.\n
+    - "mistral" will load the config in mistral format."""
+    hf_token: Optional[Union[bool, str]] = None
+    """The token to use as HTTP bearer authorization for remote files. If
+    `True`, will use the token generated when running `huggingface-cli login`
+    (stored in `~/.huggingface`)."""
+    hf_overrides: HfOverrides = field(default_factory=dict)
+    """If a dictionary, contains arguments to be forwarded to the Hugging Face
+    config. If a callable, it is called to update the HuggingFace config."""
+    logits_processor_pattern: Optional[str] = None
+    """Optional regex pattern specifying valid logits processor qualified names
+    that can be passed with the `logits_processors` extra completion argument.
+    Defaults to `None`, which allows no processors."""
+    generation_config: str = "auto"
+    """The folder path to the generation config. Defaults to `"auto"`, in which
+    case the generation config will be loaded from the model path. If set to
+    `"vllm"`, no generation config is loaded and vLLM defaults will be used.
+    If set to a folder path, the generation config will be loaded from the
+    specified folder path. If `max_new_tokens` is specified in the generation
+    config, then it sets a server-wide limit on the number of output tokens
+    for all requests."""
+    override_generation_config: dict[str, Any] = field(default_factory=dict)
+    """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If
+    used with `--generation-config auto`, the override parameters will be
+    merged with the default config from the model. 
If used with + `--generation-config vllm`, only the override parameters are used.""" + enable_sleep_mode: bool = False + """Enable sleep mode for the engine (only cuda platform is supported).""" + model_impl: Union[str, ModelImpl] = "auto" + """Which implementation of the model to use:\n + - "auto" will try to use the vLLM implementation, if it exists, and fall + back to the Transformers implementation if no vLLM implementation is + available.\n + - "vllm" will use the vLLM model implementation.\n + - "transformers" will use the Transformers model implementation.\n + - "terratorch" will use the TerraTorch model implementation. + """ + override_attention_dtype: Optional[str] = None + """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" + io_processor_plugin: Optional[str] = None + """IOProcessor plugin name to load at model startup""" + + # Pooler config + pooler_config: Optional[PoolerConfig] = None + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: Optional[Union[dict, PoolerConfig]] = None + """[DEPRECATED] Use `pooler_config` instead. This field will be removed in + v0.12.0 or v1.0.0, whichever is sooner.""" + + # Multimodal config and init vars + multimodal_config: Optional[MultiModalConfig] = None + """Configuration for multimodal model. If `None`, this will be inferred + from the architecture of `self.model`.""" + limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None + mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None + mm_processor_cache_gb: InitVar[Optional[float]] = None + mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None + mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None + mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None + interleave_mm_strings: InitVar[Optional[bool]] = None + skip_mm_profiling: InitVar[Optional[bool]] = None + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.model) + factors.append(self.dtype) + factors.append(self.quantization) + factors.append(self.revision) + factors.append(self.code_revision) + factors.append(self.max_model_len) + factors.append(self.max_logprobs) + factors.append(self.disable_sliding_window) + factors.append(self.trust_remote_code) + factors.append(self.generation_config) + factors.append(self.model_impl) + factors.append(self.override_generation_config) + factors.append(self.rope_scaling) + factors.append(self.rope_theta) + # hf_config can control how the model looks! 
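+        # (For example, two checkpoints that differ only in a field such as
+        # num_hidden_layers would otherwise hash identically, so the
+        # serialized hf_config is folded into the factors below; an assumed
+        # illustration, not an exhaustive list of graph-affecting fields.)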
+ factors.append(self.hf_config.to_json_string()) + str_factors = str(factors) + assert_hashable(str_factors) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__( + self, + # Multimodal config init vars + limit_mm_per_prompt: Optional[dict[str, int]], + media_io_kwargs: Optional[dict[str, dict[str, Any]]], + mm_processor_kwargs: Optional[dict[str, Any]], + mm_processor_cache_gb: Optional[float], + mm_processor_cache_type: Optional[MMCacheType], + mm_shm_cache_max_object_size_mb: Optional[int], + mm_encoder_tp_mode: Optional[MMEncoderTPMode], + interleave_mm_strings: Optional[bool], + skip_mm_profiling: Optional[bool]) -> None: + # Set the default seed to 0 in V1. + # NOTE(woosuk): In V0, we set the default seed to None because the + # driver worker shares the same process as the user process, and thus + # setting a seed affects the user process as well. + # In V1, we use separate processes for workers (unless + # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here + # doesn't affect the user process. However, without a consistent seed, + # different tensor parallel workers would sample different tokens, + # leading to inconsistent results. + if envs.VLLM_USE_V1 and self.seed is None: + self.seed = 0 + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", self.seed) + + # Keep set served_model_name before maybe_model_redirect(self.model) + self.served_model_name = get_served_model_name(self.model, + self.served_model_name) + self.model = maybe_model_redirect(self.model) + # The tokenizer is consistent with the model by default. + if self.tokenizer is None: + self.tokenizer = self.model + if self.tokenizer_revision is None: + self.tokenizer_revision = self.revision + self.tokenizer = maybe_model_redirect(self.tokenizer) + + if isinstance(self.hf_config_path, str): + self.hf_config_path = maybe_model_redirect(self.hf_config_path) + + if callable(self.hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = self.hf_overrides + else: + hf_overrides_kw = self.hf_overrides + hf_overrides_fn = None + + if self.rope_scaling: + hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling} + hf_overrides_kw.update(hf_override) + hf_overrides_str = json.dumps(hf_overrides_kw) + msg = ( + "`--rope-scaling` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if self.rope_theta is not None: + hf_override = {"rope_theta": self.rope_theta} + hf_overrides_kw.update(hf_override) + hf_overrides_str = json.dumps(hf_overrides_kw) + msg = ( + "`--rope-theta` will be removed in a future release. 
" + f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + + if self.runner != "draft": + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code) + + if (backend := envs.VLLM_ATTENTION_BACKEND + ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: + raise ValueError( + "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " + "module was not found. See " + "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 + "for instructions on how to install it.") + + from vllm.platforms import current_platform + + if (self.override_attention_dtype is not None + and not current_platform.is_rocm()): + warnings.warn( + "override-attention-dtype is set but not using ROCm platform", + stacklevel=2) + + if (self.enable_sleep_mode + and not current_platform.is_sleep_mode_available()): + raise ValueError( + "Sleep mode is not supported on current platform.") + + hf_config = get_config(self.hf_config_path or self.model, + self.trust_remote_code, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn) + + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) + self.attention_chunk_size = getattr(self.hf_text_config, + "attention_chunk_size", None) + self.encoder_config = self._get_encoder_config() + self.hf_image_processor_config = get_hf_image_processor_config( + self.model, hf_token=self.hf_token, revision=self.revision) + + architectures = self.architectures + registry = self.registry + is_generative_model = registry.is_text_generation_model( + architectures, self) + is_pooling_model = registry.is_pooling_model(architectures, self) + + def _task_to_convert(task: TaskOption) -> ConvertType: + if task == "embedding" or task == "embed": + return "embed" + if task == "classify": + return "classify" + if task == "reward": + return "reward" + if task == "score": + new_task = self._get_default_pooling_task(architectures) + return "classify" if new_task == "classify" else "embed" + + return "none" + + if self.task is not None: + runner: RunnerOption = "auto" + convert: ConvertOption = "auto" + msg_prefix = ("The 'task' option has been deprecated and will be " + "removed in v0.13.0 or v1.0, whichever comes first.") + msg_hint = "Please remove this option." 
+
+            is_generative_task = self.task in _RUNNER_TASKS["generate"]
+            is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
+
+            if is_generative_model and is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = ("Please replace this option with `--runner "
+                                "generate` to continue using this model "
+                                "as a generative model.")
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = "auto"
+                    msg_hint = ("Please replace this option with `--runner "
+                                "pooling` to continue using this model "
+                                "as a pooling model.")
+                else:  # task == "auto"
+                    pass
+            elif is_generative_model or is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = "Please remove this option."
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = _task_to_convert(self.task)
+                    msg_hint = ("Please replace this option with `--convert "
+                                f"{convert}` to continue using this model "
+                                "as a pooling model.")
+                else:  # task == "auto"
+                    pass
+            else:
+                raise AssertionError("The model should be a generative or "
+                                     "pooling model when task is set to "
+                                     f"{self.task!r}.")
+
+            self.runner = runner
+            self.convert = convert
+
+            msg = f"{msg_prefix} {msg_hint}"
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
+
+        self.runner_type = self._get_runner_type(architectures, self.runner)
+        self.convert_type = self._get_convert_type(architectures,
+                                                   self.runner_type,
+                                                   self.convert)
+
+        if self.runner_type == "generate" and not is_generative_model:
+            generate_converts = _RUNNER_CONVERTS["generate"]
+            if self.convert_type not in generate_converts:
+                # Currently we don't have any converters for generative models
+                raise ValueError(
+                    "This model does not support `--runner generate`.")
+        if self.runner_type == "pooling" and not is_pooling_model:
+            pooling_converts = _RUNNER_CONVERTS["pooling"]
+            if self.convert_type not in pooling_converts:
+                convert_option = "<" + "|".join(pooling_converts) + ">"
+                raise ValueError(
+                    "This model does not support `--runner pooling`. "
+                    f"You can pass `--convert {convert_option}` to adapt "
+                    "it into a pooling model.")
+
+        self.supported_tasks = self._get_supported_tasks(
+            architectures, self.runner_type, self.convert_type)
+
+        # Note: Initialize these attributes early because transformers fallback
+        # may fail to load dynamic modules in child processes
+        model_info, arch = registry.inspect_model_cls(architectures, self)
+        self._model_info = model_info
+        self._architecture = arch
+        logger.info("Resolved architecture: %s", arch)
+
+        # Init pooler config if needed
+        if self.runner_type == "pooling":
+            if self.override_pooler_config is not None:
+                logger.warning_once(
+                    "`override_pooler_config` is deprecated and will be "
+                    "removed in v0.12.0 or v1.0.0, whichever is sooner. 
" + "Please use `pooler_config` instead.") + + if isinstance(self.override_pooler_config, dict): + self.pooler_config = PoolerConfig( + **self.override_pooler_config) + else: + self.pooler_config = self.override_pooler_config + + if self.pooler_config is None: + self.pooler_config = PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(self.pooler_config, k) is None: + setattr(self.pooler_config, k, v) + + default_pooling_type = self._model_info.default_pooling_type + if self.pooler_config.pooling_type is None: + self.pooler_config.pooling_type = default_pooling_type + + self.dtype: torch.dtype = _get_and_verify_dtype( + self.model, + self.hf_config, + self.dtype, + is_pooling_model=self.runner_type == "pooling", + revision=self.revision, + ) + + # Interleaved attention is not supported by some backends in V0 + if (not self.disable_sliding_window + and is_interleaved(self.hf_text_config) + and not envs.VLLM_USE_V1 + and (backend := envs.VLLM_ATTENTION_BACKEND) + in ("XFORMERS", "FLASHINFER")): + logger.warning_once( + "%s has interleaved attention, which is currently not " + "supported by the %s backend. Disabling sliding window and " + "capping the max length to the sliding window size (%d).", + self.hf_text_config.model_type, + backend, + self.hf_text_config.sliding_window, + ) + self.disable_sliding_window = True + + self.original_max_model_len = self.max_model_len + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + # Init multimodal config if needed + if self._model_info.supports_multimodal: + if (mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. 
" + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + + if self.disable_sliding_window: + # Set after get_and_verify_max_len to ensure that max_model_len + # can be correctly capped to sliding window size + self.hf_text_config.sliding_window = None + + if not self.skip_tokenizer_init: + self._verify_tokenizer_mode() + + # Avoid running try_verify_and_update_config multiple times + self.config_updated = False + + self._verify_quantization() + self._verify_cuda_graph() + self._verify_bnb_config() + + @field_validator("quantization", mode="before") + @classmethod + def validate_quantization_before(cls, value: Any) -> Any: + if isinstance(value, str): + return value.lower() + return value + + @model_validator(mode="after") + def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") + if not isinstance(self.max_model_len, int): + raise ValueError( + "max_model_len must be an integer after __post_init__.") + return self + + def _get_transformers_backend_cls(self) -> str: + """Determine which Transformers backend class will be used if + `model_impl` is set to `transformers` or `auto`.""" + if getattr(self, "runner_type", self.runner) == "pooling": + return "TransformersModel" + if self.hf_config != self.hf_text_config: + # If 'hf_text_config' is the same as 'hf_config'. If not, it is + # probably a composite config, i.e. multimodal + return "TransformersForMultimodalLM" + return "TransformersForCausalLM" + + def using_transformers_backend(self) -> bool: + """Check if the model is using the Transformers backend class.""" + return self.architecture == self._get_transformers_backend_cls() + + @property + def registry(self): + return me_models.ModelRegistry + + @property + def architectures(self) -> list[str]: + return getattr(self.hf_config, "architectures", []) + + @property + def architecture(self) -> str: + """The architecture vllm actually used.""" + return self._architecture + + def maybe_pull_model_tokenizer_for_runai(self, model: str, + tokenizer: str) -> None: + """Pull model/tokenizer from Object Storage to temporary + directory when needed. 
+
+        Args:
+            model: Model name or path
+            tokenizer: Tokenizer name or path
+        """
+        if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
+            return
+
+        if is_runai_obj_uri(model):
+            object_storage_model = ObjectStorageModel()
+            object_storage_model.pull_files(
+                model, allow_pattern=["*.model", "*.py", "*.json"])
+            self.model_weights = model
+            self.model = object_storage_model.dir
+
+            # If tokenizer is same as model, download to same directory
+            if model == tokenizer:
+                object_storage_model.pull_files(model,
+                                                ignore_pattern=[
+                                                    "*.pt", "*.safetensors",
+                                                    "*.bin", "*.tensors",
+                                                    "*.pth"
+                                                ])
+                self.tokenizer = object_storage_model.dir
+                return
+
+        # Only download tokenizer if needed and not already handled
+        if is_runai_obj_uri(tokenizer):
+            object_storage_tokenizer = ObjectStorageModel()
+            object_storage_tokenizer.pull_files(tokenizer,
+                                                ignore_pattern=[
+                                                    "*.pt", "*.safetensors",
+                                                    "*.bin", "*.tensors",
+                                                    "*.pth"
+                                                ])
+            self.tokenizer = object_storage_tokenizer.dir
+
+    def _get_encoder_config(self):
+        return get_sentence_transformer_tokenizer_config(
+            self.model, self.revision)
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
+        if tokenizer_mode not in get_args(TokenizerMode):
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                f"one of {get_args(TokenizerMode)}.")
+        self.tokenizer_mode = tokenizer_mode
+
+    def _get_default_runner_type(
+        self,
+        architectures: list[str],
+    ) -> RunnerType:
+        registry = self.registry
+
+        # Some Sentence Transformers models use *ForCausalLM archs
+        if get_pooling_config(self.model, self.revision):
+            return "pooling"
+
+        for arch in architectures:
+            if arch in registry.get_supported_archs():
+                if registry.is_pooling_model(architectures, self):
+                    return "pooling"
+                if registry.is_text_generation_model(architectures, self):
+                    return "generate"
+
+            match = try_match_architecture_defaults(arch)
+            if match:
+                _, (runner_type, _) = match
+                return runner_type
+
+        return "generate"
+
+    def _get_runner_type(
+        self,
+        architectures: list[str],
+        runner: RunnerOption,
+    ) -> RunnerType:
+        if runner != "auto":
+            return runner
+
+        runner_type = self._get_default_runner_type(architectures)
+
+        # Don't log the most common case
+        if runner_type != "generate":
+            logger.info(
+                "Resolved `--runner auto` to `--runner %s`. 
" + "Pass the value explicitly to silence this message.", + runner_type) + + return runner_type + + def _get_default_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + ) -> ConvertType: + registry = self.registry + + for arch in architectures: + if arch in registry.get_supported_archs(): + if (runner_type == "generate" + and registry.is_text_generation_model( + architectures, self)): + return "none" + if (runner_type == "pooling" + and registry.is_pooling_model(architectures, self)): + return "none" + + match = try_match_architecture_defaults(arch, + runner_type=runner_type) + if match: + _, (_, convert_type) = match + return convert_type + + # This is to handle Sentence Transformers models that use *ForCausalLM + # and also multi-modal pooling models which are not defined as + # Sentence Transformers models + if runner_type == "pooling": + return "embed" + + return "none" + + def _get_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + convert: ConvertOption, + ) -> ConvertType: + if convert != "auto": + return convert + + convert_type = self._get_default_convert_type(architectures, + runner_type) + + # Don't log the most common case + if convert_type != "none": + logger.info( + "Resolved `--convert auto` to `--convert %s`. " + "Pass the value explicitly to silence this message.", + convert_type) + + return convert_type + + def _get_supported_generation_tasks( + self, + architectures: list[str], + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + registry = self.registry + + if registry.is_transcription_only_model(architectures, self): + return ["transcription"] + + # TODO: Use get_supported_generation_tasks once V0 is removed + supported_tasks = list[_ResolvedTask]() + if (registry.is_text_generation_model(architectures, self) + or convert_type in _RUNNER_CONVERTS["generate"]): + supported_tasks.append("generate") + + if registry.is_transcription_model(architectures, self): + supported_tasks.append("transcription") + + return supported_tasks + + def _get_default_pooling_task( + self, + architectures: list[str], + ) -> Literal["embed", "classify", "reward"]: + if self.registry.is_cross_encoder_model(architectures, self): + return "classify" + + for arch in architectures: + match = try_match_architecture_defaults(arch, + runner_type="pooling") + if match: + _, (_, convert_type) = match + assert convert_type != "none" + return convert_type + + return "embed" + + def _get_supported_pooling_tasks( + self, + architectures: list[str], + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + registry = self.registry + + # TODO: Use get_supported_pooling_tasks once V0 is removed + supported_tasks = list[_ResolvedTask]() + if (registry.is_pooling_model(architectures, self) + or convert_type in _RUNNER_CONVERTS["pooling"]): + supported_tasks.append("encode") + + extra_task = (self._get_default_pooling_task(architectures) + if convert_type == "none" else convert_type) + supported_tasks.append(extra_task) + + return supported_tasks + + def _get_supported_tasks( + self, + architectures: list[str], + runner_type: RunnerType, + convert_type: ConvertType, + ) -> list[_ResolvedTask]: + if runner_type == "generate": + return self._get_supported_generation_tasks( + architectures, convert_type) + if runner_type == "pooling": + return self._get_supported_pooling_tasks(architectures, + convert_type) + if runner_type == "draft": + return ["draft"] + + assert_never(runner_type) + + def _parse_quant_hf_config(self, hf_config: PretrainedConfig): + 
quant_cfg = getattr(hf_config, "quantization_config", None)
+        if quant_cfg is None:
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(hf_config, "compression_config", None)
+
+        else:
+            # Set quant_method for ModelOpt models.
+            producer_name = quant_cfg.get("producer", {}).get("name")
+            if producer_name == "modelopt":
+                quant_algo = quant_cfg.get("quantization",
+                                           {}).get("quant_algo")
+                if quant_algo == "FP8":
+                    quant_cfg["quant_method"] = "modelopt"
+                elif quant_algo == "NVFP4":
+                    quant_cfg["quant_method"] = "modelopt_fp4"
+                elif quant_algo is not None:
+                    raise ValueError(
+                        f"Unknown ModelOpt quant algo: {quant_algo}")
+
+        return quant_cfg
+
+    def _verify_quantization(self) -> None:
+        supported_quantization = me_quant.QUANTIZATION_METHODS
+        if self.quantization is not None:
+            self.quantization = cast(me_quant.QuantizationMethods,
+                                     self.quantization)
+
+        # Parse quantization method from the HF model config, if available.
+        quant_cfg = self._parse_quant_hf_config(self.hf_config)
+        if quant_cfg is None and (text_config := getattr(
+                self.hf_config, "text_config", None)):
+            # Check the text config as well for multi-modal models.
+            quant_cfg = self._parse_quant_hf_config(text_config)
+
+        if quant_cfg is not None:
+            # Use the community standard 'quant_method'
+            quant_method = quant_cfg.get("quant_method", "").lower()
+
+            # Normalize library names
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+
+            quant_cfg["quant_method"] = quant_method
+
+            # Quantization methods which are overrides (i.e. they have a
+            # `override_quantization_method` method) must be checked in order
+            # of preference (this is particularly important for GPTQ).
+            overrides = [
+                "bitblas",
+                "gptq_marlin_24",
+                "gptq_marlin",
+                "gptq_bitblas",
+                "awq_marlin",
+                "ipex",
+                "moe_wna16",
+                "modelopt",
+                "modelopt_fp4",
+                "petit_nvfp4",
+            ]
+            quantization_methods = [
+                q for q in supported_quantization if q not in overrides
+            ]
+            # Any custom overrides will be in quantization_methods so we place
+            # them at the start of the list so custom overrides have preference
+            # over the built-in ones.
+            quantization_methods = quantization_methods + overrides
+
+            # Detect which checkpoint it is
+            for name in quantization_methods:
+                method = me_quant.get_quantization_config(name)
+                quantization_override = method.override_quantization_method(
+                    quant_cfg, self.quantization)
+                if quantization_override is not None:
+                    # Raise error if the override is not custom (custom would
+                    # be in QUANTIZATION_METHODS but not QuantizationMethods)
+                    # and hasn't been added to the overrides list.
+                    if (name in get_args(me_quant.QuantizationMethods)
+                            and name not in overrides):
+                        raise ValueError(
+                            f"Quantization method {name} is an override but "
+                            "has not been added to the `overrides` list "
+                            "above. This is necessary to ensure that the "
+                            "overrides are checked in order of preference.")
+                    quant_method = quantization_override
+                    self.quantization = quantization_override
+                    break
+
+            # Verify quantization configurations.
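+            # (Illustrative, hardware-dependent example: a checkpoint whose
+            # config declares "gptq" may be resolved to "gptq_marlin" by the
+            # override chain above on GPUs that support the Marlin kernels.)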
+ if self.quantization is None: + self.quantization = quant_method + elif self.quantization != quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization}).") + + if self.quantization is not None: + if self.quantization not in supported_quantization: + raise ValueError( + f"Unknown quantization method: {self.quantization}. Must " + f"be one of {supported_quantization}.") + from vllm.platforms import current_platform + current_platform.verify_quantization(self.quantization) + + def _verify_cuda_graph(self) -> None: + # The `max_seq_len_to_capture` was incorrectly + # based on the encoder's input length (448) + # but not the decoder's larger input length (1500). + # This change ensures the CUDA Graph captures the correct, + # larger sequence length, allowing it to work as intended. + effective_max_seq_len = self.max_model_len + if self.is_encoder_decoder: + effective_max_seq_len = max( + effective_max_seq_len, + getattr(self.hf_config, "max_source_positions", 0)) + self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, + effective_max_seq_len) + # CUDAGraph capture not supported for encoder-decoder models on ROCm + unsupported_rocm = self.is_encoder_decoder + + if (unsupported_rocm and not self.enforce_eager + and current_platform.is_rocm()): + logger.warning( + "CUDA graph is not supported for %s on ROCm yet, fallback " + "to eager mode.", self.hf_config.model_type) + self.enforce_eager = True + + def _verify_bnb_config(self) -> None: + """ + The current version of bitsandbytes (0.46.1) with 8-bit models does not + yet support CUDA graph. + # TODO Remove this when bitsandbytes supports. 
+ """ + is_bitsandbytes = self.quantization == "bitsandbytes" + has_quantization_config = (getattr(self.hf_config, + "quantization_config", None) + is not None) + is_8bit = (self.hf_config.quantization_config.get( + "load_in_8bit", False) if has_quantization_config else False) + if all([ + is_bitsandbytes, + has_quantization_config, + is_8bit, + not self.enforce_eager, + ]): + logger.warning( + "CUDA graph is not supported on BitsAndBytes 8bit yet, " + "fallback to the eager mode.") + + self.enforce_eager = True + + def _verify_with_expert_parallelism(self) -> None: + num_expert_names = [ + "moe_num_experts", # Dbrx + "num_experts", # Jamba + "n_routed_experts", # DeepSeek + "num_local_experts", # Mixtral + ] + num_experts = 0 + for name in num_expert_names: + num_experts = getattr(self.hf_text_config, name, 0) + if num_experts > 0: + break + if num_experts < 1: + raise ValueError( + "Number of experts in the model must be greater than 0 " + "when expert parallelism is enabled.") + + def verify_dual_chunk_attention_config( + self, + load_config: LoadConfig, + ) -> None: + if hasattr(self.hf_config, "dual_chunk_attention_config"): + # Try loading the sparse attention config + from vllm.model_executor.model_loader.weight_utils import ( + get_sparse_attention_config) + sparse_attn_config = get_sparse_attention_config(self, load_config) + if sparse_attn_config: + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_config"] = sparse_attn_config + if "sparse_attention_enabled" not in \ + self.hf_config.dual_chunk_attention_config: + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_enabled"] = True + + if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: + raise ValueError("please set VLLM_ATTENTION_BACKEND to " + f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") + + def verify_async_output_proc(self, parallel_config, speculative_config, + device_config) -> None: + if not self.use_async_output_proc: + # Nothing to check + return + + if parallel_config.pipeline_parallel_size > 1: + self.use_async_output_proc = False + return + + # Reminder: Please update docs/features/compatibility_matrix.md + # If the feature combo become valid + from vllm.platforms import current_platform + if not current_platform.is_async_output_supported(self.enforce_eager): + self.use_async_output_proc = False + return + + if envs.VLLM_USE_RAY_SPMD_WORKER: + self.use_async_output_proc = False + return + + # Async postprocessor is not necessary for pooling models + # since there is no token generation + if self.runner_type == "pooling": + self.use_async_output_proc = False + + # Reminder: Please update docs/features/compatibility_matrix.md + # If the feature combo become valid + if speculative_config: + self.use_async_output_proc = False + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + + if parallel_config.distributed_executor_backend == "external_launcher": + assert self.seed is not None, ( + "Seed must be set when using external launcher backend to " + "make sure sampling results are the same across workers.") + + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) + tensor_parallel_size = parallel_config.tensor_parallel_size + if total_num_attention_heads % tensor_parallel_size != 0: + raise ValueError( + f"Total number of attention heads ({total_num_attention_heads})" + " must be divisible by tensor parallel size " + f"({tensor_parallel_size}).") + + if parallel_config.enable_expert_parallel: + 
self._verify_with_expert_parallelism() + + pipeline_parallel_size = parallel_config.pipeline_parallel_size + if pipeline_parallel_size > 1: + if not self.registry.is_pp_supported_model(self.architectures, + self): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. " + "Supported models implement the `SupportsPP` interface.") + + if self.use_async_output_proc: + self.use_async_output_proc = False + + def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size from the HF text config if present.""" + return getattr(self.hf_text_config, "sliding_window", None) + + def get_vocab_size(self) -> int: + return getattr(self.hf_text_config, "vocab_size", 0) + + def get_hidden_size(self) -> int: + return getattr(self.hf_text_config, "hidden_size", 0) + + @property + def is_deepseek_mla(self) -> bool: + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in \ + ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == 'eagle': + # if the model is an EAGLE module, check for the + # underlying architecture + return self.hf_text_config.model.model_type in \ + ('deepseek_v2', 'deepseek_v3') \ + and self.hf_text_config.kv_lora_rank is not None + return False + + def get_head_size(self) -> int: + # TODO remove hard code + if self.is_deepseek_mla: + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", + 0) + if self.use_mla: + return self.hf_text_config.kv_lora_rank + qk_rope_head_dim + else: + qk_nope_head_dim = getattr(self.hf_text_config, + "qk_nope_head_dim", 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim + + if hasattr(self.hf_text_config, + "model_type") and (self.hf_text_config.model_type + == "zamba2"): + return self.hf_text_config.attention_head_dim + + if self.is_attention_free: + return 0 + + # NOTE: Some configs may set head_dim=None in the config + if getattr(self.hf_text_config, "head_dim", None) is not None: + return self.hf_text_config.head_dim + + # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` + if getattr(self.hf_text_config, "hidden_size_per_head", + None) is not None: + return self.hf_text_config.hidden_size_per_head + + # FIXME(woosuk): This may not be true for all models. + return (self.hf_text_config.hidden_size // + self.hf_text_config.num_attention_heads) + + def get_total_num_kv_heads(self) -> int: + """Returns the total number of KV heads.""" + # For GPTBigCode & Falcon: + # NOTE: for falcon, when new_decoder_architecture is True, the + # multi_query flag is ignored and we use n_head_kv for the number of + # KV heads. + falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] + new_decoder_arch_falcon = ( + self.hf_config.model_type in falcon_model_types + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_text_config, + "multi_query", False): + # Multi-query attention, only one KV head. + # Currently, tensor parallelism is not supported in this case. 
+ return 1 + + # For DBRX and MPT + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": + return getattr(self.hf_config.attn_config, "kv_n_heads", + self.hf_config.num_attention_heads) + + if self.hf_config.model_type == "nemotron-nas": + for block in self.hf_config.block_configs: + if not block.attention.no_op: + return self.hf_config.num_attention_heads \ + // block.attention.n_heads_in_group + + raise RuntimeError("Couldn't determine number of kv heads") + + if self.is_attention_free: + return 0 + + attributes = [ + # For Falcon: + "n_head_kv", + "num_kv_heads", + # For LLaMA-2: + "num_key_value_heads", + # For ChatGLM: + "multi_query_group_num", + ] + for attr in attributes: + num_kv_heads = getattr(self.hf_text_config, attr, None) + if num_kv_heads is not None: + return num_kv_heads + + # For non-grouped-query attention models, the number of KV heads is + # equal to the number of attention heads. + return self.hf_text_config.num_attention_heads + + def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: + """Returns the number of KV heads per GPU.""" + if self.use_mla: + # When using MLA during decode it becomes MQA + return 1 + + total_num_kv_heads = self.get_total_num_kv_heads() + # If tensor parallelism is used, we divide the number of KV heads by + # the tensor parallel size. We will replicate the KV heads in the + # case where the number of KV heads is smaller than the tensor + # parallel size so each GPU has at least one KV head. + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) + + def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int: + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + if (self.hf_text_config.model_type == "deepseek_mtp" + or self.hf_config.model_type == "mimo_mtp" + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp" + or self.hf_config.model_type == "qwen3_next_mtp"): + total_num_hidden_layers = getattr(self.hf_text_config, + "num_nextn_predict_layers", 0) + else: + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) + # the layout order is: DP x PP x TP + pp_rank = (parallel_config.rank // parallel_config.tensor_parallel_size + ) % parallel_config.pipeline_parallel_size + pp_size = parallel_config.pipeline_parallel_size + start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) + return start, end + + def get_num_layers(self, parallel_config: ParallelConfig) -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start + + def get_num_layers_by_block_type( + self, + parallel_config: ParallelConfig, + block_type: LayerBlockType = LayerBlockType.attention, + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == LayerBlockType.attention + is_transformer = not self.is_hybrid and \ + not self.has_noops and \ + not self.is_attention_free + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start 
if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + elif self.has_noops: + block_configs = self.hf_config.block_configs + return sum(not bc.attention.no_op + for bc in block_configs[start:end]) + else: + # Hybrid model Jamba + layers_block_type_value = getattr(self.hf_text_config, + "layers_block_type", None) + if layers_block_type_value is not None: + if hasattr(self.hf_text_config, + "model_type") and (self.hf_text_config.model_type + == "zamba2"): + if attn_block_type: + return sum(t == "hybrid" + for t in layers_block_type_value[start:end]) + else: + return self.get_num_layers(parallel_config) + return sum(t == block_type.value + for t in layers_block_type_value[start:end]) + + # Hybrid model Minimax + attn_type_list = getattr(self.hf_config, "attn_type_list", None) + if attn_type_list: + return sum(t == 1 for t in attn_type_list[start:end]) + + # Hybrid model Qwen3Next + layer_types_value = getattr(self.hf_config, "layer_types", None) + if layer_types_value is not None: + if getattr(block_type, "value", block_type) == "attention": + return sum(t == "full_attention" + for t in layer_types_value[start:end]) + elif getattr(block_type, "value", + block_type) == "linear_attention": + return sum(t == "linear_attention" + for t in layer_types_value[start:end]) + else: + return sum(t == getattr(block_type, "value", block_type) + for t in layer_types_value[start:end]) + + if (layers_block_type_value is None and attn_type_list is None + and layer_types_value is None): + raise ValueError( + "The model is a hybrid without a " + "layers_block_type, an attn_type_list, or a layer_types " + "in the hf_config; cannot determine the number of " + f"{block_type.value} layers") + + def get_mamba_chunk_size(self) -> Optional[int]: + """ + Returns the mamba chunk size if it exists. + """ + # used by e.g. Bamba, FalconH1, Granite, PLaMo2 + chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None) + if chunk_size is None: + # used by e.g. Mamba2, NemotronH, Zamba + chunk_size = getattr(self.hf_text_config, "chunk_size", None) + return chunk_size + + def get_multimodal_config(self) -> MultiModalConfig: + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + if self.multimodal_config is None: + raise ValueError("The model is not multimodal.") + + return self.multimodal_config + + def try_get_generation_config(self) -> dict[str, Any]: + """ + This method attempts to retrieve the non-default values of the + generation config for this model. + + The generation config can contain information about special tokens, as + well as sampling parameters, which is why this method exists separately + from `get_diff_sampling_param`. + + Returns: + A dictionary containing the non-default generation config. + """ + if self.generation_config in {"auto", "vllm"}: + config = try_get_generation_config( + self.hf_config_path or self.model, + trust_remote_code=self.trust_remote_code, + revision=self.revision, + ) + else: + config = try_get_generation_config( + self.generation_config, + trust_remote_code=self.trust_remote_code, + ) + + if config is None: + return {} + + return config.to_diff_dict() + + def get_diff_sampling_param(self) -> dict[str, Any]: + """ + This method returns a dictionary containing the non-default sampling + parameters with `override_generation_config` applied.
+ + The default sampling parameters are: + + - vLLM's neutral defaults if `self.generation_config="vllm"` + - the model's defaults if `self.generation_config="auto"` + - as defined in `generation_config.json` if + `self.generation_config="path/to/generation_config/dir"` + + Returns: + A dictionary containing the non-default sampling parameters. + """ + if self.generation_config == "vllm": + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + + available_params = [ + "repetition_penalty", + "temperature", + "top_k", + "top_p", + "min_p", + "max_new_tokens", + ] + if any(p in config for p in available_params): + diff_sampling_param = { + p: config.get(p) + for p in available_params if config.get(p) is not None + } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") + else: + diff_sampling_param = {} + + if diff_sampling_param: + logger.warning_once( + "Default sampling parameters have been overridden by the " + "model's Hugging Face generation config recommended by the " + "model creator. If this is not intended, please relaunch " + "the vLLM instance with `--generation-config vllm`.") + return diff_sampling_param + + @property + def is_encoder_decoder(self) -> bool: + """Extract the HF encoder/decoder model flag.""" + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) + + @property + def is_multimodal_model(self) -> bool: + return self.multimodal_config is not None + + @property + def is_multimodal_raw_input_only_model(self) -> bool: + return self._model_info.supports_multimodal_raw_input_only + + @property + def is_cross_encoder(self) -> bool: + return (self._model_info.supports_cross_encoding + or self.convert_type == "classify") + + @property + def is_pp_supported(self) -> bool: + return self._model_info.supports_pp + + @property + def is_attention_free(self) -> bool: + return self._model_info.is_attention_free + + @property + def is_hybrid(self) -> bool: + return self._model_info.is_hybrid + + @property + def has_noops(self) -> bool: + return self._model_info.has_noops + + @property + def has_inner_state(self): + return self._model_info.has_inner_state + + @property + def is_v1_compatible(self) -> bool: + return not self._model_info.supports_v0_only + + @property + def use_mla(self) -> bool: + return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE + + @property + def is_matryoshka(self) -> bool: + return (bool(getattr(self.hf_config, "matryoshka_dimensions", None)) + or getattr(self.hf_config, "is_matryoshka", False)) + + @property + def matryoshka_dimensions(self): + return getattr(self.hf_config, "matryoshka_dimensions", None) + + @property + def use_pad_token(self) -> bool: + # cross_encoder models default to using pad_token. + # `llm as reranker` models default to not using pad_token. + return getattr(self.hf_config, "use_pad_token", True) + + @property + def head_dtype(self) -> torch.dtype: + """ + "head" refers to the last Linear layer(s) of an LLM, + such as the lm_head in a generation model, + or the score or classifier in a classification model. + + `head_dtype` currently only supports pooling models.\n + - The pooling model defaults to using fp32 head, + you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
+ """ + + head_dtype = _get_head_dtype(config=self.hf_config, + dtype=self.dtype, + runner_type=self.runner_type) + + if self.runner_type != "pooling" and head_dtype != self.dtype: + logger.warning_once( + "`head_dtype` currently only supports pooling models." + "fallback to model dtype [%s].", self.dtype) + return self.dtype + + if head_dtype not in current_platform.supported_dtypes: + logger.warning_once( + "The current platform does not support [%s] head dtype, " + "fallback to model dtype [%s].", head_dtype, self.dtype) + return self.dtype + + logger.debug_once("head dtype: %s", head_dtype) + return head_dtype + + def get_and_verify_max_len(self, max_model_len: int): + # Consider max_model_len in tokenizer_config only when + # pooling models use absolute position_embedding. + tokenizer_config = None + if (self.runner_type == "pooling" and getattr( + self.hf_config, "position_embedding_type", "") == "absolute"): + tokenizer_config = try_get_tokenizer_config( + self.tokenizer, + trust_remote_code=self.trust_remote_code, + revision=self.tokenizer_revision) + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + tokenizer_config=tokenizer_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window=self.get_sliding_window(), + spec_target_max_model_len=self.spec_target_max_model_len, + encoder_config=self.encoder_config) + logger.info("Using max model len %s", max_model_len) + return max_model_len + + +def get_served_model_name(model: str, + served_model_name: Optional[Union[str, list[str]]]): + """ + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. + For cases where the input is either an empty string or an + empty list, the fallback is to use `self.model`. 
+ """ + if not served_model_name: + return model + if isinstance(served_model_name, list): + return served_model_name[0] + return served_model_name + + +# Some model suffixes are based on auto classes from Transformers: +# https://huggingface.co/docs/transformers/en/model_doc/auto +# NOTE: Items higher on this list priority over lower ones +_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ + ("ForCausalLM", ("generate", "none")), + ("ForConditionalGeneration", ("generate", "none")), + ("ChatModel", ("generate", "none")), + ("LMHeadModel", ("generate", "none")), + ("ForTextEncoding", ("pooling", "embed")), + ("EmbeddingModel", ("pooling", "embed")), + ("ForSequenceClassification", ("pooling", "classify")), + ("ForAudioClassification", ("pooling", "classify")), + ("ForImageClassification", ("pooling", "classify")), + ("ForVideoClassification", ("pooling", "classify")), + ("ClassificationModel", ("pooling", "classify")), + ("ForRewardModeling", ("pooling", "reward")), + ("RewardModel", ("pooling", "reward")), + # Let other `*Model`s take priority + ("Model", ("pooling", "embed")), +] + + +def iter_architecture_defaults(): + yield from _SUFFIX_TO_DEFAULTS + + +def try_match_architecture_defaults( + architecture: str, + *, + runner_type: Optional[RunnerType] = None, + convert_type: Optional[ConvertType] = None, +) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]: + for suffix, (default_runner_type, + default_convert_type) in iter_architecture_defaults(): + if ((runner_type is None or runner_type == default_runner_type) and + (convert_type is None or convert_type == default_convert_type) + and architecture.endswith(suffix)): + return suffix, (default_runner_type, default_convert_type) + + return None + + +_STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.float16, + "float16": torch.float16, + "float": torch.float32, + "float32": torch.float32, + "bfloat16": torch.bfloat16, +} + +# model_type -> reason +_FLOAT16_NOT_SUPPORTED_MODELS = { + "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3_text": + "Numerical instability. Please use bfloat16 or float32 instead.", + "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", + "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", +} + + +def _is_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 + return False + + return True + + +def _check_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: + reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] + raise ValueError(f"The model type {model_type!r} " + f"does not support float16. Reason: {reason}") + + return True + + +def _find_dtype( + model_id: str, + config: PretrainedConfig, + *, + revision: Optional[str], +): + # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct + # because config.torch_dtype can be None. 
+ config_dtype = getattr(config, "torch_dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define torch_dtype + if config_dtype is None: + config_dtype = getattr(config.get_text_config(), "torch_dtype", None) + if config_dtype is None and hasattr(config, "vision_config"): + config_dtype = getattr(config.vision_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "encoder_config"): + config_dtype = getattr(config.encoder_config, "torch_dtype", None) + + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + + if config_dtype is None: + config_dtype = torch.float32 + + return config_dtype + + +def _resolve_auto_dtype( + model_type: str, + config_dtype: torch.dtype, + *, + is_pooling_model: bool, +): + from vllm.platforms import current_platform + + supported_dtypes = [ + dtype for dtype in current_platform.supported_dtypes + if _is_valid_dtype(model_type, dtype) + ] + + if is_pooling_model and torch.float16 in supported_dtypes: + preferred_dtype = torch.float16 + else: + preferred_dtype = supported_dtypes[0] + + # Downcast for float32 models + if config_dtype == torch.float32: + config_dtype = preferred_dtype + + if config_dtype in supported_dtypes: + return config_dtype + + # Ensure device compatibility + device_name = current_platform.get_device_name() + device_capability = current_platform.get_device_capability() + + if device_capability is None: + device_str = f"{device_name!r}" + else: + version_str = device_capability.as_version_str() + device_str = f"{device_name!r} (with compute capability {version_str})" + + logger.warning( + "Your device %s doesn't support %s. " + "Falling back to %s for compatibility.", + device_str, + config_dtype, + preferred_dtype, + ) + + return preferred_dtype + + +def _get_and_verify_dtype( + model_id: str, + config: PretrainedConfig, + dtype: Union[str, torch.dtype], + *, + is_pooling_model: bool, + revision: Optional[str] = None, +) -> torch.dtype: + config_dtype = _find_dtype(model_id, config, revision=revision) + model_type = config.model_type + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + # Set default dtype from model config + torch_dtype = _resolve_auto_dtype( + model_type, + config_dtype, + is_pooling_model=is_pooling_model, + ) + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype!r}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + _check_valid_dtype(model_type, torch_dtype) + + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + else: + # Casting between float16 and bfloat16 is allowed with a warning. 
+ logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + return torch_dtype + + +def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype, + runner_type: str) -> torch.dtype: + head_dtype: Optional[Union[str, + torch.dtype]] = getattr(config, "head_dtype", + None) + + if head_dtype == "model": + return dtype + elif isinstance(head_dtype, str): + head_dtype = head_dtype.lower() + if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {head_dtype!r}") + return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] + elif isinstance(head_dtype, torch.dtype): + return head_dtype + elif head_dtype is None: + if torch.float32 not in current_platform.supported_dtypes: + return dtype + if runner_type == "pooling": + return torch.float32 + return dtype + else: + raise ValueError(f"Unknown dtype: {head_dtype}") + + +def _get_and_verify_max_len( + hf_config: PretrainedConfig, + tokenizer_config: Optional[dict], + max_model_len: Optional[int], + disable_sliding_window: bool, + sliding_window: Optional[int], + spec_target_max_model_len: Optional[int] = None, + encoder_config: Optional[Any] = None, +) -> int: + """Get and verify the model's maximum length.""" + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Command-R + "model_max_length", + # Whisper + "max_target_positions", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + # Choose the smallest "max_length" from the possible keys + max_len_key = None + for key in possible_keys: + max_len = getattr(hf_config, key, None) + if max_len is not None: + max_len_key = key if max_len < derived_max_model_len \ + else max_len_key + derived_max_model_len = min(derived_max_model_len, max_len) + # For Command-R / Cohere, Cohere2 / Aya Vision models + if tmp_max_len := getattr(hf_config, "model_max_length", None): + max_len_key = "model_max_length" + derived_max_model_len = tmp_max_len + + # If sliding window is manually disabled, max_length should be less + # than the sliding window length in the model config. + if (disable_sliding_window and sliding_window is not None + and sliding_window < derived_max_model_len): + max_len_key = "sliding_window" + derived_max_model_len = sliding_window + + # Consider model_max_length in tokenizer_config + if tokenizer_config: + tokenizer_model_max_length = tokenizer_config.get( + "model_max_length", derived_max_model_len) + derived_max_model_len = min(derived_max_model_len, + tokenizer_model_max_length) + + # If none of the keys were found in the config, use a default and + # log a warning. + if derived_max_model_len == float("inf"): + if max_model_len is not None: + # If max_model_len is specified, we use it. + return max_model_len + + if spec_target_max_model_len is not None: + # If this is a speculative draft model, we use the max model len + # from the target model. + return spec_target_max_model_len + + default_max_len = 2048 + logger.warning( + "The model's config.json does not contain any of the following " + "keys to determine the original maximum length of the model: " + "%s. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) + derived_max_model_len = default_max_len + + rope_scaling = getattr(hf_config, "rope_scaling", None) + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE + # scaling, so we skip applying the scaling factor again. 
+ if rope_scaling is not None and "gemma3" not in hf_config.model_type: + # No need to consider "type" key because of patch_rope_scaling when + # loading HF config + rope_type = rope_scaling["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + if disable_sliding_window: + # TODO(robertgshaw): Find a model that supports rope_scaling + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "with rope_scaling. Please raise an issue so we can " + "investigate.") + + # NOTE: rope_type == "default" does not define factor + # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + scaling_factor = rope_scaling.get("factor", 1.0) + + if rope_type == "yarn": + derived_max_model_len = rope_scaling[ + "original_max_position_embeddings"] + derived_max_model_len *= scaling_factor + + if encoder_config and "max_seq_length" in encoder_config: + derived_max_model_len = encoder_config["max_seq_length"] + + # If the user specified a max length, make sure it is smaller than the + # derived length from the HF model config. + if max_model_len is None: + max_model_len = int(derived_max_model_len) + if current_platform.is_tpu(): + logger.warning( + "--max-model-len is not specified, " + "so it currently uses the model's default length %s, " + "which might be too large. " + "Please set --max-model-len based on your " + "request input length and output length, to avoid " + "unnecessary degradation.", max_model_len) + elif max_model_len > derived_max_model_len: + # Some models might have a separate key for specifying model_max_length + # that will be bigger than derived_max_model_len. We compare user input + # with model_max_length and allow this override when it's smaller. + model_max_length = getattr(hf_config, "model_max_length", None) + if model_max_length is not None and max_model_len <= model_max_length: + if disable_sliding_window: + # TODO(robertgshaw): Find a model that has model_max_length + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models " + "with model_max_length in the config. Please raise an issue " + "so we can investigate.") + else: + msg = ( + f"User-specified max_model_len ({max_model_len}) is greater " + f"than the derived max_model_len ({max_len_key}=" + f"{derived_max_model_len} or model_max_length=" + f"{model_max_length} in model's config.json).") + warning = ( + "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme " + "caution. If the model uses relative position encoding (RoPE), " + "positions exceeding derived_max_model_len lead to nan. If the " + "model uses absolute position encoding, positions exceeding " + "derived_max_model_len will cause a CUDA array out-of-bounds " + "error.") + if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN: + logger.warning_once("%s %s", msg, warning) + else: + raise ValueError( + f"{msg} To allow overriding this maximum, set " + f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
{warning}") + return int(max_model_len) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 93002012799ab..f0f67bab9d6ff 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -3,7 +3,7 @@ import hashlib from dataclasses import field -from typing import TYPE_CHECKING, Any, Literal, Optional, Union +from typing import Any, Literal, Optional, Union from pydantic import SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -15,13 +15,9 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS) -if TYPE_CHECKING: - from vllm.config import RunnerType -else: - RunnerType = Any - logger = init_logger(__name__) +RunnerType = Literal["generate", "pooling", "draft"] PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] diff --git a/vllm/config/utils.py b/vllm/config/utils.py index db8c05ef8be4a..91e61b3302738 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import inspect +import textwrap from dataclasses import MISSING, Field, field, fields, is_dataclass -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar + +import regex as re if TYPE_CHECKING: from _typeshed import DataclassInstance @@ -45,3 +50,96 @@ def get_field(cls: ConfigType, name: str) -> Field: return field(default=default) raise ValueError( f"{cls.__name__}.{name} must have a default value or default factory.") + + +def contains_object_print(text: str) -> bool: + """ + Check if the text looks like a printed Python object, e.g. + contains any substring matching the pattern: "at 0xFFFFFFF>" + We match against 0x followed by 2-16 hex chars (there's + a max of 16 on a 64-bit system). + + Args: + text (str): The text to check + + Returns: + result (bool): `True` if a match is found, `False` otherwise. + """ + pattern = r'at 0x[a-fA-F0-9]{2,16}>' + match = re.search(pattern, text) + return match is not None + + +def assert_hashable(text: str) -> bool: + if not contains_object_print(text): + return True + raise AssertionError( + f"vLLM tried to hash some configs that may have Python objects ids " + f"in them. This is a bug, please file an issue. " + f"Text being hashed: {text}") + + +def get_attr_docs(cls: type[Any]) -> dict[str, str]: + """ + Get any docstrings placed after attribute assignments in a class body. + + https://davidism.com/mit-license/ + """ + + def pairwise(iterable): + """ + Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise + + Can be removed when Python 3.9 support is dropped. + """ + iterator = iter(iterable) + a = next(iterator, None) + + for b in iterator: + yield a, b + a = b + + try: + cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] + except (OSError, KeyError, TypeError): + # HACK: Python 3.13+ workaround - set missing __firstlineno__ + # Workaround can be removed after we upgrade to pydantic==2.12.0 + with open(inspect.getfile(cls)) as f: + for i, line in enumerate(f): + if f"class {cls.__name__}" in line and ":" in line: + cls.__firstlineno__ = i + 1 + break + cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] + + if not isinstance(cls_node, ast.ClassDef): + raise TypeError("Given object was not a class.") + + out = {} + + # Consider each pair of nodes. 
+ for a, b in pairwise(cls_node.body): + # Must be an assignment then a constant string. + if (not isinstance(a, (ast.Assign, ast.AnnAssign)) + or not isinstance(b, ast.Expr) + or not isinstance(b.value, ast.Constant) + or not isinstance(b.value.value, str)): + continue + + doc = inspect.cleandoc(b.value.value) + + # An assignment can have multiple targets (a = b = v), but an + # annotated assignment only has one target. + targets = a.targets if isinstance(a, ast.Assign) else [a.target] + + for target in targets: + # Must be assigning to a plain name. + if not isinstance(target, ast.Name): + continue + + out[target.id] = doc + + return out + + +def is_init_field(cls: ConfigType, name: str) -> bool: + return next(f for f in fields(cls) if f.name == name).init diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 27462b8fa0dad..ecf4e486a016f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,11 +27,11 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, EPLBConfig, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, - ModelDType, ModelImpl, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, StructuredOutputsConfig, - TaskOption, TokenizerMode, VllmConfig, get_attr_docs) + ModelDType, ObservabilityConfig, ParallelConfig, + PoolerConfig, PrefixCachingHashAlgo, RunnerOption, + SchedulerConfig, SchedulerPolicy, SpeculativeConfig, + StructuredOutputsConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs) from vllm.config.multimodal import MMCacheType, MultiModalConfig from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.utils import get_field @@ -548,7 +548,6 @@ class EngineArgs: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", - choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) @@ -593,9 +592,7 @@ class EngineArgs: **model_kwargs["override_generation_config"]) model_group.add_argument("--enable-sleep-mode", **model_kwargs["enable_sleep_mode"]) - model_group.add_argument("--model-impl", - choices=[f.value for f in ModelImpl], - **model_kwargs["model_impl"]) + model_group.add_argument("--model-impl", **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) model_group.add_argument("--logits-processors", diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 0c2441a6db44d..bd1773c753a93 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -13,8 +13,7 @@ from torch import nn from typing_extensions import assert_never from vllm.attention import Attention -from vllm.config import (ModelConfig, ModelImpl, VllmConfig, - set_current_vllm_config) +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.quantization.base_config import ( @@ -176,8 +175,8 @@ def get_model_architecture( ) if arch == model_config._get_transformers_backend_cls(): - assert model_config.model_impl != ModelImpl.VLLM - if model_config.model_impl == 
ModelImpl.AUTO: + assert model_config.model_impl != "vllm" + if model_config.model_impl == "auto": logger.warning_once( "%s has no vLLM implementation, falling back to Transformers " "implementation. Some features may not be supported and " diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1382fd9e93ea3..76f2bd087624c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -19,7 +19,7 @@ from typing import Callable, Optional, TypeVar, Union import torch.nn as nn import transformers -from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults, +from vllm.config import (ModelConfig, iter_architecture_defaults, try_match_architecture_defaults) from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( @@ -587,7 +587,7 @@ class _ModelRegistry: if model_module is not None: break else: - if model_config.model_impl != ModelImpl.TRANSFORMERS: + if model_config.model_impl != "transformers": return None raise ValueError( @@ -598,7 +598,7 @@ class _ModelRegistry: "'auto_map' (relevant if the model is custom).") if not model_module.is_backend_compatible(): - if model_config.model_impl != ModelImpl.TRANSFORMERS: + if model_config.model_impl != "transformers": return None raise ValueError( @@ -644,20 +644,20 @@ class _ModelRegistry: raise ValueError("No model architectures are specified") # Require transformers impl - if model_config.model_impl == ModelImpl.TRANSFORMERS: + if model_config.model_impl == "transformers": arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) - elif model_config.model_impl == ModelImpl.TERRATORCH: + elif model_config.model_impl == "terratorch": model_info = self._try_inspect_model_cls("Terratorch") return (model_info, "Terratorch") # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO + and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") == "none"): arch = self._try_resolve_transformers(architectures[0], model_config) @@ -674,7 +674,7 @@ class _ModelRegistry: # Fallback to transformers impl (before resolving runner_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO): + and model_config.model_impl == "auto"): arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: @@ -695,14 +695,14 @@ class _ModelRegistry: raise ValueError("No model architectures are specified") # Require transformers impl - if model_config.model_impl == ModelImpl.TRANSFORMERS: + if model_config.model_impl == "transformers": arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) - elif model_config.model_impl == ModelImpl.TERRATORCH: + elif model_config.model_impl == "terratorch": arch = "Terratorch" model_cls = self._try_load_model_cls(arch) if model_cls is not None: @@ -710,7 +710,7 @@ class _ModelRegistry: # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO + and model_config.model_impl == "auto" and getattr(model_config, "convert_type", "none") 
== "none"): arch = self._try_resolve_transformers(architectures[0], model_config) @@ -727,7 +727,7 @@ class _ModelRegistry: # Fallback to transformers impl (before resolving runner_type) if (all(arch not in self.models for arch in architectures) - and model_config.model_impl == ModelImpl.AUTO): + and model_config.model_impl == "auto"): arch = self._try_resolve_transformers(architectures[0], model_config) if arch is not None: diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index cc5653b10ec1d..747e52f2e5892 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -29,15 +29,12 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__( - self, - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: + def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: super().__init__() self.logprobs_mode = logprobs_mode # flashinfer optimization does not apply if intermediate # logprobs/logits after top_k/top_p need to be returned - if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, - LogprobsMode.PROCESSED_LOGPROBS + if logprobs_mode not in ("processed_logits", "processed_logprobs" ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ @@ -90,9 +87,9 @@ class TopKTopPSampler(nn.Module): """ logits = self.apply_top_k_top_p(logits, k, p) logits_to_return = None - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + if self.logprobs_mode == "processed_logits": logits_to_return = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + elif self.logprobs_mode == "processed_logprobs": logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators), logits_to_return @@ -115,7 +112,7 @@ class TopKTopPSampler(nn.Module): "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) assert self.logprobs_mode not in ( - LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + "processed_logits", "processed_logprobs" ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 546531a91610f..fa2a6e590f22d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -60,8 +60,7 @@ class Sampler(nn.Module): 9. Return the final `SamplerOutput`. """ - def __init__(self, - logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): + def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): super().__init__() self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() @@ -78,9 +77,9 @@ class Sampler(nn.Module): # is used for sampling (after penalties and temperature scaling). num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: + if self.logprobs_mode == "raw_logprobs": raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: + elif self.logprobs_mode == "raw_logits": raw_logprobs = logits.clone() # Use float32 for the logits. 
@@ -156,9 +155,9 @@ class Sampler(nn.Module): if sampling_metadata.all_greedy: processed_logprobs = None if sampling_metadata.max_num_logprobs is not None: - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + if self.logprobs_mode == "processed_logits": processed_logprobs = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + elif self.logprobs_mode == "processed_logprobs": processed_logprobs = self.compute_logprobs(logits) return greedy_sampled, processed_logprobs From ce75e153735363eca01ce67b9f69e7b9ea440c63 Mon Sep 17 00:00:00 2001 From: samzong <samzong.lu@gmail.com> Date: Sat, 20 Sep 2025 00:36:52 +0800 Subject: [PATCH 468/584] refactor(benchmarks): add type annotations to wait_for_endpoint parameters (#25218) Signed-off-by: samzong <samzong.lu@gmail.com> --- vllm/benchmarks/lib/endpoint_request_func.py | 15 +++++++++++++-- vllm/benchmarks/lib/ready_checker.py | 5 +++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 066b8fe834380..725b7df8b1871 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -8,8 +8,9 @@ import os import sys import time import traceback +from collections.abc import Awaitable from dataclasses import dataclass, field -from typing import Optional, Union +from typing import Optional, Protocol, Union import aiohttp from tqdm.asyncio import tqdm @@ -92,6 +93,16 @@ class RequestFuncOutput: start_time: float = 0.0 +class RequestFunc(Protocol): + def __call__( + self, + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, + ) -> Awaitable[RequestFuncOutput]: + ... + + async def async_request_openai_completions( request_func_input: RequestFuncInput, session: aiohttp.ClientSession, @@ -507,7 +518,7 @@ async def async_request_openai_embeddings( # TODO: Add more request functions for different API protocols. 
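[Editor's note] Any coroutine with the signature below satisfies the new `RequestFunc` protocol, which keeps the registry extensible. A hedged sketch; the backend name and body are hypothetical:

```python
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

async def async_request_my_backend(
    request_func_input: "RequestFuncInput",
    session: aiohttp.ClientSession,
    pbar: Optional[tqdm] = None,
) -> "RequestFuncOutput":
    # Issue the HTTP request here and populate a RequestFuncOutput.
    ...

# Hypothetical registration alongside the built-in entries:
# ASYNC_REQUEST_FUNCS["my-backend"] = async_request_my_backend
```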
-ASYNC_REQUEST_FUNCS = { +ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "vllm": async_request_openai_completions, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py index 7e836158386a9..87fc16b55012e 100644 --- a/vllm/benchmarks/lib/ready_checker.py +++ b/vllm/benchmarks/lib/ready_checker.py @@ -8,11 +8,12 @@ import time import aiohttp from tqdm.asyncio import tqdm -from .endpoint_request_func import RequestFuncInput, RequestFuncOutput +from .endpoint_request_func import (RequestFunc, RequestFuncInput, + RequestFuncOutput) async def wait_for_endpoint( - request_func, + request_func: RequestFunc, test_input: RequestFuncInput, session: aiohttp.ClientSession, timeout_seconds: int = 600, From 7ac67ea5255c764e87bdfc5c712bfaa35f491764 Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Fri, 19 Sep 2025 19:53:45 +0300 Subject: [PATCH 469/584] [KV offload][3/N] Add worker-side CPU support (#21448) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- tests/v1/kv_offload/test_cpu_gpu.py | 177 +++++++++++++++++++++++++++ vllm/v1/kv_offload/worker/cpu_gpu.py | 171 ++++++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 tests/v1/kv_offload/test_cpu_gpu.py create mode 100644 vllm/v1/kv_offload/worker/cpu_gpu.py diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py new file mode 100644 index 0000000000000..0edb9513e3ff9 --- /dev/null +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import time + +import pytest +import torch + +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.attention.backends.flashinfer import FlashInferBackend +from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler + +NUM_GPU_BLOCKS = [64] +NUM_CPU_BLOCKS = [256] +GPU_BLOCK_SIZES = [16] +GPU_BLOCKS_PER_CPU_BLOCK = [1, 3] +HEAD_SIZES = [64] +NUM_HEADS = [8] +NUM_LAYERS = [4] +DTYPES = [torch.bfloat16] +SEEDS = [0] +CUDA_DEVICES = ['cuda:0'] +NUM_MAPPINGS = [3] + + +@pytest.mark.parametrize("gpu_to_cpu", [True, False]) +@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES) +@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK) +@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS) +@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS) +@pytest.mark.parametrize("num_layers", NUM_LAYERS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_transfer( + gpu_to_cpu: bool, + num_mappings: int, + head_size: int, + num_heads: int, + gpu_block_size: int, + gpu_blocks_per_cpu_block: int, + num_gpu_blocks: int, + num_cpu_blocks: int, + num_layers: int, + dtype: torch.dtype, + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + + # create per-layer GPU KV caches + attn_backends_list = [ + FlashAttentionBackend, FlashInferBackend, 
FlashAttnMLABackend + ] + + gpu_caches = {} + attn_backends = {} + for i in range(num_layers): + layer_name = f'layer {i}' + + attn_backend = attn_backends_list[i % len(attn_backends_list)] + attn_backends[layer_name] = attn_backend + + gpu_cache_shape = attn_backend.get_kv_cache_shape( + num_gpu_blocks, gpu_block_size, num_heads, head_size) + gpu_caches[layer_name] = torch.rand(gpu_cache_shape, + dtype=dtype, + device=device) + + # create handler + cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size + handler = CpuGpuOffloadingHandler(attn_backends=attn_backends, + gpu_block_size=gpu_block_size, + cpu_block_size=cpu_block_size, + num_cpu_blocks=num_cpu_blocks, + gpu_caches=gpu_caches) + + # select block mappings + gpu_blocks = random.sample(range(num_gpu_blocks), + num_mappings * gpu_blocks_per_cpu_block) + cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings) + + # convert cpu blocks to gpu block size + cpu_blocks_in_gpu_block_size = [] + for cpu_block in cpu_blocks: + base_block_id = cpu_block * gpu_blocks_per_cpu_block + for i in range(gpu_blocks_per_cpu_block): + cpu_blocks_in_gpu_block_size.append(i + base_block_id) + + # maybe skip a GPU block to test writing to the middle of a CPU block + if gpu_to_cpu: + gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:] + cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[ + gpu_blocks_per_cpu_block - 1:] + + # set transfer direction + if gpu_to_cpu: + src_kv_caches = handler.gpu_tensors + dst_kv_caches = handler.cpu_tensors + src_spec_class = GPULoadStoreSpec + dst_spec_class = CPULoadStoreSpec + src_blocks = gpu_blocks + dst_blocks = cpu_blocks + src_blocks_in_gpu_block_size = gpu_blocks + dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size + dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block + else: + src_kv_caches = handler.cpu_tensors + dst_kv_caches = handler.gpu_tensors + src_spec_class = CPULoadStoreSpec + dst_spec_class = GPULoadStoreSpec + src_blocks = cpu_blocks + dst_blocks = gpu_blocks + src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size + dst_blocks_in_gpu_block_size = gpu_blocks + dst_size_in_gpu_blocks = num_gpu_blocks + + # build dst -> src mapping + dst_to_src = {} + for src_block, dst_block in zip(src_blocks_in_gpu_block_size, + dst_blocks_in_gpu_block_size): + dst_to_src[dst_block] = src_block + + # build transfer specs + src_spec = src_spec_class(src_blocks) + dst_spec = dst_spec_class(dst_blocks) + + # clone src and dst tensors before transfer + orig_src_caches = [x.clone() for x in src_kv_caches] + orig_dst_caches = [x.clone() for x in dst_kv_caches] + + # call transfer function + assert handler.transfer_async(1, (src_spec, dst_spec)) + assert set(handler.transfer_events.keys()) == {1} + + # wait for transfer to complete + end_time = time.time() + 10 + while time.time() < end_time: + finished = handler.get_finished() + if finished: + assert finished == [(1, True)] + break + time.sleep(0.1) + + # verify src tensors did not change + for orig_tensor, tensor in zip(orig_src_caches, src_kv_caches): + assert torch.equal(orig_tensor, tensor) + + # verify dst tensors + for dst_block in range(dst_size_in_gpu_blocks): + src_block_candidate = dst_to_src.get(dst_block) + for src_cache, dst_cache, orig_dst_cache, kv_dim in zip( + src_kv_caches, dst_kv_caches, orig_dst_caches, + handler.kv_dim_before_num_blocks): + if kv_dim: + # iterate over key, value + for i in range(2): + if src_block_candidate is not None: + expected_value = src_cache[i][src_block_candidate] + else: + 
expected_value = orig_dst_cache[i][dst_block] + torch.testing.assert_close(dst_cache[i][dst_block].cpu(), + expected_value.cpu()) + else: + if src_block_candidate is not None: + expected_value = src_cache[src_block_candidate] + else: + expected_value = orig_dst_cache[dst_block] + torch.testing.assert_close(dst_cache[dst_block].cpu(), + expected_value.cpu()) diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py new file mode 100644 index 0000000000000..556c29247e5e7 --- /dev/null +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import numpy as np +import torch + +from vllm import _custom_ops as ops +from vllm.attention import AttentionBackend +from vllm.logger import init_logger +from vllm.utils import is_pin_memory_available +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, + TransferResult, TransferSpec) + +logger = init_logger(__name__) + + +def expand_block_ids(block_ids: np.ndarray, + block_size_factor: int, + output: np.ndarray, + skip_count: int = 0): + """ + Convert a list of block IDs to a list of matching block ids, + assuming each block is composed of actual block_size_factor blocks. + Outputs to output tensor. + The first skip_count blocks will be skipped. + Note that skip_count must be less than block_size_factor. + + For example, if block_ids = [0, 1, 3] and block_size_factor = 4, + then it yields [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15] + since 0 maps to [0, 1, 2, 3] + 1 maps to [4, 5, 6, 7] + and 3 maps to [12, 13, 14, 15] + """ + assert skip_count < block_size_factor + + first_range = np.arange(skip_count, block_size_factor) + full_range = np.arange(0, block_size_factor) + + output_idx = 0 + for i, block_id in enumerate(block_ids): + base_block_id = block_id * block_size_factor + indices = first_range if i == 0 else full_range + output_end_idx = output_idx + len(indices) + output[output_idx:output_end_idx] = base_block_id + indices + output_idx = output_end_idx + + +class CpuGpuOffloadingHandler(OffloadingHandler): + + def __init__(self, gpu_block_size: int, cpu_block_size: int, + num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]]): + assert cpu_block_size % gpu_block_size == 0 + self.block_size_factor = cpu_block_size // gpu_block_size + + # cuda streams for gpu->cpu and cpu->gpu + self.d2h_stream = torch.cuda.Stream() + self.h2d_stream = torch.cuda.Stream() + + # job_id -> transfer cuda event + self.transfer_events: dict[int, torch.cuda.Event] = {} + # list of cuda events available for re-use + self.events_pool: list[torch.cuda.Event] = [] + + pin_memory = is_pin_memory_available() + + # allocate cpu tensors + logger.info("Allocating %d CPU tensors...", len(gpu_caches)) + self.gpu_tensors: list[torch.Tensor] = [] + self.cpu_tensors: list[torch.Tensor] = [] + self.kv_dim_before_num_blocks: list[bool] = [] + for layer_name, gpu_tensor in gpu_caches.items(): + self.gpu_tensors.append(gpu_tensor) + + gpu_shape = gpu_tensor.shape + test_shape = attn_backends[layer_name].get_kv_cache_shape( + num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256) + if test_shape[0] == 1234: + # shape is (num_blocks, ...) + num_blocks_idx = 0 + self.kv_dim_before_num_blocks.append(False) + else: + # shape should be (2, num_blocks, ...) 
+ assert test_shape[0] == 2 + assert test_shape[1] == 1234 + assert gpu_shape[0] == 2 + + num_blocks_idx = 1 + self.kv_dim_before_num_blocks.append(True) + + cpu_shape = list(gpu_shape) + cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor + + logger.debug("Allocating CPU tensor of shape %r", cpu_shape) + self.cpu_tensors.append( + torch.zeros(cpu_shape, + dtype=gpu_tensor.dtype, + device="cpu", + pin_memory=pin_memory)) + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + src_spec, dst_spec = spec + if isinstance(src_spec, CPULoadStoreSpec): + assert isinstance(dst_spec, GPULoadStoreSpec) + stream = self.h2d_stream + src_tensors = self.cpu_tensors + dst_tensors = self.gpu_tensors + src_block_size_factor = self.block_size_factor + dst_block_size_factor = 1 + else: + assert isinstance(src_spec, GPULoadStoreSpec) + assert isinstance(dst_spec, CPULoadStoreSpec) + stream = self.d2h_stream + src_tensors = self.gpu_tensors + dst_tensors = self.cpu_tensors + src_block_size_factor = 1 + dst_block_size_factor = self.block_size_factor + + src_blocks = src_spec.block_ids + dst_blocks = dst_spec.block_ids + assert src_blocks.ndim == 1 + assert dst_blocks.ndim == 1 + + dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor) + src_sub_block_count = src_blocks.size * src_block_size_factor + + assert ( + src_sub_block_count == dst_blocks.size * dst_block_size_factor - + dst_sub_blocks_to_skip) + + src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64) + expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0]) + expand_block_ids(dst_blocks, + dst_block_size_factor, + src_to_dst[:, 1], + skip_count=dst_sub_blocks_to_skip) + src_to_dst_tensor = torch.from_numpy(src_to_dst) + + event = self.events_pool.pop() if self.events_pool \ + else torch.cuda.Event() + with torch.cuda.stream(stream): + for src_tensor, dst_tensor, kv_dim in zip( + src_tensors, dst_tensors, self.kv_dim_before_num_blocks): + if kv_dim: + src_key_cache = src_tensor[0] + dst_key_cache = dst_tensor[0] + ops.swap_blocks(src_key_cache, dst_key_cache, + src_to_dst_tensor) + src_value_cache = src_tensor[1] + dst_value_cache = dst_tensor[1] + ops.swap_blocks(src_value_cache, dst_value_cache, + src_to_dst_tensor) + else: + ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor) + event.record(stream) + + self.transfer_events[job_id] = event + + # success + return True + + def get_finished(self) -> list[TransferResult]: + results: list[TransferResult] = [] + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append((job_id, True)) + self.events_pool.append(event) + for job_id, _ in results: + del self.transfer_events[job_id] + return results From 6c117cff7d0110c74f97f68cda009595a2fdae5e Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 20 Sep 2025 01:15:19 +0800 Subject: [PATCH 470/584] [Frontend] Pass API server count to each process (#23717) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../kernels/benchmark_w8a8_block_fp8.py | 2 +- examples/others/tensorize_vllm_model.py | 9 +-- .../test_api_server_process_manager.py | 2 +- tests/v1/test_external_lb_dp.py | 52 +++++++++++++++-- tests/v1/test_hybrid_lb_dp.py | 54 ++++++++++++++++-- tests/v1/test_internal_lb_dp.py | 57 ++++++++++++++++--- vllm/config/parallel.py | 25 ++++++++ vllm/engine/arg_utils.py | 9 ++- vllm/entrypoints/cli/serve.py | 18 ++---- vllm/entrypoints/openai/api_server.py | 38 ++++++++++--- vllm/multimodal/cache.py | 3 +- 
vllm/v1/engine/core_client.py | 3 +- 12 files changed, 221 insertions(+), 51 deletions(-) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index df2b713e46dc4..c6c8e0b0b936b 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -11,13 +11,13 @@ from datetime import datetime from typing import Any import torch -import triton from tqdm import tqdm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( _w8a8_block_fp8_matmul, ) from vllm.platforms import current_platform +from vllm.triton_utils import triton from vllm.utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 559c7c493aca2..2b7f0beab2277 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import dataclasses import json import logging import os @@ -327,12 +325,7 @@ def main(): if args.command == "serialize": - eng_args_dict = {f.name: getattr(args, f.name) for f in - dataclasses.fields(EngineArgs)} - - engine_args = EngineArgs.from_cli_args( - argparse.Namespace(**eng_args_dict) - ) + engine_args = EngineArgs.from_cli_args(args) input_dir = tensorizer_dir.rstrip('/') suffix = args.suffix if args.suffix else uuid.uuid4().hex diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index a993e24ff838a..34b05ad17b025 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): global WORKER_RUNTIME_SECONDS WORKER_RUNTIME_SECONDS = 0.5 - # Copy the args to avoid mutating the + # Copy the args to avoid mutating them args = api_server_args.copy() if not with_stats_update: diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/test_external_lb_dp.py index 4a5c47fead58f..862a76f3c4e2d 100644 --- a/tests/v1/test_external_lb_dp.py +++ b/tests/v1/test_external_lb_dp.py @@ -9,6 +9,7 @@ from contextlib import AsyncExitStack import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform @@ -70,6 +71,8 @@ class ExternalLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -127,11 +130,19 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args) as server_list: - yield server_list + server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -144,6 +155,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = 
requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_external_lb_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + print(f"Testing {i=}") + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/test_hybrid_lb_dp.py index 293b1257be6bb..552436f818d77 100644 --- a/tests/v1/test_hybrid_lb_dp.py +++ b/tests/v1/test_hybrid_lb_dp.py @@ -9,6 +9,7 @@ from contextlib import AsyncExitStack import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -92,6 +93,8 @@ class HybridLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -150,12 +153,20 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count, - default_server_args, DP_SIZE_LOCAL, - TP_SIZE) as server_list: - yield server_list + server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, DP_SIZE_LOCAL, + TP_SIZE) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest_asyncio.fixture @@ -168,6 +179,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]): ] +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_hybrid_dp_server_info(server_manager): + servers = server_manager.servers + api_server_count = server_manager.api_server_count + + for i, (server, _) in enumerate(servers): + print(f"Testing {i=}") + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(server) for _ in range(n_reqs) + ] + api_process_counts = [ + c["_api_process_count"] for c in parallel_configs + ] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio 
@pytest.mark.parametrize( "model_name", diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/test_internal_lb_dp.py index 2b031865cad76..e965645711ee6 100644 --- a/tests/v1/test_internal_lb_dp.py +++ b/tests/v1/test_internal_lb_dp.py @@ -10,6 +10,7 @@ from typing import Optional, cast import openai # use the official client for correctness check import pytest import pytest_asyncio +import requests from tests.utils import RemoteOpenAIServer from tests.v1.test_utils import check_request_balancing @@ -101,6 +102,8 @@ class MultinodeInternalLBServerManager: sargs, auto_port=False, env_dict={ + "VLLM_SERVER_DEV_MODE": + "1", current_platform.device_control_env_var: ",".join( str( @@ -214,7 +217,10 @@ class APIOnlyServerManager: self.model_name, api_server_args, auto_port=False, - env_dict={}) # No GPUs needed for API-only server + env_dict={ + "VLLM_SERVER_DEV_MODE": "1", + # No GPUs needed for API-only server + }) server.__enter__() print(f"API-only server started successfully with " f"{self.api_server_count} API servers") @@ -293,14 +299,21 @@ def default_server_args(): @pytest.fixture(scope="module", params=[1, 4]) -def servers(request, default_server_args): +def server_manager(request, default_server_args): api_server_count = request.param - with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, - api_server_count, - default_server_args, - DP_SIZE // NUM_NODES, - TP_SIZE) as server_list: - yield server_list + server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE, + api_server_count, + default_server_args, + DP_SIZE // NUM_NODES, + TP_SIZE) + + with server_manager: + yield server_manager + + +@pytest.fixture +def servers(server_manager): + return server_manager.servers @pytest.fixture(scope="module", params=[1, 4]) @@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer, yield client +def _get_parallel_config(server: RemoteOpenAIServer): + response = requests.get(server.url_for("server_info?config_format=json")) + response.raise_for_status() + + vllm_config = response.json()["vllm_config"] + return vllm_config["parallel_config"] + + +def test_multinode_dp_server_info(server_manager): + head_server = server_manager.servers[0][0] + api_server_count = server_manager.api_server_count + + # Each request will hit one of the API servers + # `n_reqs` is set so that there is a good chance each server + # receives at least one request + n_reqs = 2 * api_server_count * api_server_count + parallel_configs = [ + _get_parallel_config(head_server) for _ in range(n_reqs) + ] + api_process_counts = [c["_api_process_count"] for c in parallel_configs] + api_process_ranks = [c["_api_process_rank"] for c in parallel_configs] + + assert all(c == api_server_count + for c in api_process_counts), api_process_counts + assert all(0 <= r < api_server_count + for r in api_process_ranks), api_process_ranks + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 8e92e54a96780..37a41bf6de71a 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -193,6 +193,25 @@ class ParallelConfig: not change by dcp, it simply reuse the GPUs of TP group, and tp_size needs to be divisible by dcp_size.""" + _api_process_count: int = 1 + """ + The number of API processes initialized. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. 
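+
+    (In practice this equals `--api-server-count`; `run_multi_api_server`
+    copies that CLI value into the engine args before the config is built.)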
+ """ + + _api_process_rank: int = 0 + """ + The rank of this API process, or `-1` for engine core processes + under API server scale-out. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world @@ -428,6 +447,12 @@ class ParallelConfig: if self.distributed_executor_backend is None and self.world_size == 1: self.distributed_executor_backend = "uni" + if not -1 <= self._api_process_rank < self._api_process_count: + raise ValueError( + "Invalid value of `_api_process_rank`. " + f"Expected to be `-1` or `[0, {self._api_process_count})`, " + f"but found: {self._api_process_rank}") + @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ecf4e486a016f..7a4bb0d41d236 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -333,6 +333,8 @@ class EngineArgs: enable_eplb: bool = ParallelConfig.enable_eplb expert_placement_strategy: ExpertPlacementStrategy = \ ParallelConfig.expert_placement_strategy + _api_process_count: int = ParallelConfig._api_process_count + _api_process_rank: int = ParallelConfig._api_process_rank num_redundant_experts: int = EPLBConfig.num_redundant_experts eplb_window_size: int = EPLBConfig.window_size eplb_step_interval: int = EPLBConfig.step_interval @@ -952,7 +954,10 @@ class EngineArgs: # Get the list of attributes of this dataclass. attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. - engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) + engine_args = cls(**{ + attr: getattr(args, attr) + for attr in attrs if hasattr(args, attr) + }) return engine_args def create_model_config(self) -> ModelConfig: @@ -1366,6 +1371,8 @@ class EngineArgs: worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, decode_context_parallel_size=self.decode_context_parallel_size, + _api_process_count=self._api_process_count, + _api_process_rank=self._api_process_rank, ) speculative_config = self.create_speculative_config( diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 803a3e004656a..de47bf00932e0 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -135,23 +135,20 @@ def run_headless(args: argparse.Namespace): def run_multi_api_server(args: argparse.Namespace): assert not args.headless - num_api_servers = args.api_server_count + num_api_servers: int = args.api_server_count assert num_api_servers > 0 - orig_mm_processor_cache_gb = args.mm_processor_cache_gb - if num_api_servers > 1: setup_multiprocess_prometheus() - # Not compatible with API server scale-out - args.mm_processor_cache_gb = 0 - listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) + engine_args._api_process_count = num_api_servers + engine_args._api_process_rank = -1 + usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) - model_config = vllm_config.model_config if num_api_servers > 1: if not envs.VLLM_USE_V1: @@ -161,10 +158,6 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: - 
logger.warning("Multi-modal processor cache is disabled because " - "it is not compatible with `api_server_count > 1`.") - executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats @@ -221,9 +214,10 @@ def run_api_server_worker_proc(listen_address, client_config=None, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" + client_config = client_config or {} + server_index = client_config.get("client_index", 0) # Set process title and add process-specific prefix to stdout and stderr. - server_index = client_config.get("client_index", 0) if client_config else 0 set_process_title("APIServer", str(server_index)) decorate_logs() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 11031cd616d20..b8ba7e81ef5f6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,13 +17,14 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any, Callable, Optional +from typing import Annotated, Any, Callable, Literal, Optional import prometheus_client import pydantic import regex as re import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request +from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query, + Request) from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -166,6 +167,9 @@ async def build_async_engine_client( # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) + if client_config: + engine_args._api_process_count = client_config.get("client_count", 1) + engine_args._api_process_rank = client_config.get("client_index", 0) if disable_frontend_multiprocessing is None: disable_frontend_multiprocessing = bool( @@ -209,8 +213,12 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop("client_count") if client_config else 1 - client_index = client_config.pop("client_index") if client_config else 0 + + # Don't mutate the input client_config + client_config = dict(client_config) if client_config else {} + client_count = client_config.pop("client_count", 1) + client_index = client_config.pop("client_index", 0) + try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, @@ -956,9 +964,22 @@ if envs.VLLM_SERVER_DEV_MODE: logger.warning("SECURITY WARNING: Development endpoints are enabled! " "This should NOT be used in production!") + PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + @router.get("/server_info") - async def show_server_info(raw_request: Request): - server_info = {"vllm_config": str(raw_request.app.state.vllm_config)} + async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], + Query()] = "text", + ): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": + str(vllm_config) + if config_format == "text" else PydanticVllmConfig.dump_python( + vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. 
torch.dtype + } return JSONResponse(content=server_info) @router.post("/reset_prefix_cache") @@ -1856,8 +1877,6 @@ async def run_server_worker(listen_address, if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - server_index = client_config.get("client_index", 0) if client_config else 0 - # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: @@ -1873,7 +1892,8 @@ async def run_server_worker(listen_address, vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - logger.info("Starting vLLM API server %d on %s", server_index, + logger.info("Starting vLLM API server %d on %s", + vllm_config.parallel_config._api_process_rank, listen_address) shutdown_task = await serve_http( app, diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 297b4c7fa7fbd..642ec3fd7e3f7 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -494,7 +494,8 @@ def _enable_processor_cache( def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: parallel_config = vllm_config.parallel_config - supports_ipc_cache = (parallel_config.data_parallel_size == 1 + supports_ipc_cache = ((parallel_config._api_process_count == 1 + and parallel_config.data_parallel_size == 1) or parallel_config.data_parallel_external_lb) return supports_ipc_cache diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bb0f37c6e0264..a84b0e55105b2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -437,7 +437,7 @@ class MPClient(EngineCoreClient): self.engines_running = False self.stats_update_address: Optional[str] = None - if client_addresses is not None: + if client_addresses: # Engines are managed externally to this client. 
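             # (an empty dict is now treated the same as None, which is why
             # the check above tests truthiness rather than `is not None`)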
input_address = client_addresses["input_address"] output_address = client_addresses["output_address"] @@ -774,6 +774,7 @@ class AsyncMPClient(MPClient): client_addresses=client_addresses, ) + self.client_count = client_count self.client_index = client_index self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() From 2821986450bc31869714885ed4203650a42f3cb0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Sat, 20 Sep 2025 02:01:28 +0800 Subject: [PATCH 471/584] [Core] Modify the initialization parameters of the lora manager (#25249) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/test_lora_manager.py | 36 ++++++++++++++++++----- tests/lora/utils.py | 2 +- vllm/lora/{lora.py => lora_weights.py} | 0 vllm/lora/models.py | 2 +- vllm/lora/worker_manager.py | 23 ++++++++------- vllm/v1/worker/cpu_model_runner.py | 5 ++-- vllm/v1/worker/gpu_model_runner.py | 5 +--- vllm/v1/worker/lora_model_runner_mixin.py | 15 ++-------- vllm/v1/worker/tpu_model_runner.py | 4 +-- vllm/worker/model_runner.py | 11 ++----- 10 files changed, 51 insertions(+), 52 deletions(-) rename vllm/lora/{lora.py => lora_weights.py} (100%) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index a5802c108c6be..d7684fbf34abb 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,11 +8,12 @@ import torch from safetensors.torch import load_file from torch import nn +from vllm.config import ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA) -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager) from vllm.lora.peft_helper import PEFTHelper @@ -435,10 +436,19 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, target_modules=["layer1.dense1", "dense2"], lora_dtype=DEFAULT_DTYPE, ) + + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config, + lora_config=lora_config) + + vllm_config.scheduler_config.max_num_seqs = 4 + vllm_config.scheduler_config.max_num_batched_tokens = 2 worker_adapter_manager = LRUCacheWorkerLoRAManager( - 4, 2, - dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size, - lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + + worker_adapter_manager.max_num_seqs = 4 + worker_adapter_manager.max_num_batched_tokens = 2 + worker_adapter_manager.create_lora_manager(dummy_model) mapping = LoRAMapping([], []) @@ -517,10 +527,20 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE) - worker_adapter_manager = WorkerLoRAManager( - 4, 2, dummy_model_gate_up.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, device, - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config, + lora_config=lora_config) + + vllm_config.scheduler_config.max_num_seqs = 4 + vllm_config.scheduler_config.max_num_batched_tokens = 2 + + worker_adapter_manager = WorkerLoRAManager(vllm_config, device, + EMBEDDING_MODULES, + EMBEDDING_PADDING_MODULES) + 
worker_adapter_manager.vocab_size = ( + dummy_model_gate_up.unpadded_vocab_size - + lora_config.lora_extra_vocab_size) worker_adapter_manager.create_lora_manager(dummy_model_gate_up) dummy_lora_files = f"{tmp_path}/lora_adapter" diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 7cda90787b6f1..ab475904d4938 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -9,7 +9,7 @@ from typing import Optional, Union import torch from safetensors.torch import save_file -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights class DummyLoRAManager: diff --git a/vllm/lora/lora.py b/vllm/lora/lora_weights.py similarity index 100% rename from vllm/lora/lora.py rename to vllm/lora/lora_weights.py diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 25f90f2fa932b..9ea46be65cff3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -14,7 +14,7 @@ from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index e27b7d5fcf223..cdb2f86611d81 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union import torch -from vllm.config.lora import LoRAConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) @@ -27,25 +27,26 @@ class WorkerLoRAManager: def __init__( self, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, + vllm_config: VllmConfig, device: torch.device, embedding_modules: dict[str, str], embedding_padding_modules: list[str], lora_model_cls: type[LoRAModel] = LoRAModel, - max_position_embeddings: Optional[int] = None, ): self._lora_model_cls = lora_model_cls self.embedding_modules = embedding_modules self.embedding_padding_modules = embedding_padding_modules self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False - self.max_num_seqs = max_num_seqs - self.max_num_batched_tokens = max_num_batched_tokens - self.vocab_size = vocab_size - self.lora_config = lora_config - self.max_position_embeddings = max_position_embeddings + self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs + self.max_num_batched_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens) + self.vocab_size = vllm_config.model_config.get_vocab_size() + self.lora_config = vllm_config.lora_config + + # Use get_text_config() in case of multimodal models + text_config = vllm_config.model_config.hf_config.get_text_config() + + self.max_position_embeddings = text_config.max_position_embeddings self.device = device # Lazily initialized by create_lora_manager. 
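+        # (annotation-only declaration: reading the attribute before
+        # create_lora_manager() has run raises AttributeError)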
self._adapter_manager: LoRAModelManager diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index cd0f0af43e7e7..6a97f7ebc3fcd 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -107,9 +107,8 @@ class CPUModelRunner(GPUModelRunner): self.model = get_model(vllm_config=self.vllm_config) if self.lora_config: - self.model = self.load_lora_model(self.model, self.model_config, - self.scheduler_config, - self.lora_config, self.device) + self.model = self.load_lora_model(self.model, self.vllm_config, + self.device) def get_model(self) -> nn.Module: return self.model diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3ee2160a42ffe..9d0f26266f0c5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2552,10 +2552,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.model = model_loader.load_model( vllm_config=self.vllm_config, model_config=self.model_config) if self.lora_config: - self.model = self.load_lora_model(self.model, - self.model_config, - self.scheduler_config, - self.lora_config, + self.model = self.load_lora_model(self.model, self.vllm_config, self.device) if hasattr(self, "drafter"): logger.info("Loading drafter model...") diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 01d5f0525c4e2..e416f50322f48 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -11,7 +11,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import ModelConfig, SchedulerConfig +from vllm.config import VllmConfig from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -31,9 +31,7 @@ class LoRAModelRunnerMixin: LORA_WARMUP_RANK = 8 - def load_lora_model(self, model: nn.Module, model_config: ModelConfig, - scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, + def load_lora_model(self, model: nn.Module, vllm_config: VllmConfig, device: torch.device) -> nn.Module: if not supports_lora(model): @@ -44,19 +42,12 @@ class LoRAModelRunnerMixin: logger.warning("Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # Use get_text_config() in case of multimodal models - text_config = model_config.hf_config.get_text_config() - # Add LoRA Manager to the Model Runner self.lora_manager = LRUCacheWorkerLoRAManager( - scheduler_config.max_num_seqs, - scheduler_config.max_num_batched_tokens, - model_config.get_vocab_size(), - lora_config, + vllm_config, device, model.embedding_modules, model.embedding_padding_modules, - max_position_embeddings=text_config.max_position_embeddings, ) return self.lora_manager.create_lora_manager(model) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 01a8e5c3f0dba..48070c1e3e7cb 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1178,9 +1178,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): "or sharding the weights on more chips. 
" f"See the detailed error: {e}") from e if self.lora_config is not None: - model = self.load_lora_model(model, self.model_config, - self.scheduler_config, - self.lora_config, self.device) + model = self.load_lora_model(model, self.vllm_config, self.device) replace_set_lora(model) # Sync all pending XLA execution during model initialization and weight diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 594382650e3c1..c91c871766cff 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1078,20 +1078,13 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): "Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, + self.vllm_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules, - max_position_embeddings=text_config. - max_position_embeddings, ) + self.model = self.lora_manager.create_lora_manager(self.model) time_after_load = time.perf_counter() From d90e212a3a586b8a6ca9a424868abd0e6ef6779a Mon Sep 17 00:00:00 2001 From: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Date: Sat, 20 Sep 2025 02:15:13 +0800 Subject: [PATCH 472/584] Remove Redundant Assignment in Qwen3_VisionPatchMerger (#25224) Signed-off-by: Junhong <liujunhong11@huawei.com> Co-authored-by: Junhong <liujunhong11@huawei.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/qwen3_vl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c224b78e2c27c..17375ff0959d7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -223,9 +223,7 @@ class Qwen3_VisionPatchMerger(nn.Module): if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) - self.use_postshuffle_norm = use_postshuffle_norm - self.norm = norm_layer( - self.hidden_size if use_postshuffle_norm else context_dim) + self.norm = norm_layer(context_dim) self.linear_fc1 = ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=True, From 12aed7e453aea713495bd7cced6f9e2bb78aaa79 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:15:22 +0100 Subject: [PATCH 473/584] Encoder model support for the Transformers backend (#25174) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 37 ++++++++++----- tests/models/test_transformers.py | 36 ++++++++++++++- vllm/attention/backends/abstract.py | 8 ++-- vllm/model_executor/models/transformers.py | 54 +++++++++++++++++++--- 4 files changed, 111 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 3a6738a27be09..cbc0a56a645ea 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models] ### Transformers -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! 
Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases. +vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend". -To check if the modeling backend is Transformers, you can simply do this: +Currently, the Transformers backend works for the following: + +- Modalities: embedding models, language models and vision-language models* +- Architectures: encoder-only, decoder-only +- Attention types: full attention and/or sliding attention + +_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._ + +If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: + +- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature) +- Any combination of the following vLLM parallelisation schemes: + - Pipeline parallel + - Tensor parallel + +Checking if the modeling backend is Transformers is as simple as: ```python from vllm import LLM @@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers! +If the printed type starts with `Transformers...` then it's using the Transformers model implementation! -!!! tip - You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md). +If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). !!! note - vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. - -!!! note - In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. + For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. #### Custom models @@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa To make your model compatible with the Transformers backend, it needs: 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. + 1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. 
`MyModel` must contain `_supports_attention_backend = True`. -<details> +<details class="code"> <summary>modeling_my_model.py</summary> ```python @@ -78,6 +90,7 @@ from transformers import PreTrainedModel from torch import nn class MyAttention(nn.Module): + is_causal = False # Only do this for encoder-only models def forward(self, hidden_states, **kwargs): ... @@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: -<details> +<details class="code"> <summary>configuration_my_model.py</summary> ```python diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index ba9c3bebc437e..1817d4aeee9f9 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform from ..conftest import HfRunner, VllmRunner from ..utils import multi_gpu_test, prep_prompts -from .utils import check_logprobs_close +from .utils import check_embeddings_close, check_logprobs_close def check_implementation( @@ -165,6 +165,40 @@ def test_embed_loading(vllm_runner, model): assert model_config.using_transformers_backend() +@pytest.mark.parametrize( + "model", + [ + # Encoder model + "BAAI/bge-base-en-v1.5", + ]) +def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model): + import transformers + from packaging.version import Version + installed = Version(transformers.__version__) + required = Version("4.57.0.dev0") + if installed < required: + pytest.skip("Encoder models with the Transformers backend require " + f"transformers>={required}, but got {installed}") + + with vllm_runner(model, max_model_len=512, + model_impl="transformers") as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + assert model_config.using_transformers_backend() + + vllm_outputs = vllm_model.embed(example_prompts) + + with hf_runner(model, is_sentence_transformer=True) as hf_model: + hf_outputs = hf_model.encode(example_prompts) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) + + @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 75bcdc4bbcf0d..dfde67e1713c5 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -23,14 +23,14 @@ class AttentionType: Attention type. Use string to be compatible with `torch.compile`. 
""" - # Decoder attention between previous layer Q/K/V DECODER = "decoder" - # Encoder attention between previous layer Q/K/V for encoder-decoder + """Decoder attention between previous layer Q/K/V.""" ENCODER = "encoder" - # Encoder attention between previous layer Q/K/V + """Encoder attention between previous layer Q/K/V for encoder-decoder.""" ENCODER_ONLY = "encoder_only" - # Attention between dec. Q and enc. K/V for encoder-decoder + """Encoder attention between previous layer Q/K/V.""" ENCODER_DECODER = "encoder_decoder" + """Attention between dec. Q and enc. K/V for encoder-decoder.""" class AttentionBackend(ABC): diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4f51441e28efa..f40a20dee63d7 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -27,7 +27,7 @@ from transformers import (AutoModel, BatchFeature, PretrainedConfig, PreTrainedModel) from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention import Attention +from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, VllmConfig) @@ -452,8 +452,9 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): self.pp_rank = self.pp_group.rank_in_group self.tp_size = get_tensor_model_parallel_world_size() - # To be updated in child classes for use in `load_weights` - self.skip_prefixes: Optional[list[str]] = None + # Weights to skip in `self.load_weights` + self.skip_prefixes: list[str] = [] + self.skip_substrs: list[str] = [] # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` @@ -596,7 +597,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): _tensor_parallel(self.model) - def create_attention_instances(self) -> dict[int, Attention]: + def create_attention_instances( + self, + attn_type: AttentionType = AttentionType.DECODER + ) -> dict[int, Attention]: """ Create `Attention` instances to inform KV cache allocation. 
""" @@ -625,7 +629,8 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): cache_config=self.cache_config, quant_config=self.quant_config, per_layer_sliding_window=per_layer_sliding_window, - prefix=f"{i}.attn") + prefix=f"{i}.attn", + attn_type=attn_type) return attention_instances def init_parameters(self, module: nn.Module): @@ -685,7 +690,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self, skip_prefixes=self.skip_prefixes) + loader = AutoWeightsLoader( + self, + skip_prefixes=self.skip_prefixes, + skip_substrs=self.skip_substrs, + ) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) @@ -700,6 +709,37 @@ class TransformersModel(TransformersBase): "model.score": "score", }) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + # Some encoder models have the position_ids buffer in the checkpoint + # vLLM will always pass position_ids as an argument, so we skip loading + # the buffer if it exists + self.skip_substrs.append("position_ids") + + def create_attention_instances( + self, attn_type: AttentionType = AttentionType.DECODER): + # TODO(hmellor): Better way to detect encoder models + # In encoder models, the attention layers will have `is_causal=False` + is_encoder = lambda m: not getattr(m, "is_causal", True) + # vLLM does not support encoder-decoder models, so if any encoder layer + # is found, we assume the whole model is an encoder model + if any(is_encoder(m) for m in self.model.modules()): + attn_type = AttentionType.ENCODER_ONLY + + # Check minimum transformers version for encoder models support + if attn_type == AttentionType.ENCODER_ONLY: + import transformers + from packaging.version import Version + installed = Version(transformers.__version__) + required = Version("4.57.0.dev0") + if installed < required: + raise ValueError( + "Encoder models with the Transformers backend require " + f"transformers>={required}, but got {installed}") + + return super().create_attention_instances(attn_type) + @support_torch_compile(enable_if=can_enable_torch_compile) class TransformersForCausalLM(TransformersBase): @@ -710,7 +750,7 @@ class TransformersForCausalLM(TransformersBase): # Tell `TransformersBase.load_weights` to skip # `lm_head` if the model has tied word embeddings if self.text_config.tie_word_embeddings: - self.skip_prefixes = ["lm_head."] + self.skip_prefixes.append("lm_head.") if get_pp_group().is_last_rank: self.unpadded_vocab_size = self.text_config.vocab_size From 47fd08aaf9fe6616d5daf1f30c0377d8b8b7cf21 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Sat, 20 Sep 2025 02:16:32 +0800 Subject: [PATCH 474/584] [CI/Build] fix test function_calling (#25072) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> --- tests/entrypoints/openai/test_response_api_with_harmony.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 8d974d56b4450..40a22c04b08a5 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -515,6 +515,7 @@ async def test_function_calling(client: OpenAI, model_name: str): model=model_name, input="What's the weather like in Paris today?", tools=tools, + temperature=0.0, ) 
assert response is not None assert response.status == "completed" From 2506ce5189fa67fea905c1f18ca735191cec0d29 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang <Jialin.Ouyang@gmail.com> Date: Fri, 19 Sep 2025 11:22:53 -0700 Subject: [PATCH 475/584] [Core][Prefix Hash] Fix prefix hash metrics sliding window maintainance (#24990) Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> --- tests/v1/core/test_kv_cache_utils.py | 46 +++++++++++++++++++++++----- vllm/v1/core/kv_cache_utils.py | 13 ++++++-- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 319e6e84fba1e..4bf6bbbfeae28 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn): assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None)) +def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats: + return PrefixCacheStats(requests=requests, queries=queries, hits=hits) + + def test_metrics(): """ Test the prefix caching metrics. """ - - def stats(requests, queries, hits): - return PrefixCacheStats(requests=requests, queries=queries, hits=hits) - metrics = PrefixCachingMetrics(max_recent_requests=5) assert metrics.hit_rate == 0.0 - metrics.observe(stats(1, 20, 9)) + metrics.observe(_stats(1, 20, 9)) # 9 / 20 = 0.45 assert metrics.hit_rate == 0.45 - metrics.observe(stats(4, 80, 16)) + metrics.observe(_stats(4, 80, 16)) # 25 / 100 = 0.25 assert metrics.hit_rate == 0.25 - metrics.observe(stats(1, 10, 2)) + metrics.observe(_stats(1, 10, 2)) # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 assert metrics.aggregated_requests == 5 @@ -549,6 +549,38 @@ def test_metrics(): assert not metrics.query_queue +def test_metrics_empty_stats(): + """ + Test the prefix caching metrics with empty stats. + """ + metrics = PrefixCachingMetrics(max_recent_requests=5) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(1, 20, 9)) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(4, 80, 16)) + metrics.observe(_stats(0, 0, 0)) + metrics.observe(_stats(1, 10, 2)) + # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 + assert metrics.aggregated_requests == 5 + assert metrics.aggregated_query_total == 90 + assert metrics.aggregated_query_hit == 18 + assert metrics.hit_rate == 0.2 + + # Only the latest added stats preserved 10 / 20 = 0.5 + metrics.observe(_stats(11, 20, 10)) + assert metrics.aggregated_requests == 11 + assert metrics.aggregated_query_total == 20 + assert metrics.aggregated_query_hit == 10 + assert metrics.hit_rate == 0.5 + + # Only the latest added stats preserved 30 / 40 = 0.75 + metrics.observe(_stats(22, 40, 30)) + assert metrics.aggregated_requests == 22 + assert metrics.aggregated_query_total == 40 + assert metrics.aggregated_query_hit == 30 + assert metrics.hit_rate == 0.75 + + def test_get_kv_cache_configs_multiple_workers(): model_config = ModelConfig(max_model_len=16) vllm_config = VllmConfig(model_config=model_config) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9fab36aba91b3..bc2ec5e42ea20 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -127,14 +127,23 @@ class PrefixCachingMetrics: if stats.reset: self.reset() + # DO NOT appending empty stats to avoid helpful info get kicked out + # due to sliding window. + if stats.requests == 0: + return + # Update the metrics. 
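         # (each queue entry is a (requests, queries, hits) triple, and the
         # aggregated_* counters below are kept in sync with the queue)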
self.query_queue.append((stats.requests, stats.queries, stats.hits)) self.aggregated_requests += stats.requests self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits - # Remove the oldest stats if the number of requests exceeds. - if self.aggregated_requests > self.max_recent_requests: + # Remove the oldest stats until number of requests does not exceed + # the limit. + # NOTE: We preserve the latest added stats regardless. + while len( + self.query_queue + ) > 1 and self.aggregated_requests > self.max_recent_requests: old_requests, old_queries, old_hits = self.query_queue.popleft() self.aggregated_requests -= old_requests self.aggregated_query_total -= old_queries From 138f0d1e752d3a35cd959f3df8bf00370a2ace7b Mon Sep 17 00:00:00 2001 From: samzong <samzong.lu@gmail.com> Date: Sat, 20 Sep 2025 02:32:27 +0800 Subject: [PATCH 476/584] [Docs] add __init__.py to vllm/model_executor/layers/quantization/compressed_tensors/transform (#24974) Signed-off-by: samzong <samzong.lu@gmail.com> --- .../quantization/compressed_tensors/transform/__init__.py | 0 .../compressed_tensors/transform/schemes/__init__.py | 0 vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/llava_next.py | 5 +++-- 5 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c1e7a7d498b11..a3131aa3812ef 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -680,7 +680,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, batch. Info: - [Blip2ImageInputs][] + [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs] """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 9591deea06ce9..4f15e1b5762ea 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -737,7 +737,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): inputs_embeds: Optional tensor of input embeddings. Info: - [LlavaImageInputs][] + [`LlavaImageInputs`][vllm.model_executor.models.llava.LlavaImageInputs] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 5e82f9799e0fe..beb3c33100595 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -527,7 +527,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Unlike in LLaVA-1.5, the number of image tokens inputted to the language model depends on the original size of the input image. 
Including the original image token in the input, the required number of image tokens - is given by [get_llava_next_image_feature_size][]. + is given by [`LlavaNextProcessingInfo.get_num_image_tokens`][vllm.\ +model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens]. This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. @@ -540,7 +541,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, inputs_embeds: Optional tensor of input embeddings. Info: - [LlavaNextImageInputs][] + [`LlavaNextImageInputs`][vllm.model_executor.models.llava_next.LlavaNextImageInputs] """ if intermediate_tensors is not None: inputs_embeds = None From b716ab93a781156ab178513afc0e407cc72d443b Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:37:57 -0700 Subject: [PATCH 477/584] [bugfix] fix structured outputs key missing issue from #24929 (#25195) Signed-off-by: Lu Fang <fanglu@fb.com> --- vllm/v1/core/sched/scheduler.py | 4 +++- vllm/v1/structured_output/utils.py | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b08898d253cab..ef77d9e2d3ffd 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -578,8 +578,10 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) + scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs + + scheduled_resumed_reqs) structured_output_request_ids, grammar_bitmask = ( - self.get_grammar_bitmask(self.running, + self.get_grammar_bitmask(scheduled_requests, scheduled_spec_decode_tokens)) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 127c8876525b5..b9b09bea1e80f 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -90,13 +90,14 @@ def apply_grammar_bitmask( seq = sorted(scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]) for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) + if req_id in struct_out_req_batch_indices: + logit_index = struct_out_req_batch_indices[req_id] + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask From c59a0eca4204522ecaa28af1f1e38d50b1e2626f Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Fri, 19 Sep 2025 22:07:17 +0300 Subject: [PATCH 478/584] [KV offload][4/N] Offloading KV connector (#22595) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- .../unit/test_offloading_connector.py | 505 ++++++++++++++++++ tests/v1/kv_connector/unit/utils.py | 3 +- .../kv_transfer/kv_connector/factory.py | 5 + .../kv_connector/v1/offloading_connector.py | 485 +++++++++++++++++ vllm/v1/kv_offload/factory.py | 53 ++ vllm/v1/kv_offload/spec.py | 61 +++ 6 files changed, 1111 insertions(+), 1 deletion(-) create mode 100644 tests/v1/kv_connector/unit/test_offloading_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py 
create mode 100644 vllm/v1/kv_offload/factory.py create mode 100644 vllm/v1/kv_offload/spec.py diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py new file mode 100644 index 0000000000000..f9a4d2fb4de44 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -0,0 +1,505 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from typing import Any +from unittest.mock import MagicMock + +import pytest +import torch + +from vllm import SamplingParams +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_events import BlockRemoved, BlockStored +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole +from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import ( + OffloadingConnector, OffloadingConnectorMetadata) +from vllm.forward_context import ForwardContext +from vllm.utils import sha256 +from vllm.v1.core.kv_cache_utils import (BlockHash, get_request_block_hasher, + init_none_hash) +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + OffloadingManager, PrepareStoreOutput) +from vllm.v1.kv_offload.mediums import GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, + TransferResult, TransferSpec) +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput +from vllm.v1.request import Request + +from .utils import (EOS_TOKEN_ID, create_model_runner_output, create_scheduler, + create_vllm_config) + + +class MockLoadStoreSpec(LoadStoreSpec): + + def __init__(self, block_hashes: Iterable[BlockHash]): + self.block_hashes: list[BlockHash] = list(block_hashes) + + @staticmethod + def medium() -> str: + return "Mock" + + def __repr__(self) -> str: + return repr(self.block_hashes) + + +class MockOffloadingHandler(OffloadingHandler): + + def __init__(self): + self.completed_transfers: list[TransferResult] = [] + self.completed_specs: list[TransferSpec] = [] + + def get_finished(self) -> list[TransferResult]: + finished = self.completed_transfers + self.completed_transfers = [] + return finished + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + self.completed_specs.append(spec) + self.completed_transfers.append((job_id, True)) + return True + + +class MockOffloadingSpec(OffloadingSpec): + + def __init__(self, vllm_config: VllmConfig): + super().__init__(vllm_config) + + self.manager = MagicMock(spec=OffloadingManager) + self.manager.lookup.return_value = 0 + self.manager.prepare_load = lambda block_hashes: (MockLoadStoreSpec( + block_hashes)) + self.handler = MockOffloadingHandler() + + def get_manager(self) -> OffloadingManager: + return self.manager + + def get_handlers( + self, _ + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + + yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler + yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler + + def get_completed_transfers(self) -> list[TransferSpec]: + specs = self.handler.completed_specs + self.handler.completed_specs = [] + return specs + + +@dataclass +class TransferSummary: + gpu_block_indices: list[int] + offload_addresses: list[Any] + + +class RequestRunner: + + def __init__(self, offloaded_block_size: 
int, gpu_block_size: int, + num_gpu_blocks: int): + self.offloaded_block_size: int = offloaded_block_size + self.gpu_block_size: int = gpu_block_size + self.num_gpu_blocks: int = num_gpu_blocks + + self.req_id: int = -1 + + vllm_config = create_vllm_config(block_size=gpu_block_size, + max_num_batched_tokens=1000) + vllm_config.kv_transfer_config = KVTransferConfig( + kv_connector="OffloadingConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "spec_name": "MockOffloadingSpec", + "spec_module_path": + "tests.v1.kv_connector.unit.test_offloading_connector", + "block_size": offloaded_block_size, + }) + + self.scheduler: Scheduler = create_scheduler(vllm_config, + num_blocks=num_gpu_blocks) + self.worker_connector = OffloadingConnector(vllm_config, + KVConnectorRole.WORKER) + + # register worker kv_caches to enable OffloadingWorker creations + self.worker_connector.register_kv_caches( + kv_caches={"a": torch.empty(0)}) + + # extract connector of scheduler + scheduler_connector = self.scheduler.connector + assert scheduler_connector is not None + assert isinstance(scheduler_connector, OffloadingConnector) + self.scheduler_connector: OffloadingConnector = scheduler_connector + + # extract mocked OffloadingManager of scheduler connector + connector_scheduler = scheduler_connector.connector_scheduler + assert connector_scheduler is not None + manager = connector_scheduler.manager + assert isinstance(manager, MagicMock) + self.manager: MagicMock = manager + + assert connector_scheduler.gpu_block_size == gpu_block_size + assert connector_scheduler.offloaded_block_size == offloaded_block_size + + # extract OffloadingSpec of worker_connector + connector_worker = self.worker_connector.connector_worker + assert connector_worker is not None + offloading_spec = connector_worker.spec + assert isinstance(offloading_spec, MockOffloadingSpec) + self.offloading_spec: MockOffloadingSpec = offloading_spec + + # mapping (offloading address) -> gpu_block_index + self.offloaded: dict[Any, int] = {} + + self.pending_loads_count: int = 0 + self.pending_stores_count: int = 0 + + self.completed_loads: list[TransferSummary] = [] + self.completed_stores: list[TransferSummary] = [] + + # maps {block_id: block_offset} + self.gpu_block_index: dict[int, int] = {} + + init_none_hash(sha256) + self._block_hasher = get_request_block_hasher(gpu_block_size, sha256) + + self._dummy_ctx: ForwardContext = ForwardContext(no_compile_layers={}, + attn_metadata={}, + virtual_engine=0) + + def new_request(self, token_ids: list[int]): + assert not self.scheduler.requests + self.req_id += 1 + + req = Request( + request_id=str(self.req_id), + prompt_token_ids=token_ids, + sampling_params=SamplingParams(max_tokens=1000), + pooling_params=None, + eos_token_id=EOS_TOKEN_ID, + block_hasher=self._block_hasher, + ) + + self.scheduler.add_request(req) + + def _wait_for_transfers(self): + block_size_factor = self.offloaded_block_size // self.gpu_block_size + + while self.pending_loads_count or self.pending_stores_count: + for transfer_spec in ( + self.offloading_spec.get_completed_transfers()): + src_spec, dst_spec = transfer_spec + + if isinstance(src_spec, GPULoadStoreSpec): + store = True + gpu_spec = src_spec + offload_spec = dst_spec + else: + store = False + gpu_spec = dst_spec + offload_spec = src_spec + + assert isinstance(offload_spec, MockLoadStoreSpec) + assert isinstance(gpu_spec, GPULoadStoreSpec) + + gpu_block_indices: list[int] = [] + for block_id in gpu_spec.block_ids: + gpu_block_indices.append( + 
self.gpu_block_index[block_id.item()]) + + # list of (block_hash, sub_block_offset) + offload_addresses: list[Any] = [] + for block_hash in offload_spec.block_hashes: + for sub_block_idx in range(block_size_factor): + offload_addresses.append((block_hash, sub_block_idx)) + + if store: + assert len(gpu_block_indices) == len(offload_addresses) + + self.completed_stores.append( + TransferSummary(gpu_block_indices, offload_addresses)) + self.pending_stores_count -= 1 + else: + remainder_sub_block_count = (len(offload_addresses) - + len(gpu_block_indices)) + assert remainder_sub_block_count >= 0 + assert remainder_sub_block_count < block_size_factor + offload_addresses = offload_addresses[ + remainder_sub_block_count:] + + self.completed_loads.append( + TransferSummary(gpu_block_indices, offload_addresses)) + self.pending_loads_count -= 1 + + def _update_gpu_block_idx(self): + for blocks in (self.scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks.values()): + for block_idx, block in enumerate(blocks): + self.gpu_block_index[block.block_id] = block_idx + + def _run(self, decoded_tokens: list[int]): + """ + Runs multiple engine (scheduler + worker) steps. + Assumes a single request is running. + + Args: + decoded_tokens: the tokens to yield at each step. + """ + + tokens_iter = iter(decoded_tokens) + token_id = next(tokens_iter, None) + while token_id is not None: + assert self.scheduler.requests + + scheduler_output = self.scheduler.schedule() + self._update_gpu_block_idx() + + kv_connector_metadata = scheduler_output.kv_connector_metadata + assert kv_connector_metadata is not None + assert isinstance(kv_connector_metadata, + OffloadingConnectorMetadata) + + self.pending_loads_count += len(kv_connector_metadata.reqs_to_load) + self.pending_stores_count += len( + kv_connector_metadata.reqs_to_store) + + self.worker_connector.bind_connector_metadata( + kv_connector_metadata) + self.worker_connector.start_load_kv(self._dummy_ctx) + + if scheduler_output.total_num_scheduled_tokens > 0: + self.worker_connector.wait_for_save() + + finished_sending, finished_recving = ( + self.worker_connector.get_finished( + scheduler_output.finished_req_ids)) + + self.worker_connector.clear_connector_metadata() + + model_runner_output = create_model_runner_output( + reqs=self.scheduler.running, + finished_sending=list(finished_sending), + finished_recving=list(finished_recving), + token_id=token_id) + + if self.scheduler.running: + token_id = next(tokens_iter, None) + + self.scheduler.update_from_output(scheduler_output, + model_runner_output) + + self._wait_for_transfers() + + # run one more step to update finished stored + if EOS_TOKEN_ID in decoded_tokens: + assert not self.scheduler.running + + while self.scheduler.requests: + scheduler_output = self.scheduler.schedule() + + finished_sending, finished_recving = ( + self.worker_connector.get_finished( + scheduler_output.finished_req_ids)) + + assert not finished_recving + + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending) + + self.scheduler.update_from_output(scheduler_output, + model_runner_output) + + def run( + self, + decoded_tokens: list[int], + expected_stored_gpu_block_indexes: tuple[int, ...] = (), + expected_loaded_gpu_block_indexes: tuple[int, ...] = (), + ): + """ + Runs multiple engine (scheduler + worker) steps. + Assumes a single request is running. + + Args: + decoded_tokens: the tokens to yield at each step. 
+ expected_stored_gpu_block_indexes: GPU block indexes + that are expected to be written during the run. + expected_loaded_gpu_block_indexes: GPU block indexes + that are expected to be loaded during the run. + """ + + self.manager.reset_mock() + self._run(decoded_tokens) + + loaded_gpu_block_indexes: set[int] = set() + for transfer in self.completed_loads: + for gpu_block_idx, offloaded_address in zip( + transfer.gpu_block_indices, transfer.offload_addresses): + loaded_gpu_block_indexes.add(gpu_block_idx) + assert gpu_block_idx == self.offloaded[offloaded_address] + + assert ( + set(expected_loaded_gpu_block_indexes) == loaded_gpu_block_indexes) + self.completed_loads.clear() + + stored_gpu_block_indexes: set[int] = set() + for transfer in self.completed_stores: + for gpu_block_idx, offloaded_address in zip( + transfer.gpu_block_indices, transfer.offload_addresses): + stored_gpu_block_indexes.add(gpu_block_idx) + self.offloaded[offloaded_address] = gpu_block_idx + + assert ( + set(expected_stored_gpu_block_indexes) == stored_gpu_block_indexes) + self.completed_stores.clear() + + +@pytest.fixture +def request_runner(): + runners = [] + + def runner_factory(offloaded_block_size, gpu_block_size, num_gpu_blocks): + runner = RequestRunner(offloaded_block_size=offloaded_block_size, + gpu_block_size=gpu_block_size, + num_gpu_blocks=num_gpu_blocks) + runners.append(runner) + return runner + + yield runner_factory # pass factory to the test + + +def generate_store_output(block_hashes: Iterable[BlockHash]): + block_hashes = list(block_hashes) + return PrepareStoreOutput( + block_hashes_to_store=list(block_hashes), + store_spec=MockLoadStoreSpec(block_hashes), + block_hashes_evicted=[], + ) + + +def test_offloading_connector(request_runner): + offloaded_block_size = 12 + gpu_block_size = 4 + num_gpu_blocks = 100 + block_size_factor = offloaded_block_size // gpu_block_size + + runner = request_runner(offloaded_block_size=offloaded_block_size, + gpu_block_size=gpu_block_size, + num_gpu_blocks=num_gpu_blocks) + + # 3 blocks, store just the middle block (skip first and last) + # blocks = [0, 1, 2], [3, 4, 5], [6, 7, 8] + runner.new_request(token_ids=[0] * offloaded_block_size * 3) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output(list(block_hashes)[1:2]) + runner.run(decoded_tokens=[0], expected_stored_gpu_block_indexes=(3, 4, 5)) + + # add block missing 1 token -> no offload + runner.run(decoded_tokens=[0] * (offloaded_block_size - 1)) + runner.manager.prepare_store.assert_not_called() + + # +1 token -> single block, fail prepare_store + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: None + runner.run(decoded_tokens=[0]) + runner.manager.prepare_store.assert_called() + + # 1 more block, now set block_hashes_to_store = [] + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[0] * offloaded_block_size) + + # 1 more block, now check touch was called with all 6 blocks + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output(block_hashes) + runner.run(decoded_tokens=[0] * offloaded_block_size, + expected_stored_gpu_block_indexes=(15, 16, 17)) + runner.manager.touch.assert_called() + block_hashes1 = list(runner.manager.touch.call_args.args[0]) + assert len(block_hashes1) == 6 + + # terminate request + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + + # create a new request differing only on the last token + runner.new_request(token_ids=[0] * 
(offloaded_block_size * 6 - 1) + [1]) + runner.run(decoded_tokens=[0], + expected_stored_gpu_block_indexes=tuple( + range(6 * block_size_factor))) + runner.manager.touch.assert_called() + block_hashes2 = list(runner.manager.touch.call_args.args[0]) + assert len(block_hashes2) == 6 + + # verify hashes are the same, except for the last block + assert block_hashes1[:5] == block_hashes2[:5] + assert block_hashes1[5] != block_hashes2[5] + + # terminate request + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + + # full_block_tokens - num_computed_tokens < offloaded_block_size + runner.new_request(token_ids=[0] * gpu_block_size + [1] * + (offloaded_block_size - gpu_block_size)) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + runner.manager.lookup.assert_not_called() + + # single block lookup with no hits + runner.new_request(token_ids=[1] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.run(decoded_tokens=[EOS_TOKEN_ID]) + runner.manager.lookup.assert_called() + assert len(list(runner.manager.lookup.call_args.args[0])) == 1 + + # single block lookup with a hit + runner.scheduler.reset_prefix_cache() + runner.new_request(token_ids=[0] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.manager.lookup.return_value = 1 + runner.run(decoded_tokens=[EOS_TOKEN_ID], + expected_loaded_gpu_block_indexes=(0, 1, 2)) + + # single block lookup with a hit in a middle block + runner.new_request(token_ids=[0] * offloaded_block_size * 2 + + [1] * offloaded_block_size) + runner.manager.prepare_store.side_effect = \ + lambda block_hashes: generate_store_output([]) + runner.manager.lookup.return_value = 1 + runner.run(decoded_tokens=[EOS_TOKEN_ID], + expected_loaded_gpu_block_indexes=(3, 4, 5)) + + # test take_events + def to_hashes(int_hashes: list[int]) -> list[BlockHash]: + return [BlockHash(str(i).encode()) for i in int_hashes] + + def take_events() -> Iterable[OffloadingEvent]: + yield OffloadingEvent(block_hashes=to_hashes([1, 2, 3]), + block_size=16, + medium="A", + removed=False) + yield OffloadingEvent(block_hashes=to_hashes([4, 5, 6]), + block_size=32, + medium="B", + removed=True) + + runner.manager.take_events.side_effect = take_events + events = list(runner.scheduler_connector.take_events()) + assert len(events) == 2 + event = events[0] + assert isinstance(event, BlockStored) + assert event.block_hashes == to_hashes([1, 2, 3]) + assert event.block_size == 16 + assert event.medium == "A" + assert event.token_ids == [] + assert event.parent_block_hash is None + assert event.lora_id is None + event = events[1] + assert isinstance(event, BlockRemoved) + assert event.block_hashes == to_hashes([4, 5, 6]) + assert event.medium == "B" diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 0cae1c7bc0518..de52668e3dcf1 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -176,6 +176,7 @@ def create_model_runner_output( finished_sending: Optional[list[str]] = None, finished_recving: Optional[list[str]] = None, use_eos: bool = False, + token_id: int = 0, ) -> ModelRunnerOutput: """Make dummy model runner output for testing.""" @@ -184,7 +185,7 @@ def create_model_runner_output( req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)} # Make sampled tokens. 
- sampled_token = EOS_TOKEN_ID if use_eos else 0 + sampled_token = EOS_TOKEN_ID if use_eos else token_id sampled_token_ids = [[sampled_token] for _ in req_ids] kv_connector_output = None if ( diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 670f9c26b2104..873f130ed827e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -106,3 +106,8 @@ KVConnectorFactory.register_connector( "MultiConnector", "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector", "MultiConnector") + +KVConnectorFactory.register_connector( + "OffloadingConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector", + "OffloadingConnector") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py new file mode 100644 index 0000000000000..c23efa604544d --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -0,0 +1,485 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections import defaultdict +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from itertools import islice +from typing import Any, Optional + +import torch + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent +from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, + KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorMetadata) +from vllm.forward_context import ForwardContext +from vllm.logger import init_logger +from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_offload.abstract import OffloadingManager +from vllm.v1.kv_offload.factory import OffloadingSpecFactory +from vllm.v1.kv_offload.mediums import GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.worker import OffloadingWorker, TransferSpec +from vllm.v1.outputs import KVConnectorOutput +from vllm.v1.request import Request + +ReqId = str + +logger = init_logger(__name__) + + +@dataclass +class OffloadingConnectorMetadata(KVConnectorMetadata): + reqs_to_load: dict[ReqId, TransferSpec] + reqs_to_store: dict[ReqId, TransferSpec] + + +class OffloadingConnector(KVConnectorBase_V1): + + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + super().__init__(vllm_config, role) + + spec = OffloadingSpecFactory.create_spec(vllm_config) + + self.connector_scheduler: Optional[OffloadingConnectorScheduler] = None + self.connector_worker: Optional[OffloadingConnectorWorker] = None + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler = OffloadingConnectorScheduler(spec) + elif role == KVConnectorRole.WORKER: + self.connector_worker = OffloadingConnectorWorker(spec) + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + OffloadingConnectorMetadata) + 
self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + pass + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + pass + + def wait_for_save(self): + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + OffloadingConnectorMetadata) + self.connector_worker.start_store_kv(self._connector_metadata) + + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + assert self.connector_worker is not None + return self.connector_worker.get_finished(finished_req_ids) + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens) + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens) + + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def update_connector_output(self, connector_output: KVConnectorOutput): + assert self.connector_scheduler is not None + self.connector_scheduler.update_connector_output(connector_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + def take_events(self) -> Iterable[KVCacheEvent]: + assert self.connector_scheduler is not None + return self.connector_scheduler.take_events() + + +class OffloadingConnectorScheduler: + """Implementation of Scheduler side methods""" + + def __init__(self, spec: OffloadingSpec): + self.gpu_block_size = spec.gpu_block_size + self.offloaded_block_size = spec.offloaded_block_size + self.block_size_factor = (self.offloaded_block_size // + self.gpu_block_size) + self.manager: OffloadingManager = spec.get_manager() + + self._requests: dict[ReqId, Request] = {} + # list of GPU block IDs per request + self._request_block_ids: dict[ReqId, list[int]] = {} + # requests to load for the current scheduler step + self._reqs_to_load: dict[ReqId, TransferSpec] = {} + # request blocks are stored in order + # index of next block (of size offloaded_block_size) to offload + self._next_stored_block_idx: dict[ReqId, int] = {} + + # request ID -> set(block hashes being stored/load) + self._reqs_being_stored = defaultdict[ReqId, set[BlockHash]](set) + self._reqs_being_loaded = defaultdict[ReqId, set[BlockHash]](set) + + def _get_block_hashes( + self, + req: Request, + start_idx: int = 0, + end_idx: Optional[int] = None, + ) -> Iterable[BlockHash]: + return islice( + req.block_hashes, + self.block_size_factor * start_idx + self.block_size_factor - 1, + self.block_size_factor * end_idx if end_idx else None, + self.block_size_factor) + + def get_num_new_matched_tokens( + self, request: Request, + num_computed_tokens: int) -> tuple[int, bool]: + """ + Get number of new tokens that can be loaded beyond the + num_computed_tokens. + + Args: + request (Request): the request object. 
+ num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + A tuple with the following elements: + - The number of tokens that can be loaded beyond what is + already computed. + - `True` if tokens will be loaded asynchronously + (between scheduler steps). + """ + num_blocks = request.num_tokens // self.offloaded_block_size + + assert (len(request.block_hashes) // + self.block_size_factor == num_blocks) + block_hashes = self._get_block_hashes(request) + + self.manager.touch(block_hashes) + + full_block_tokens = self.offloaded_block_size * num_blocks + if full_block_tokens - num_computed_tokens < self.offloaded_block_size: + # we can load less than a block, skip + return 0, False + + start_block_idx = num_computed_tokens // self.offloaded_block_size + hits = self.manager.lookup( + self._get_block_hashes(request, start_idx=start_block_idx)) + if hits == 0: + return 0, False + + num_hit_tokens = (self.offloaded_block_size * + (start_block_idx + hits) - num_computed_tokens) + logger.debug( + "Request %s hit %s offloaded tokens after %s GPU hit tokens", + request.request_id, + num_hit_tokens, + num_computed_tokens, + ) + if num_hit_tokens < self.offloaded_block_size: + return 0, False + + return num_hit_tokens, True + + def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, + num_external_tokens: int): + self._requests[request.request_id] = request + # the block ids are updated in _get_reqs_to_store + self._request_block_ids[request.request_id] = [] + + if num_external_tokens == 0: + return + + block_groups = blocks.get_block_ids() + block_ids = block_groups[0] + + num_computed_gpu_blocks = sum(block.block_hash is not None + for block in blocks.blocks[0]) + num_computed_tokens = num_computed_gpu_blocks * self.gpu_block_size + full_block_tokens = num_computed_tokens + num_external_tokens + assert full_block_tokens % self.offloaded_block_size == 0 + + num_pending_gpu_blocks = len(block_ids) - num_computed_gpu_blocks + assert (num_external_tokens == num_pending_gpu_blocks * + self.gpu_block_size) + + start_block_idx = num_computed_tokens // self.offloaded_block_size + num_blocks = full_block_tokens // self.offloaded_block_size + + assert (len(request.block_hashes) // self.block_size_factor + >= num_blocks) + block_hashes = self._get_block_hashes(request, + start_idx=start_block_idx, + end_idx=num_blocks) + + src_spec = self.manager.prepare_load(block_hashes) + dst_spec = GPULoadStoreSpec(block_ids[num_computed_gpu_blocks:]) + + block_hashes = self._get_block_hashes(request, + start_idx=start_block_idx, + end_idx=num_blocks) + + self._reqs_to_load[request.request_id] = (src_spec, dst_spec) + self._reqs_being_loaded[request.request_id].update(block_hashes) + self._next_stored_block_idx[request.request_id] = num_blocks + + def _get_reqs_to_store(self, scheduler_output: SchedulerOutput): + reqs_to_store: dict[ReqId, TransferSpec] = {} + # iterate over both new and cached requests + for req_id, new_block_id_groups, preempted in yield_req_data( + scheduler_output): + + if preempted: + self._request_block_ids[req_id] = [] + + if new_block_id_groups: + new_block_ids = new_block_id_groups[0] + self._request_block_ids[req_id] += new_block_ids + + block_ids = self._request_block_ids[req_id] + + req = self._requests[req_id] + new_tokens = scheduler_output.num_scheduled_tokens[req_id] + total_tokens = req.num_computed_tokens + new_tokens + num_blocks = total_tokens // self.offloaded_block_size + start_block_idx = 
self._next_stored_block_idx.get(req_id, 0) + num_new_blocks = num_blocks - start_block_idx + + if num_new_blocks <= 0: + continue + + num_gpu_blocks = num_blocks * self.block_size_factor + assert len(req.block_hashes) >= num_gpu_blocks + + new_block_hashes = self._get_block_hashes( + req, start_idx=start_block_idx, end_idx=num_blocks) + store_output = self.manager.prepare_store(new_block_hashes) + if store_output is None: + logger.warning("Cannot store %s blocks", num_new_blocks) + break + + self._next_stored_block_idx[req_id] = num_blocks + + if not store_output.block_hashes_to_store: + continue + block_hashes_to_store = set(store_output.block_hashes_to_store) + + block_hashes = self._get_block_hashes(req, end_idx=num_blocks) + self.manager.touch(block_hashes) + + new_block_hashes = self._get_block_hashes( + req, start_idx=start_block_idx, end_idx=num_blocks) + dst_spec = store_output.store_spec + src_block_ids: list[int] = [] + for idx, blk_hash in enumerate(new_block_hashes): + if blk_hash not in block_hashes_to_store: + continue + offloaded_block_idx = start_block_idx + idx + gpu_block_idx = offloaded_block_idx * self.block_size_factor + for i in range(self.block_size_factor): + src_block_ids.append(block_ids[gpu_block_idx + i]) + src_spec = GPULoadStoreSpec(src_block_ids) + + reqs_to_store[req_id] = (src_spec, dst_spec) + self._reqs_being_stored[req_id] |= block_hashes_to_store + + logger.debug( + "Request %s offloading %s blocks starting from block #%d", + req_id, + len(block_hashes_to_store), + start_block_idx, + ) + + return reqs_to_store + + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + meta = OffloadingConnectorMetadata( + reqs_to_load=self._reqs_to_load, + reqs_to_store=self._get_reqs_to_store(scheduler_output)) + self._reqs_to_load = {} + return meta + + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + for req_id in connector_output.finished_sending or []: + block_hashes = self._reqs_being_stored.pop(req_id, None) + if block_hashes: + self.manager.complete_store(block_hashes) + + for req_id in connector_output.finished_recving or []: + block_hashes = self._reqs_being_loaded.pop(req_id, None) + if block_hashes: + self.manager.complete_load(block_hashes) + + def request_finished( + self, + request: Request, + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + """ + Called when a request has finished, before its blocks are freed. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + req_id = request.request_id + self._requests.pop(req_id, None) + self._request_block_ids.pop(req_id, None) + self._next_stored_block_idx.pop(req_id, None) + + request_being_stored = req_id in self._reqs_being_stored + return request_being_stored, None + + def take_events(self) -> Iterable[KVCacheEvent]: + """Take the KV cache events from the connector. + + Returns: + A list of KV cache events. 
+ """ + for event in self.manager.take_events(): + if event.removed: + yield BlockRemoved(block_hashes=event.block_hashes, + medium=event.medium) + else: + yield BlockStored(block_hashes=event.block_hashes, + parent_block_hash=None, + token_ids=[], + lora_id=None, + block_size=event.block_size, + medium=event.medium) + + +class OffloadingConnectorWorker: + """Implementation of Worker side methods""" + + def __init__(self, spec: OffloadingSpec): + self.spec = spec + self.worker = OffloadingWorker() + + self._job_counter = 0 + + # req_id -> (job_id, store) + self._jobs: dict[int, tuple[ReqId, bool]] = {} + # req_id -> active job IDs + self._load_job: dict[ReqId, int] = {} + # req_id -> set(active job IDs) + self._store_jobs = defaultdict[ReqId, set[int]](set) + + self._finished_reqs_waiting_for_store: set[ReqId] = set() + + def _generate_job_id(self) -> int: + job_id = self._job_counter + self._job_counter = job_id + 1 + return job_id + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + for src_cls, dst_cls, handler in (self.spec.get_handlers(kv_caches)): + self.worker.register_handler(src_cls, dst_cls, handler) + + def start_load_kv(self, metadata: OffloadingConnectorMetadata): + for req_id, transfer_spec in metadata.reqs_to_load.items(): + job_id = self._generate_job_id() + self._jobs[job_id] = (req_id, False) + assert req_id not in self._load_job + self._load_job[req_id] = job_id + assert self.worker.transfer_async(job_id, transfer_spec) + + def start_store_kv(self, metadata: OffloadingConnectorMetadata): + for req_id, transfer_spec in metadata.reqs_to_store.items(): + job_id = self._generate_job_id() + self._jobs[job_id] = (req_id, True) + self._store_jobs[req_id].add(job_id) + assert self.worker.transfer_async(job_id, transfer_spec) + + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + Returns a list of request IDs that finished loading or storing. + + Returns: + ids of requests that have finished asynchronous transfer + tuple of (sending/saving ids, recving/loading ids). 
+ """ + finished_sending = set() + finished_recving = set() + for job_id, success in self.worker.get_finished(): + # we currently do not support job failures + assert success + req_id, store = self._jobs.pop(job_id) + if store: + req_jobs = self._store_jobs[req_id] + req_jobs.remove(job_id) + if req_jobs: + continue + + if req_id in self._finished_reqs_waiting_for_store: + self._finished_reqs_waiting_for_store.remove(req_id) + finished_sending.add(req_id) + del self._store_jobs[req_id] + else: + req_job = self._load_job[req_id] + assert job_id == req_job + del self._load_job[req_id] + finished_recving.add(req_id) + + for req_id in finished_req_ids: + pending_req_jobs = self._store_jobs.get(req_id) + if pending_req_jobs: + self._finished_reqs_waiting_for_store.add(req_id) + elif pending_req_jobs is not None: + finished_sending.add(req_id) + del self._store_jobs[req_id] + + return finished_sending, finished_recving + + +def yield_req_data( + scheduler_output) -> Iterator[tuple[str, tuple[list[int], ...], bool]]: + """ + Yields: + (req_id, new_block_id_groups, preempted) + """ + # new requests + for req_data in scheduler_output.scheduled_new_reqs: + yield req_data.req_id, req_data.block_ids, False + + # cached requests + cached_reqs = scheduler_output.scheduled_cached_reqs + yield from zip(cached_reqs.req_ids, cached_reqs.new_block_ids, + cached_reqs.resumed_from_preemption) diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py new file mode 100644 index 0000000000000..6365ab4a6db75 --- /dev/null +++ b/vllm/v1/kv_offload/factory.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +from typing import TYPE_CHECKING, Callable + +from vllm.logger import init_logger +from vllm.v1.kv_offload.spec import OffloadingSpec + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + + +class OffloadingSpecFactory: + _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {} + + @classmethod + def register_spec(cls, name: str, module_path: str, + class_name: str) -> None: + """Register a spec with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> type[OffloadingSpec]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_spec( + cls, + config: "VllmConfig", + ) -> OffloadingSpec: + kv_transfer_config = config.kv_transfer_config + assert kv_transfer_config is not None + extra_config = kv_transfer_config.kv_connector_extra_config + spec_name = extra_config.get("spec_name", "CPUOffloadingSpec") + if spec_name in cls._registry: + spec_cls = cls._registry[spec_name]() + else: + spec_module_path = extra_config.get("spec_module_path") + if spec_module_path is None: + raise ValueError(f"Unsupported spec type: {spec_name}") + spec_module = importlib.import_module(spec_module_path) + spec_cls = getattr(spec_module, spec_name) + assert issubclass(spec_cls, OffloadingSpec) + logger.info("Creating offloading spec with name: %s", spec_name) + return spec_cls(config) + + +# Register various specs here. 
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py new file mode 100644 index 0000000000000..ed23d5e51934f --- /dev/null +++ b/vllm/v1/kv_offload/spec.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Iterator +from typing import TYPE_CHECKING + +import torch + +from vllm.logger import init_logger +from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager +from vllm.v1.kv_offload.worker.worker import OffloadingHandler + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + + +class OffloadingSpec(ABC): + """Spec for an offloading connector""" + + def __init__(self, vllm_config: "VllmConfig"): + logger.warning( + "Initializing OffloadingSpec. This API is experimental and " + "subject to change in the future as we iterate the design.") + self.vllm_config = vllm_config + + kv_transfer_config = vllm_config.kv_transfer_config + assert kv_transfer_config is not None + self.extra_config = kv_transfer_config.kv_connector_extra_config + + self.gpu_block_size = vllm_config.cache_config.block_size + self.offloaded_block_size = int( + self.extra_config.get("block_size", self.gpu_block_size)) + + assert self.offloaded_block_size % self.gpu_block_size == 0 + + @abstractmethod + def get_manager(self) -> OffloadingManager: + """ + Get an OffloadingManager that will be used + by the scheduler-side offloading connector to track + offloaded blocks and manage evictions. + """ + pass + + @abstractmethod + def get_handlers( + self, kv_caches: dict[str, torch.Tensor] + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + """ + Get offloading handlers along with their respective src and dst types. + + Args: + kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor. + + Yields: + Tuples of (src_type, dst_type, offloading_handler). 
+ """ + pass From a2a5f79e09075f50943750a2376b3b5d95a47ac5 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:07:26 -0700 Subject: [PATCH 479/584] Optimize triton unified attention performance for sliding window attention (#24390) Signed-off-by: zixi-qi <qizixi@meta.com> --- .../test_triton_unified_attention.py | 2 +- .../attention/ops/triton_unified_attention.py | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index ab91560e995c8..5cff29b15aa3f 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -83,7 +83,7 @@ def ref_paged_attn( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 591b68bfa6468..9e7cafc174287 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -184,8 +184,30 @@ def kernel_unified_attention_2d( # this prefix can be skipped) num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) - # iterate through tiles - for j in range(0, num_tiles): + # ---- Sliding-window tile pruning -------------------- + # Default: keep previous global behavior + tile_start = 0 + tile_end = num_tiles + if SLIDING_WINDOW > 0: + # Query rows covered by this Q-block + qpos_lo = q_block_local_idx * BLOCK_Q + qpos_hi = tl.minimum( + qpos_lo + (BLOCK_M - 1) // num_queries_per_kv, + cur_batch_query_len - 1, + ) + # For sliding window, each query position q can only attend to + # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs] + # where q_abs = context_len + q + # The union of allowed key positions for this Q-block is: + # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi] + first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1 + last_allowed_key = context_len + qpos_hi + # Convert to tile indices and clamp + tile_start = tl.maximum(0, first_allowed_key // TILE_SIZE) + tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles) + + # iterate through tiles (now limited to the sliding window range) + for j in range(tile_start, tile_end): seq_offset = j * TILE_SIZE + offs_t tile_mask = seq_offset < max_seq_prefix_len From 7852b82b93d189a5374d403f9735d6849ba4c64e Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath <varunsundar08@gmail.com> Date: Fri, 19 Sep 2025 15:14:09 -0400 Subject: [PATCH 480/584] [Bugfix] GPT OSS Attritbute error on H100 (#25228) Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> --- vllm/model_executor/layers/fused_moe/config.py | 4 ++-- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 742df3dbdc6af..b14bc06e913cf 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ 
b/vllm/model_executor/layers/fused_moe/config.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import torch @@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.utils import cdiv, has_triton_kernels from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe -if TYPE_CHECKING and has_triton_kernels: +if has_triton_kernels(): from triton_kernels.matmul_ogs import PrecisionConfig logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 28c1e60ccd08a..5c3f8a891276b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -638,8 +638,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return None if self.mxfp4_backend == Mxfp4Backend.TRITON: - w1_scale = layer.w13_precision_config - w2_scale = layer.w2_precision_config + w1_scale = self.w13_precision_config + w2_scale = self.w2_precision_config else: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale From 4bdf40021821dfb085e2cbe2f4dc0b7ad7b2e635 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:42:01 -0400 Subject: [PATCH 481/584] [Bugfix] Fix chunked a2_scales in modular kernels (#25264) Signed-off-by: Bill Nell <bnell@redhat.com> --- .../model_executor/layers/fused_moe/batched_deep_gemm_moe.py | 1 + .../layers/fused_moe/batched_triton_or_deep_gemm_moe.py | 3 ++- vllm/model_executor/layers/fused_moe/cutlass_moe.py | 4 +++- vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 3 ++- .../layers/fused_moe/flashinfer_cutlass_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 4 +++- vllm/model_executor/layers/fused_moe/fused_moe.py | 3 ++- .../layers/fused_moe/gpt_oss_triton_kernels_moe.py | 1 + vllm/model_executor/layers/fused_moe/modular_kernel.py | 5 +++++ vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 2 ++ vllm/model_executor/layers/fused_moe/trtllm_moe.py | 1 + 11 files changed, 23 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index e9dfb22bea27b..cf0b965cc8c51 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -286,6 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index 8b9070f098898..c3c4f4a5d190e 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -126,6 +126,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: 
torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -136,5 +137,5 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): assert experts is not None experts.apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, - workspace13, workspace2, expert_tokens_meta, + a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 957ffca0d1246..8c2ff580575f5 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -241,6 +241,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -262,7 +263,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, self.w1_scale, self.w2_scale, - a1q_scale, self.a2_scale, self.ab_strides1, self.ab_strides2, + a1q_scale, a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, @@ -705,6 +706,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], # unused + a2_scale: Optional[torch.Tensor], # unused workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 8830b95df7cf0..51a4f275e98cc 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -214,13 +214,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): assert a1q_scale is not None - assert self.a2_scale is None + assert a2_scale is None assert self.block_shape is not None assert self.w1_scale is not None assert self.w2_scale is not None diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 6eeec18a6ec87..a074da883088e 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -129,6 +129,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: Optional[torch.Tensor], workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index fe6ac458a9593..660bae3146026 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -688,6 +688,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -879,6 +880,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -970,7 +972,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache1.view(-1, N)) qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input( - intermediate_cache2, self.a2_scale, max_num_tokens, E, N, + intermediate_cache2, a2_scale, max_num_tokens, E, N, expert_num_tokens, self.quant_dtype, self.per_act_token_quant, self.block_shape) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index d4de3f640865e..6c2a5bda7cbaa 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1598,6 +1598,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -1690,7 +1691,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): a2q_scale: Optional[torch.Tensor] = None qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( - intermediate_cache2, self.a2_scale, self.quant_dtype, + intermediate_cache2, a2_scale, self.quant_dtype, self.per_act_token_quant, self.block_shape) invoke_fused_moe_kernel( diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 614a83ad1158c..08a9b34a42457 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -179,6 +179,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 729f8e39cf0f7..a16c254fadf66 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -519,6 +519,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[ExpertTokensMetadata], @@ -634,6 +635,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: 
Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, ) -> torch.Tensor: @@ -671,6 +673,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts=global_num_experts, expert_map=expert_map, a1q_scale=a1q_scale, + a2_scale=a2_scale, workspace13=workspace13, workspace2=workspace2, expert_tokens_meta=expert_tokens_meta, @@ -718,6 +721,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts=local_num_experts, expert_map=expert_map, a1q_scale=a1q_scale, + a2_scale=self.fused_experts.a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -803,6 +807,7 @@ class FusedMoEModularKernel(torch.nn.Module): local_num_experts=local_num_experts, expert_map=expert_map, a1q_scale=c_a1q_scale, + a2_scale=c_a2_scale, expert_tokens_meta=c_expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index b2dbc306a6148..3de80ff85747e 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -111,6 +111,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], @@ -134,6 +135,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts, expert_map, a1q_scale, + a2_scale, workspace13, workspace2, expert_tokens_meta, diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 8e5f6acc9df63..05ed93c942c8e 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -103,6 +103,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, expert_map: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], From e57fc15971dd3c3aff9b2bdcb43741b5f34c329e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:43:33 +0100 Subject: [PATCH 482/584] Specify platform in `pip-compile` `pre-commit` hook so it runs on MacOS (#25273) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- requirements/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13ad3af97d839..a4ea888af3f3e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/requirements/test.txt b/requirements/test.txt index 39040f210b2fd..3519aa524f418 100644 --- 
a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28 absl-py==2.1.0 # via rouge-score accelerate==1.0.1 From 48ecb4438b2845f757edf228c5455ca6095938af Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Fri, 19 Sep 2025 16:06:49 -0400 Subject: [PATCH 483/584] [Perf] Use FlashInfer RoPE for RotaryEmbedding.forward_cuda when available (#21126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- .../layers/rotary_embedding/base.py | 38 +++++++++++---- .../layers/rotary_embedding/common.py | 46 +++++++++++++++++++ .../rotary_embedding/deepseek_scaling_rope.py | 4 +- .../rotary_embedding/llama4_vision_rope.py | 2 +- .../layers/rotary_embedding/mrope.py | 2 + 5 files changed, 78 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index db50eb08db3ff..3dc249ae9adb9 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,6 +6,8 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -30,9 +32,17 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # Flashinfer only supports head_size=64, 128, 256, 512. 
+ # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 + self.use_flashinfer = (self.enabled() + and dtype in (torch.float16, torch.bfloat16) + and current_platform.is_cuda() + and has_flashinfer() + and self.head_size in [64, 128, 256, 512]) cache = self._compute_cos_sin_cache() - cache = cache.to(dtype) + if not self.use_flashinfer: + cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) @@ -57,6 +67,14 @@ class RotaryEmbedding(CustomOp): cache = torch.cat((cos, sin), dim=-1) return cache + def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None: + # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) + # is expensive, so avoid calling it if possible + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + def forward_native( self, positions: torch.Tensor, @@ -94,15 +112,16 @@ class RotaryEmbedding(CustomOp): query: torch.Tensor, key: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.use_flashinfer: + torch.ops.vllm.flashinfer_rotary_embedding(positions, query, key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style) + return query, key + from vllm import _custom_ops as ops - # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) - # is expensive, so avoid calling it if possible - if self.cos_sin_cache.device != query.device or \ - self.cos_sin_cache.dtype != query.dtype: - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - + self._match_cos_sin_cache_dtype(query) # ops.rotary_embedding() is an in-place operation # that updates the query and key tensors. ops.rotary_embedding(positions, query, key, self.head_size, @@ -117,8 +136,7 @@ class RotaryEmbedding(CustomOp): ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: from vllm._ipex_ops import ipex_ops as ops - self.cos_sin_cache = self.cos_sin_cache.to(positions.device, - dtype=query.dtype) + self._match_cos_sin_cache_dtype(query) # ops.rotary_embedding() is an in-place operation # that updates the query and key tensors. if key is None: diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 8d821bea19e3e..e3cd0a8e788eb 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -6,6 +6,7 @@ import math import torch from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op if current_platform.is_cuda(): from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb @@ -103,3 +104,48 @@ def yarn_get_mscale(scale: float = 1) -> float: if scale <= 1: return 1.0 return 0.1 * math.log(scale) + 1.0 + + +def _flashinfer_rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + """Custom op wrapper for flashinfer's rotary embedding. + + This is an in-place operation that modifies query and key tensors directly. 
+ """ + from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace + + apply_rope_with_cos_sin_cache_inplace( + positions=positions, + query=query, + key=key, + head_size=head_size, + cos_sin_cache=cos_sin_cache, + is_neox=is_neox, + ) + + +def _flashinfer_rotary_embedding_fake( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + return + + +# Register flashinfer rotary embedding custom op +direct_register_custom_op( + op_name="flashinfer_rotary_embedding", + op_func=_flashinfer_rotary_embedding, + mutates_args=["query", "key"], # These tensors are modified in-place + fake_impl=_flashinfer_rotary_embedding_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index 7ac2e4bb6c34f..736ec2c1dd3a3 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -97,15 +97,13 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" assert key is not None + self._match_cos_sin_cache_dtype(query) query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] if self.rotary_dim < self.head_size: query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - if self.cos_sin_cache.device != positions.device: - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( - positions.device) cos_sin = self.cos_sin_cache[torch.add(positions, offsets) if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index 37ead43e22bc4..8717280353068 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -59,7 +59,7 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): key: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: assert key is not None - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) + self._match_cos_sin_cache_dtype(query) query_ = torch.view_as_complex(query.float().reshape( *query.shape[:-1], -1, 2)) key_ = torch.view_as_complex(key.float().reshape( diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index ccc59bbbe233f..17d04a1ad715c 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -245,6 +245,7 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None + self._match_cos_sin_cache_dtype(query) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) @@ -293,6 +294,7 @@ class MRotaryEmbedding(RotaryEmbedding): assert positions.ndim == 1 or positions.ndim == 2 assert key is not None + self._match_cos_sin_cache_dtype(query) num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) From b1a63d1b3be996babec6411e3abe559796f76ca9 Mon Sep 17 00:00:00 2001 From: nvjullin <jullin@nvidia.com> Date: Sat, 
20 Sep 2025 04:36:34 +0800 Subject: [PATCH 484/584] [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040) Signed-off-by: Julien Lin <jullin@nvidia.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- vllm/v1/attention/backends/flashinfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dda6dd4fbea7a..cb092aa74e7f1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): kv_data_type=self.kv_cache_dtype, ) else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( + self.device, non_blocking=True) attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) + self.device, non_blocking=True) if num_decodes > 0: pure_decode = num_prefills == 0 From ddc9048394ae6294d0db7fd67270efea59c3a065 Mon Sep 17 00:00:00 2001 From: David-Wen <18927700430@163.com> Date: Sat, 20 Sep 2025 04:44:24 +0800 Subject: [PATCH 485/584] Fix: Correct FusedMoE layer reference in auto_round quantization (#24818) Signed-off-by: David-Wen <18927700430@163.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 1ca92273430dd..bf5141fa48943 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin, layer.moe) + return AWQMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -327,7 +327,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) From e69e0b8b5fc5ef2958f7b1fc159119e9c4c0e2d2 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:40:16 -0400 Subject: [PATCH 486/584] [Frontend] Responses API messages out, just harmony for now (#24985) Signed-off-by: Alec Solder <alecs@fb.com> Co-authored-by: Alec Solder <alecs@fb.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> --- .../openai/test_response_api_with_harmony.py | 15 +++++++++++++++ vllm/entrypoints/openai/protocol.py | 17 ++++++++++++++++- vllm/entrypoints/openai/serving_responses.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 40a22c04b08a5..f3c3148577b85 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -744,3 +744,18 @@ async 
def test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2 is not None assert response_2.status == "completed" assert response_2.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_output_messages_enabled(client: OpenAI, model_name: str, + server): + response = await client.responses.create( + model=model_name, + input="What is the capital of South Korea?", + extra_body={"enable_response_messages": True}) + + assert response is not None + assert response.status == "completed" + assert len(response.input_messages) > 0 + assert len(response.output_messages) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 05d5d6d964dd3..c30681318f693 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel): "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " "to 256 bit). Not supported by vLLM engine V0.")) + + enable_response_messages: bool = Field( + default=False, + description=( + "Dictates whether or not to return messages as part of the " + "response object. Currently supported only for non-streaming, " + "non-background gpt-oss requests.")) # --8<-- [end:responses-extra-params] _DEFAULT_SAMPLING_PARAMS = { @@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel): model: str object: Literal["response"] = "response" output: list[ResponseOutputItem] + # These are populated when enable_response_messages is set to True + # TODO: currently the content of harmony messages is not available + # when these are serialized; only their metadata is available + input_messages: Optional[list[ChatCompletionMessageParam]] = None + output_messages: Optional[list[ChatCompletionMessageParam]] = None parallel_tool_calls: bool temperature: float tool_choice: ToolChoice @@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel): output: list[ResponseOutputItem], status: ResponseStatus, usage: Optional[ResponseUsage] = None, + input_messages: Optional[list[ChatCompletionMessageParam]] = None, + output_messages: Optional[list[ChatCompletionMessageParam]] = None, ) -> "ResponsesResponse": incomplete_details: Optional[IncompleteDetails] = None @@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel): # TODO: implement the other reason for incomplete_details, # which is content_filter # incomplete_details = IncompleteDetails(reason='content_filter') - return cls( id=request.request_id, created_at=created_time, @@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel): metadata=request.metadata, model=model_name, output=output, + input_messages=input_messages, + output_messages=output_messages, parallel_tool_calls=request.parallel_tool_calls, temperature=sampling_params.temperature, tool_choice=request.tool_choice, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4894623aeac28..6e243671af242 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing): # "completed" is implemented as the "catch-all" for now. 
status: ResponseStatus = "completed" + input_messages = None + output_messages = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) + if request.enable_response_messages: + input_messages = context.messages[:context.num_init_messages] + output_messages = context.messages[context.num_init_messages:] num_tool_output_tokens = context.num_tool_output_tokens if len(output) > 0: if context.finish_reason == "length": @@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing): output = self._make_response_output_items(request, final_output, tokenizer) + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently" + " only supported for gpt-oss") # Calculate usage. assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 @@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing): response = ResponsesResponse.from_request( request, sampling_params, + input_messages=input_messages, + output_messages=output_messages, model_name=model_name, created_time=created_time, output=output, From 711e912946d23f4ccc1f554b1524c960553c5e28 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:23:19 -0400 Subject: [PATCH 487/584] [Compile] Fix Compile Warning for Ignoring `MIN_BLOCK_PER_SM` (#25193) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- csrc/launch_bounds_utils.h | 38 +++++++++++++++++++ .../activation_nvfp4_quant_fusion_kernels.cu | 6 ++- csrc/quantization/fp4/nvfp4_experts_quant.cu | 10 +++-- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 8 ++-- 4 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 csrc/launch_bounds_utils.h diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h new file mode 100644 index 0000000000000..d5a89690111bc --- /dev/null +++ b/csrc/launch_bounds_utils.h @@ -0,0 +1,38 @@ +#pragma once + +#include <cuda_runtime_api.h> +#include <algorithm> + +// maximum blocks per SM cap +#ifndef VLLM_LAUNCH_BLOCKS_CAP + #define VLLM_LAUNCH_BLOCKS_CAP 4 +#endif + +// compile-time estimate of max threads per SM for launch bounds. +#ifndef VLLM_MAX_THREADS_PER_SM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + #define VLLM_MAX_THREADS_PER_SM 1536 + #else + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif +#endif + +// compute the number of blocks per SM to request in __launch_bounds__ +#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL)) +#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \ + (((VAL) <= 0) \ + ? 1 \ + : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP)) +#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \ + VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS)) + +// runtime helper to compute blocks/SM +static inline int vllm_runtime_blocks_per_sm(int block_threads) { + int device = -1; + cudaGetDevice(&device); + int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM; + cudaDeviceGetAttribute(&max_threads_per_sm, + cudaDevAttrMaxThreadsPerMultiProcessor, device); + int blocks = (block_threads > 0) ? 
(max_threads_per_sm / block_threads) : 1; + return VLLM_CLAMP_BLOCKS_PER_SM(blocks); +} diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 74fde23782ce5..7539f836ecf37 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -26,6 +26,7 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { @@ -63,7 +64,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec, // Use UE4M3 by default. template <class Type, bool UE8M0_SF = false> -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { @@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); - int const numBlocksPerSM = 2048 / block.x; + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast<int>(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES( diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index ce3ba2c19b9eb..6d385e0dd94e7 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -26,12 +26,13 @@ #include "dispatch_utils.h" #include "nvfp4_utils.cuh" +#include "launch_bounds_utils.h" namespace vllm { // Use UE4M3 by default. template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false> -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4) // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false> -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input, int const workSizePerRow = k / ELTS_PER_THREAD; int const totalWorkSize = m_topk * workSizePerRow; dim3 block(std::min(workSizePerRow, 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). 
- int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast<int>(block.x)); dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x), multiProcessorCount * numBlocksPerSM)); while (grid.x <= multiProcessorCount && block.x > 64) { diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 0c1b9ef0664d7..5575ee8e4197e 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -26,13 +26,14 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { // Use UE4M3 by default. template <class Type, bool UE8M0_SF = false> -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec<Type>; @@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, // Grid, Block size. // Each thread converts 8 values. dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast<int>(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); // Launch the cvt kernel. From 431535b522c62f1422848e22dfd83bec2d04111a Mon Sep 17 00:00:00 2001 From: Zhiyu <zhiyuc@nvidia.com> Date: Fri, 19 Sep 2025 15:40:33 -0700 Subject: [PATCH 488/584] Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771) Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> --- .../moe/test_modular_kernel_combinations.py | 3 +- vllm/compilation/backends.py | 7 ++- vllm/config/model.py | 3 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 4 +- .../layers/quantization/modelopt.py | 53 +++++++++++++------ vllm/model_executor/models/gemma3.py | 16 ++++++ vllm/model_executor/models/siglip.py | 18 ++++++- 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 19c4301bd23d5..1c7e62d7aa4c8 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -11,7 +11,8 @@ import pytest import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.config import VllmConfig, current_platform, set_current_vllm_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3cc0fc3106f5a..d6bdb31a3c630 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,8 +31,11 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( - "2.8.0.dev"): + # Use standalone compile only if requested, 
version is new enough, + # and the symbol actually exists in this PyTorch build. + if (envs.VLLM_USE_STANDALONE_COMPILE + and is_torch_equal_or_newer("2.8.0.dev") + and hasattr(torch._inductor, "standalone_compile")): logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: diff --git a/vllm/config/model.py b/vllm/config/model.py index 21457d3660a23..4e847922b61e6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -964,6 +964,9 @@ class ModelConfig: "modelopt", "modelopt_fp4", "petit_nvfp4", + # Ensure heavy backends are probed last to avoid unnecessary + # imports during override detection (e.g., MXFP4 imports Triton) + "mxfp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 08a9b34a42457..f12d3807517ff 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -20,10 +20,10 @@ if has_triton_kernels(): from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) from triton_kernels.routing import routing - except ModuleNotFoundError: + except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible.") + "version is compatible. Error: %s", e) def triton_kernel_moe_forward( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7eac40825ac33..1083f398a3a20 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -160,6 +160,7 @@ class ModelOptFp8Config(QuantizationConfig): def is_layer_excluded(self, prefix: str) -> bool: """ Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and substring matching. This method handles both regular models and multimodal models that use the language_model prefix. 
For multimodal models, it checks if the @@ -168,11 +169,18 @@ class ModelOptFp8Config(QuantizationConfig): if self.exclude_modules is None: return False - # Check if any excluded module matches the prefix + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Then check substring matching for patterns not caught by exact match for module in self.exclude_modules: - if (module in prefix - or (prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model."))): + # Skip exact matches already handled above + if (module != prefix and + (module in prefix or + (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model.")))): return True return False @@ -180,9 +188,10 @@ class ModelOptFp8Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): @@ -778,22 +787,34 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, - exclude_modules: list[str]) -> bool: + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and pattern matching. + """ + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Check regex pattern matching for patterns not caught by exact match import regex as re - for pattern in exclude_modules: - regex_str = pattern.replace('.', r'\.').replace('*', r'.*') - if re.fullmatch(regex_str, prefix): - return True + for pattern in self.exclude_modules: + # Skip patterns that would be caught by exact matching + if '*' in pattern or '.' 
in pattern: + regex_str = pattern.replace('.', r'\.').replace('*', r'.*') + if re.fullmatch(regex_str, prefix): + return True return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix, self.exclude_modules)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) elif isinstance(layer, Attention): diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1263e3049a14a..7246308d59028 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -446,6 +446,22 @@ class Gemma3Model(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3630f59f53e0a..eb49d6d2c3350 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -506,6 +507,21 @@ class SiglipVisionModel(nn.Module): if layer_idx >= layer_count: continue + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From ee7a66dd9a5ead46f062502af33766f45076f05d Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 
15:59:41 -0700 Subject: [PATCH 489/584] allow disable flashinfer prefill (#25276) Signed-off-by: Lu Fang <fanglu@fb.com> --- vllm/envs.py | 3 +++ vllm/v1/attention/backends/mla/common.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19e2f8635275d..294a0b920fb78 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 @@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": + lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1", "VLLM_DO_NOT_TRACK": lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( "DO_NOT_TRACK", None) or "0") == "1", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a990cb2f1a972..5b307810de930 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available + and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100)) From 14c1432789c9c1b66308481b2c37439d3ee6661a Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Fri, 19 Sep 2025 16:34:07 -0700 Subject: [PATCH 490/584] [BugFix] Fix async scheduling CPU tensor race take 2 (#25279) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/v1/worker/gpu_model_runner.py | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d0f26266f0c5..3539f75612050 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + elif self.enable_prompt_embeds and get_pp_group().is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. # TODO(qthequartermasterman): Since even when prompt embeds are @@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): invalid_req_indices, ) + @contextmanager + def synchronize_input_prep(self): + if self.prepare_inputs_event is None: + yield + return + + # Ensure prior step has finished with reused CPU tensors. + # This is required in the async scheduling case because + # the CPU->GPU transfer happens async. 
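+ # For example: the non-blocking CPU->GPU copies launched for step N + # may still be reading these reused CPU buffers while step N+1 begins + # overwriting them; waiting on the event recorded at the end of input + # prep closes that window.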
+ self.prepare_inputs_event.synchronize() + try: + yield + finally: + self.prepare_inputs_event.record() + @torch.inference_mode() def execute_model( self, @@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: with record_function_or_nullcontext("Preprocess"): - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, - self.vllm_config) - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect logprobs for " - "prompt tokens, tokens, please disable it when the requests" - " need prompt logprobs") + with self.synchronize_input_prep(): + # Update persistent batch states. + self._update_states(scheduler_output) + + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward( + scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, please disable " + "it when the requests need prompt logprobs") - if self.prepare_inputs_event is not None: - # Ensure prior step has finished with reused CPU tensors. - self.prepare_inputs_event.synchronize() - try: # Prepare the decoder inputs. 
(attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, max_query_len, ubatch_slices, num_tokens_after_padding ) = self._prepare_inputs(scheduler_output) - finally: - if self.prepare_inputs_event is not None: - self.prepare_inputs_event.record() - ( num_scheduled_tokens, num_input_tokens, From 3da17c2cc2c2e1d750020e033535f942f156f64c Mon Sep 17 00:00:00 2001 From: Lucas Kabela <lucasakabela@gmail.com> Date: Fri, 19 Sep 2025 17:27:21 -0700 Subject: [PATCH 491/584] [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090) Signed-off-by: Lucas Kabela <lucaskabela@meta.com> --- tests/compile/test_basic_correctness.py | 16 ++-------------- tests/compile/test_full_graph.py | 4 +--- vllm/compilation/wrapper.py | 10 ++++------ vllm/envs.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 4 +--- vllm/worker/model_runner.py | 8 +++----- 6 files changed, 11 insertions(+), 36 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index fd2b1866e62e1..a1e5127ebeeb2 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -20,7 +20,6 @@ class TestSetting: tp_size: int attn_backend: str method: str - fullgraph: bool # we cannot afford testing the full Cartesian product @@ -36,7 +35,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # llama model with quantization TestSetting( @@ -46,7 +44,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # MoE model TestSetting( @@ -56,7 +53,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # embedding model TestSetting( @@ -73,7 +69,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), TestSetting( model="BAAI/bge-base-en-v1.5", @@ -82,7 +77,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), # vision language model TestSetting( @@ -92,7 +86,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate_with_image", - fullgraph=False, ), ], ) @@ -109,9 +102,8 @@ def test_compile_correctness( tp_size = test_setting.tp_size attn_backend = test_setting.attn_backend method = test_setting.method - fullgraph = test_setting.fullgraph - if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + if cuda_device_count_stateless() < pp_size * tp_size: + pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got " f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: @@ -149,9 +141,5 @@ def test_compile_correctness( ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 84178344a5f36..3439a1b29038d 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -79,9 +79,7 @@ def test_full_graph( ): model, model_kwargs = model_info - with monkeypatch.context() as m: - # make sure these models can be captured in full graph mode - m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + with monkeypatch.context(): 
print(f"MODEL={model}") run_model(optimization_level, model, model_kwargs) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 96d4eae2ee9aa..930e4d27b410f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -10,7 +10,6 @@ from typing import Callable, Optional import torch -import vllm.envs as envs from vllm.config import (CompilationLevel, CUDAGraphMode, get_current_vllm_config) from vllm.logger import init_logger @@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher: options = get_current_vllm_config( ).compilation_config.inductor_compile_config - compiled_callable = torch.compile( - self.forward, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend, - options=options) + compiled_callable = torch.compile(self.forward, + fullgraph=True, + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/envs.py b/vllm/envs.py index 294a0b920fb78..3991a789d80f6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - # Internal flag to enable Dynamo fullgraph capture - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3539f75612050..dffadd1d769b7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model.compile( - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model.compile(fullgraph=True, backend=backend) return # for other compilation levels, cudagraph behavior is controlled by # CudagraphWraper and CudagraphDispatcher of vllm. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c91c871766cff..f662f5a85eff6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ import torch.distributed import torch.nn as nn from tqdm.auto import tqdm -import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState @@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile( - self.model, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model = torch.compile(self.model, + fullgraph=True, + backend=backend) def get_model(self) -> nn.Module: return self.model From a36c675817867235d368faf7e8d81e0ed3333d9c Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser <mbayser@br.ibm.com> Date: Fri, 19 Sep 2025 21:33:25 -0300 Subject: [PATCH 492/584] Don't skip special tokens with hermes-style tool calling (#25281) Signed-off-by: Max de Bayser <mbayser@br.ibm.com> --- .../openai/tool_parsers/hermes_tool_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index a6ce33af6bd00..e74c420da1d3c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -98,6 +98,15 @@ class Hermes2ProToolParser(ToolParser): else: return delta_text + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because the tool_call tokens are + # marked "special" in some models; if they are skipped, they + # never reach the tool parser and tool calling breaks. 
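+ # e.g. Hermes-style models wrap each call in <tool_call>...</tool_call>; + # stripping those markers during detokenization leaves the parser + # nothing to match on.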
+ request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, From c7e713616a53a097809609d5a7b536e8bfad4ab8 Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Fri, 19 Sep 2025 19:33:40 -0500 Subject: [PATCH 493/584] test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291) Signed-off-by: Andrew Sansom <andrew@protopia.ai> --- tests/entrypoints/openai/test_completion_with_prompt_embeds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 0e3fc82f0c033..176c1825530e4 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,9 +14,6 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer -pytest.skip("Skipping prompt_embeds test until V1 supports it.", - allow_module_level=True) - # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" From b8a287a0a8a035073d67b5101687e3a753dd02ac Mon Sep 17 00:00:00 2001 From: Andrew Sansom <andrew@protopia.ai> Date: Fri, 19 Sep 2025 19:46:23 -0500 Subject: [PATCH 494/584] [docs] Prompt Embedding feature support (#25288) Signed-off-by: Andrew Sansom <andrew@protopia.ai> --- docs/features/README.md | 34 ++++++++++++++++++---------------- docs/features/prompt_embeds.md | 3 --- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/features/README.md b/docs/features/README.md index d8e26ec02aecc..10cc448cc2ee3 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -36,22 +36,23 @@ th:not(:first-child) { } </style> -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | -| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] | 
[APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | +| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | +| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | +| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | +| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | +| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | +| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | +| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | | +| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ | \* Chunked prefill and prefix caching are only applicable to last-token pooling. <sup>^</sup> LoRA is only applicable to the language backbone of multimodal models. @@ -76,3 +77,4 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) | diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 83993bd0140fa..f9d3c1fb6c23d 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -!!! note - Prompt embeddings are currently only supported in the v0 engine. 
- ## Offline Inference To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: From 8945b001db3202f882108e50d16b6f9c5e6f01ed Mon Sep 17 00:00:00 2001 From: Boyuan Feng <boyuan@meta.com> Date: Fri, 19 Sep 2025 18:02:15 -0700 Subject: [PATCH 495/584] [torch.compile] CUDAGraph Inductor partition integration (#24281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Boyuan Feng <boyuan@meta.com> Signed-off-by: Boyuan Feng <fby.1994@gmail.com> Signed-off-by: boyuanfeng <boyuan@meta.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- tests/compile/piecewise/test_simple.py | 71 ++++++++++++++++++---- tests/compile/silly_attention.py | 1 + tests/compile/test_full_graph.py | 59 +++++++++++++++++- tests/compile/test_fusion_attn.py | 16 ++++- vllm/attention/layer.py | 2 + vllm/compilation/backends.py | 10 ++- vllm/compilation/decorators.py | 57 ++++++++++++++++- vllm/config/compilation.py | 84 ++++++++++++++++++++++---- vllm/v1/cudagraph_dispatcher.py | 12 ++-- 9 files changed, 280 insertions(+), 32 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 84f4945c82725..41055f431569c 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -50,16 +51,21 @@ class SillyModel(nn.Module): return x -@pytest.mark.parametrize("use_inductor", [True, False]) -@torch.inference_mode() -def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 - +def _run_simple_model( + splitting_ops, + use_inductor_graph_partition, + use_inductor, + expected_num_piecewise_graphs_seen, + expected_num_piecewise_capturable_graphs_seen, + expected_num_backend_compilations, + expected_num_cudagraph_captured, +): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, use_inductor=use_inductor, - splitting_ops=["silly.attention"], + splitting_ops=splitting_ops, + use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], )) @@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor): with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen= + expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ), set_forward_context(None, vllm_config=vllm_config): # background context # warm up with background context @@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor): output = model(input) assert get_global_counter() == 2 assert torch.allclose(output.cpu(), torch.tensor([19.0, 
19.0])) + + +@pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() +def test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=["silly.attention"], + use_inductor_graph_partition=False, + use_inductor=use_inductor, + expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + expected_num_backend_compilations= + 3, # num_piecewise_capturable_graphs_seen + expected_num_cudagraph_captured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ) + + +@torch.inference_mode() +@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) +def test_simple_inductor_graph_partition(splitting_ops): + assert VLLM_USE_V1 + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + _run_simple_model( + # inductor graph partition automatically resets splitting_ops + # to be an empty list + splitting_ops=splitting_ops, + use_inductor_graph_partition=True, + use_inductor=True, + expected_num_piecewise_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_piecewise_capturable_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_backend_compilations= + 1, # since not splitting at fx graph level + expected_num_cudagraph_captured= + 6, # inductor graph partition still captures 6 + # graph, same as fx graph partition. + ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 13eb0bf4b1fa1..baedafbae99f1 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -60,4 +60,5 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3439a1b29038d..870aa553ca628 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -3,6 +3,7 @@ from __future__ import annotations +import logging import tempfile from typing import Any, Optional, Union @@ -10,9 +11,13 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported +from tests.v1.attention.utils import _Backend from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel, PassConfig +from vllm.attention.selector import global_force_attn_backend_context_manager +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + PassConfig) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from ..utils import create_new_process_for_each_test @@ -105,6 +110,18 @@ def test_full_graph( (CompilationConfig(level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()), ("facebook/opt-125m", {})), + ] + [ + # graph inductor partition + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + # inductor graph partition uses + # torch._C.Tag.cudagraph_unsafe to specify splitting ops + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2]), + model) for model in models_list(all=False) + if is_torch_equal_or_newer("2.9.0.dev") ]) # only test some of the models @create_new_process_for_each_test() @@ -112,11 +129,51 @@ def test_custom_compile_config( compilation_config: CompilationConfig, model_info: tuple[str, dict[str, Any]], ): + if 
(compilation_config.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + model, model_kwargs = model_info print(f"MODEL={model}") run_model(compilation_config, model, model_kwargs) +def test_inductor_graph_partition_attn_fusion(caplog_vllm): + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + custom_ops=["+quant_fp8"], + pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + ) + model_kwargs = { + "kv_cache_dtype": "fp8", + "max_model_len": 1024, + } + with caplog_vllm.at_level( + logging.DEBUG), global_force_attn_backend_context_manager( + _Backend.FLASHINFER): + run_model(compilation_config, model, model_kwargs) + + try: + assert ("Fused quantization onto 48 attention nodes" + in caplog_vllm.text), caplog_vllm.text + except AssertionError: + # Note: this message is only triggered when the compilation goes + # through the custom pass. Due to multiple layers of cache on + # PyTorch side, the compilation of a graph may be cached such + # that custom pass directly goes through cache. In this case, + # we go through this branch and assert that the pass is not + # triggered. + assert "Fused quantization" not in caplog_vllm.text + + def run_model(compile_config: Union[int, CompilationConfig], model: str, model_kwargs: dict[str, Any]): prompts = [ diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 6baf4bf83f499..022f183b31932 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() @@ -339,6 +340,10 @@ else: @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) +# TODO(boyuan): test inductor graph partition on rocm +@pytest.mark.parametrize( + "use_inductor_graph_partition", + [False] if current_platform.is_rocm() else [False, True]) @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") @@ -352,9 +357,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], backend: _Backend, split_attention: bool, - monkeypatch, dist_init): + use_inductor_graph_partition: bool, + monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" + if use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + monkeypatch.setenv("VLLM_USE_V1", "1") if split_attention: monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1") @@ -372,6 +383,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, 
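# (use_inductor_graph_partition is toggled by the parametrization # above, so this one config is exercised with both FX-level and # Inductor-level partitioning)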
custom_ops=["+quant_fp8"], + use_inductor_graph_partition=use_inductor_graph_partition, ), cache_config=CacheConfig(cache_dtype="fp8")) @@ -444,6 +456,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, backend=test_backend, fullgraph=True) assert model_compiled.attn._o_scale_float is None + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: @@ -453,6 +466,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # _o_scale_float assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8d5ebd93e063d..3d1269c0ecea8 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -577,6 +577,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) @@ -627,4 +628,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d6bdb31a3c630..17fc727b8fc70 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,6 +329,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time + compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -339,7 +340,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): num_graphs=len(self.compile_submod_names), runtime_shape=None) # Lazy import here to avoid circular import - from .cuda_graph import CUDAGraphOptions from .cuda_piecewise_backend import PiecewiseBackend piecewise_backend = PiecewiseBackend( @@ -347,7 +347,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and + not self.compilation_config.use_inductor_graph_partition): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. + from .cuda_graph import CUDAGraphOptions + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper # class) as platform dependent. 
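# (Note: when use_inductor_graph_partition=True this Dynamo-level FX # wrapping is skipped entirely; each Inductor-generated partition is # wrapped instead via the maybe_use_cudagraph_partition_wrapper hook # added in vllm/compilation/decorators.py below.)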
static_graph_wrapper_class = resolve_obj_by_qualname( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 41d9fcb824b01..b7a6e23c1aa79 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import inspect from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch @@ -14,7 +15,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.utils import supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -301,8 +302,11 @@ def _support_torch_compile( with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches): + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config): output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can @@ -314,3 +318,52 @@ def _support_torch_compile( cls.__call__ = __call__ return cls + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering, and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3618f472e742d..22b38daf46c39 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -299,6 +299,26 @@ class CompilationConfig: minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. 
""" + use_inductor_graph_partition: bool = False + """Use inductor graph partition to split the graph at cudagraph_unsafe ops. + This partition happens at inductor codegen time after all passes and fusions + are finished. It generates a single `call` function which wraps + cudagraph-safe ops into partition functions and leaves cudagraph-unsafe ops + outside the partition functions. For a graph with N cudagraph-unsafe ops + (e.g., Attention), there would be N+1 partitions. To mark an op as + cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe, )` when + registering the custom op. + + This config supports both full cudagraph and piecewise cudagraph without + compiling twice. For piecewise cudagraph, it applies the vLLM CUDAGraph + wrapper to each partition. For N+1 partitions, there would be N+1 + CUDAGraph wrapper instances. + + For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the + inductor `call` function in the model runner. The top-level full cudagraph + capture ignores all partitioning. + """ + + pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -461,6 +481,12 @@ class CompilationConfig: "since full_cuda_graph is deprecated.") self.cudagraph_mode = CUDAGraphMode.FULL + if (self.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + raise ValueError("use_inductor_graph_partition is only " + "supported with torch>=2.9.0.dev. Set " + "use_inductor_graph_partition=False instead.") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -540,19 +566,36 @@ class CompilationConfig: "set_splitting_ops_for_v1 should only be called when " "level is CompilationLevel.PIECEWISE") + use_inductor_graph_partition_msg = ( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture the - # full cudagraph outside the fx graph. This reduces some cpu - # overhead when the runtime batch_size is not cudagraph captured. - # see https://github.com/vllm-project/vllm/pull/20059 for details. - # make a copy to avoid mutating the class-level list via reference. - self.splitting_ops = list(self._attention_ops) + if self.use_inductor_graph_partition: + # When using inductor graph partition, we set splitting_ops + # to be empty and rely on torch._C.Tag.cudagraph_unsafe to + # annotate custom ops as splitting ops. + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] + else: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. make a copy to avoid mutating the class-level + # list via reference. 
+ self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty " - "splitting_ops.") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "Using piecewise compilation with empty " + "splitting_ops and use_inductor_graph_partition" + f"={self.use_inductor_graph_partition}.") + if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE + and not self.use_inductor_graph_partition): logger.warning_once( "When compilation level is piecewise with empty " "splitting_ops, PIECEWISE cudagraph_mode will be " @@ -562,7 +605,26 @@ class CompilationConfig: "any problems.") self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + elif self.use_inductor_graph_partition: + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) + + def is_attention_compiled_piecewise(self) -> bool: + use_fx_graph_piecewise_compilation = ( + self.level == CompilationLevel.PIECEWISE + and self.splitting_ops_contain_attention()) + + inductor_used = (self.level == CompilationLevel.PIECEWISE + and self.use_inductor) or ( + self.level >= CompilationLevel.DYNAMO_AS_IS + and self.backend == "inductor") + use_inductor_piecewise_compilation = ( + inductor_used and self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) + + return use_fx_graph_piecewise_compilation or \ + use_inductor_piecewise_compilation diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index d2db7dcb3f091..ea4fba8eeea6d 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor from vllm.logger import init_logger @@ -39,11 +39,15 @@ class CudagraphDispatcher: CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ + not_use_piecewise_compilation = ( + not self.cudagraph_mode.requires_piecewise_compilation()) + + assert not_use_piecewise_compilation or \ + self.compilation_config.is_attention_compiled_piecewise(), \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ + "and attention should be in splitting_ops or "\ + "inductor splitting should be used. 
" \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From a25ade5d473fc00107bd3950141d8211331d3377 Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Fri, 19 Sep 2025 18:06:34 -0700 Subject: [PATCH 496/584] [BugFix] Ensure appropriate guards in destructors (#25284) Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/compilation/collective_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++++---- vllm/executor/executor_base.py | 3 --- vllm/v1/worker/gpu_worker.py | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 71274420c3426..0658b59a2e215 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1183,7 +1183,7 @@ class AllReduceFusionPass(VllmInductorPass): self.end_and_log() def __del__(self): - if self.disabled: + if getattr(self, "disabled", True): return if flashinfer_comm is not None: flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff62f60e5a42c..d3a08af088c11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -569,9 +569,10 @@ class NixlConnectorWorker: def __del__(self): """Cleanup background threads on destruction.""" - self._handshake_initiation_executor.shutdown(wait=False) - if self._nixl_handshake_listener_t: - self._nixl_handshake_listener_t.join(timeout=0) + if executor := getattr(self, "_handshake_initiation_executor", None): + executor.shutdown(wait=False) + if listener_t := getattr(self, "_nixl_handshake_listener_t", None): + listener_t.join(timeout=0) @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -1379,4 +1380,4 @@ class NixlKVConnectorStats(KVConnectorStats): # TODO: reduce stats to a single value, calculate latency/throughput return { "num_successful_transfers": self.data["num_successful_transfers"] - } \ No newline at end of file + } diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index d18bef1256af5..42aa8d14a21eb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -235,9 +235,6 @@ class ExecutorBase(ABC): """Shutdown the executor.""" self.collective_rpc("shutdown") - def __del__(self): - self.shutdown() - async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6855526583f04..8b1e1bb8f45ca 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -683,7 +683,8 @@ class Worker(WorkerBase): tensorizer_config=tensorizer_config, ) def shutdown(self) -> None: - self.model_runner.ensure_kv_transfer_shutdown() + if runner := getattr(self, "model_runner", None): + runner.ensure_kv_transfer_shutdown() def init_worker_distributed_environment( From 535d80056b72443e68a96c1e4a1049cd9a85587d Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Fri, 19 Sep 2025 19:02:38 -0700 Subject: [PATCH 497/584] [Misc] Support more collective_rpc return types (#25294) Signed-off-by: Nick Hill <nhill@redhat.com> 
--- tests/v1/engine/test_engine_core_client.py | 203 ++++++++++++++++++++- vllm/v1/serial_utils.py | 60 ++++-- 2 files changed, 246 insertions(+), 17 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 625a3470e8025..992c4e01386e5 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -8,7 +8,7 @@ import time import uuid from dataclasses import dataclass from threading import Thread -from typing import Optional, Union +from typing import Any, Optional, Union from unittest.mock import MagicMock import pytest @@ -331,6 +331,46 @@ def echo_dc( return [val for _ in range(3)] if return_list else val +# Dummy utility function to test dict serialization with custom types. +def echo_dc_dict( + self, + msg: str, + return_dict: bool = False, +) -> Union[MyDataclass, dict[str, MyDataclass]]: + print(f"echo dc dict util function called: {msg}") + val = None if msg is None else MyDataclass(msg) + # Return dict of dataclasses to verify support for returning dicts + # with custom value types. + if return_dict: + return {"key1": val, "key2": val, "key3": val} + else: + return val + + +# Dummy utility function to test nested structures with custom types. +def echo_dc_nested( + self, + msg: str, + structure_type: str = "list_of_dicts", +) -> Any: + print(f"echo dc nested util function called: {msg}, " + f"structure: {structure_type}") + val = None if msg is None else MyDataclass(msg) + + if structure_type == "list_of_dicts": # noqa + # Return list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + return [{"a": val, "b": val}, {"c": val, "d": val}] + elif structure_type == "dict_of_lists": + # Return dict of lists: {"list1": [val, val], "list2": [val, val]} + return {"list1": [val, val], "list2": [val, val]} + elif structure_type == "deep_nested": + # Return deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + return {"outer": [{"inner": [val, val]}, {"inner": [val]}]} + else: + return val + + @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch): @@ -384,6 +424,167 @@ async def test_engine_core_client_util_method_custom_return( client.shutdown() +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_custom_dict_return( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_dict", echo_dc_dict, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + # Test utility method returning custom / non-native data type. 
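+            # (make_client(..., asyncio_mode=True) above returns an + # AsyncMPClient, hence the annotation below.)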
+ core_client: AsyncMPClient = client + + # Test single object return + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", False) + assert isinstance(result, + MyDataclass) and result.message == "testarg3" + + # Test dict return with custom value types + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert isinstance(val, + MyDataclass) and val.message == "testarg3" + + # Test returning dict with None values + result = await core_client.call_utility_async( + "echo_dc_dict", None, True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert val is None + + finally: + client.shutdown() + + +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_nested_structures( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_nested", echo_dc_nested, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + core_client: AsyncMPClient = client + + # Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + result = await core_client.call_utility_async( + "echo_dc_nested", "nested1", "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for i, item in enumerate(result): + assert isinstance(item, dict) + if i == 0: + assert "a" in item and "b" in item + assert isinstance( + item["a"], + MyDataclass) and item["a"].message == "nested1" + assert isinstance( + item["b"], + MyDataclass) and item["b"].message == "nested1" + else: + assert "c" in item and "d" in item + assert isinstance( + item["c"], + MyDataclass) and item["c"].message == "nested1" + assert isinstance( + item["d"], + MyDataclass) and item["d"].message == "nested1" + + # Test dict of lists: {"list1": [val, val], "list2": [val, val]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested2", "dict_of_lists") + assert isinstance(result, dict) and len(result) == 2 + assert "list1" in result and "list2" in result + for key, lst in result.items(): + assert isinstance(lst, list) and len(lst) == 2 + for item in lst: + assert isinstance( + item, MyDataclass) and item.message == "nested2" + + # Test deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested3", "deep_nested") + assert isinstance(result, dict) and "outer" in result + outer_list = result["outer"] + assert isinstance(outer_list, list) and len(outer_list) == 2 + + # First dict in outer list should have "inner" with 2 items + inner_dict1 = outer_list[0] + assert isinstance(inner_dict1, dict) and "inner" in inner_dict1 + inner_list1 = inner_dict1["inner"] + assert isinstance(inner_list1, list) and 
len(inner_list1) == 2 + for item in inner_list1: + assert isinstance(item, + MyDataclass) and item.message == "nested3" + + # Second dict in outer list should have "inner" with 1 item + inner_dict2 = outer_list[1] + assert isinstance(inner_dict2, dict) and "inner" in inner_dict2 + inner_list2 = inner_dict2["inner"] + assert isinstance(inner_list2, list) and len(inner_list2) == 1 + assert isinstance( + inner_list2[0], + MyDataclass) and inner_list2[0].message == "nested3" + + # Test with None values in nested structures + result = await core_client.call_utility_async( + "echo_dc_nested", None, "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for item in result: + assert isinstance(item, dict) + for val in item.values(): + assert val is None + + finally: + client.shutdown() + + @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 50c1470c67edc..c812a2ec6427a 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -7,7 +7,7 @@ import pickle from collections.abc import Sequence from inspect import isclass from types import FunctionType -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import cloudpickle import msgspec @@ -59,6 +59,42 @@ def _typestr(val: Any) -> Optional[tuple[str, str]]: return t.__module__, t.__qualname__ +def _encode_type_info_recursive(obj: Any) -> Any: + """Recursively encode type information for nested structures of + lists/dicts.""" + if obj is None: + return None + if type(obj) is list: + return [_encode_type_info_recursive(item) for item in obj] + if type(obj) is dict: + return {k: _encode_type_info_recursive(v) for k, v in obj.items()} + return _typestr(obj) + + +def _decode_type_info_recursive( + type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], + Any]) -> Any: + """Recursively decode type information for nested structures of + lists/dicts.""" + if type_info is None: + return data + if isinstance(type_info, dict): + assert isinstance(data, dict) + return { + k: _decode_type_info_recursive(type_info[k], data[k], convert_fn) + for k in type_info + } + if isinstance(type_info, list) and ( + # Exclude serialized tensors/numpy arrays. + len(type_info) != 2 or not isinstance(type_info[0], str)): + assert isinstance(data, list) + return [ + _decode_type_info_recursive(ti, d, convert_fn) + for ti, d in zip(type_info, data) + ] + return convert_fn(type_info, data) + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -129,12 +165,10 @@ class MsgpackEncoder: result = obj.result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: return None, result - # Since utility results are not strongly typed, we also encode - # the type (or a list of types in the case it's a list) to - # help with correct msgspec deserialization. - return _typestr(result) if type(result) is not list else [ - _typestr(v) for v in result - ], result + # Since utility results are not strongly typed, we recursively + # encode type information for nested structures of lists/dicts + # to help with correct msgspec deserialization. 
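+        # Illustrative example: result = {"outer": [{"inner": [obj, None]}]} + # is encoded below as the pair + # ({"outer": [{"inner": [(module, qualname), None]}]}, result), + # i.e. a type tree matching the result's shape. On decode, a + # two-element [module, qualname] list is treated as a leaf type tag + # rather than a nested list (see _decode_type_info_recursive above).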
+ return _encode_type_info_recursive(result), result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError(f"Object of type {type(obj)} is not serializable" @@ -288,15 +322,9 @@ class MsgpackDecoder: if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must " "be set to use custom utility result types") - assert isinstance(result_type, list) - if len(result_type) == 2 and isinstance(result_type[0], str): - result = self._convert_result(result_type, result) - else: - assert isinstance(result, list) - result = [ - self._convert_result(rt, r) - for rt, r in zip(result_type, result) - ] + # Use recursive decoding to handle nested structures + result = _decode_type_info_recursive(result_type, result, + self._convert_result) return UtilityResult(result) def _convert_result(self, result_type: Sequence[str], result: Any) -> Any: From c308501cb6a922af8c4183bd65be0094dd73de9a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 20 Sep 2025 04:11:03 +0100 Subject: [PATCH 498/584] Improve weight loading for encoder models in Transformers backend (#25289) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 28 ++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f40a20dee63d7..3bd4d10316ec6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -702,21 +702,45 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + # Handle BERT-like models + "bert": "model", # Add `model.` prefix for base model checkpoints "": "model.", - # Remove `model.` from places it should not be + # Remove `model.` prefix if it was already there "model.model.": "model.", + # Pooling adapters will be adjacent to `model` + "model.pooler": "pooler", "model.score": "score", + # Classifier adapter's classifier layer is renamed to score + "model.classifier": "score", + }, + orig_to_new_suffix={ + # Replace legacy suffixes used for norms + ".gamma": ".weight", + ".beta": ".bias", }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - # Some encoder models have the position_ids buffer in the checkpoint + # After creating a pooling model, `pooler` will be duplicated. + # The one inside `model` comes from the Transformers modelling code. + # The one after `model` is an adapter from vLLM. + # We want to use the adapter so we nullify the original pooler. + if getattr(self.model, "pooler", None) is not None: + self.skip_prefixes.append("pooler.") + self.model.pooler = torch.nn.Identity() + + # Some encoder models have the position_ids buffer in the checkpoint. # vLLM will always pass position_ids as an argument, so we skip loading # the buffer if it exists self.skip_substrs.append("position_ids") + # Some encoder models have the bias of the final classifier layer + # in the checkpoint. 
vLLM does not use this bias, so we skip loading + # it if it exists + self.skip_substrs.append("score.bias") + def create_attention_instances( self, attn_type: AttentionType = AttentionType.DECODER): # TODO(hmellor): Better way to detect encoder models From 36429096171ff8785645c40c662d859dddedd829 Mon Sep 17 00:00:00 2001 From: JartX <sagformas@epdcenter.es> Date: Sat, 20 Sep 2025 05:18:13 +0200 Subject: [PATCH 499/584] [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268) Signed-off-by: JartX <sagformas@epdcenter.es> --- vllm/model_executor/models/qwen3_next.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 98749c160ba4d..ce917f92bd2e5 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -148,9 +148,11 @@ class Qwen3NextSparseMoeBlock(nn.Module): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ - # seems to avoid gate quantization. - # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 - if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + # seems to avoid gate quantization while AutoRound does. + if isinstance( + quant_config, + (GPTQConfig, + GPTQMarlinConfig)) and not quant_config.autoround_version: return None return quant_config From b7f186bbb3101e97bb9ad42b7ffb3cdb4bb590fd Mon Sep 17 00:00:00 2001 From: Nick Hill <nhill@redhat.com> Date: Fri, 19 Sep 2025 21:28:31 -0700 Subject: [PATCH 500/584] [BugFix] Exclude self when checking for port collision (#25286) Signed-off-by: Nick Hill <nhill@redhat.com> --- vllm/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d4013a69e99fe..fd1c0af31269c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -987,8 +987,10 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: if sys.platform.startswith("darwin"): return None + our_pid = os.getpid() for conn in psutil.net_connections(): - if conn.laddr.port == port: + if conn.laddr.port == port and (conn.pid is not None + and conn.pid != our_pid): try: return psutil.Process(conn.pid) except psutil.NoSuchProcess: From 6c5f82e5aa87cd73ce03ce10fc44138f75ee1aea Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" <chendi.xue@intel.com> Date: Fri, 19 Sep 2025 23:41:23 -0500 Subject: [PATCH 501/584] [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298) Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> --- vllm/attention/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 3d1269c0ecea8..544a720524429 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None +try: + tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, ) +except AttributeError: + tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -577,7 +581,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, - tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) @@ -628,5 +632,5 @@ direct_register_custom_op( mutates_args=["output", 
"output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, - tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) From f91480b2d44c263fb600b5cba5b0e6c7a195f742 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Sat, 20 Sep 2025 13:29:54 +0800 Subject: [PATCH 502/584] [Bugfix] fix tool call arguments is empty (#25223) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Co-authored-by: xin.li <xin.li@daocloud.io> --- .../test_completion_with_function_calling.py | 60 +++++++++++++++++++ vllm/entrypoints/chat_utils.py | 8 ++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf42..4355603fcd70b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime from typing import Union import openai # use the official client for correctness check @@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) for o in output: assert o.id is None or o.id == 'functions.get_current_weather:0' + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("arguments", ["{}", '']) +async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str, + arguments: str): + # Step 1: Define a tool that requires no parameters + tools = [{ + "type": "function", + "function": { + "name": "get_current_time", + "description": + "Get the current date and time. 
No parameters needed.", + "parameters": { + "type": "object", + "properties": {}, # No parameters + "required": [] # No required fields + } + } + }] + messages = [{"role": "user", "content": "What time is it now?"}] + # Step 2: Send user message and let model decide whether to call the tool + response = await client.chat.completions.create( + model=model_name, + messages=messages, + tools=tools, + tool_choice="auto" # Let model choose automatically + ) + + # Step 3: Check if model wants to call a tool + message = response.choices[0].message + if message.tool_calls: + # Get the first tool call + tool_call = message.tool_calls[0] + tool_name = tool_call.function.name + # Step 4: Execute the tool locally (no parameters) + if tool_name == "get_current_time": + # Test both empty string and "{}" for no-arg tool calls + tool_call.function.arguments = arguments + messages.append(message) + current_time = datetime.datetime.now() + result = current_time.isoformat() + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": result, + }) + # Step 5: Send tool result back to model to continue conversation + final_response = await client.chat.completions.create( + model=model_name, + messages=messages, + ) + # Output final natural language response + assert final_response.choices[0].message.content is not None + + else: + # No tool called — just print model's direct reply + assert message.content is not None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 00ef39f134653..c2c0ad74ef431 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1450,9 +1450,11 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: and isinstance(message["tool_calls"], list) ): for item in message["tool_calls"]: - item["function"]["arguments"] = json.loads( - item["function"]["arguments"] - ) + # if arguments is None or empty string, set to {} + if content := item["function"].get("arguments"): + item["function"]["arguments"] = json.loads(content) + else: + item["function"]["arguments"] = {} def parse_chat_messages( From c60e6137f0bf2034853919b3a9d705d7e06b93cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 20 Sep 2025 13:30:22 +0800 Subject: [PATCH 503/584] [Optimization] Avoid repeated model architecture conversion for pooling models (#25261) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/config/model.py | 22 +++++++++++++++++++++- vllm/model_executor/model_loader/utils.py | 17 ++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 4e847922b61e6..921322bb475c5 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -322,8 +322,28 @@ class ModelConfig: factors.append(self.override_generation_config) factors.append(self.rope_scaling) factors.append(self.rope_theta) + # hf_config can control how the model looks! 
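# (to_json_string(use_diff=False) can raise TypeError when a nested # sub-config holds values that are not JSON-serializable; the fallback # below stringifies those leaves via json_map_leaves instead)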
- factors.append(self.hf_config.to_json_string()) + try: + hf_config_json = self.hf_config.to_json_string(use_diff=False) + except TypeError: + from transformers import PretrainedConfig + + from vllm.utils.jsontree import json_map_leaves + + # Handle nested HF configs with unserializable values gracefully + hf_config_json = json.dumps( + json_map_leaves( + lambda v: v.to_dict() + if isinstance(v, PretrainedConfig) else str(v), + self.hf_config.to_dict(), + ), + indent=2, + sort_keys=True, + ) + "\n" + + factors.append(hf_config_json) + str_factors = str(factors) assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index bd1773c753a93..e007d431880eb 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -165,7 +165,11 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -def get_model_architecture( +_MODEL_ARCH_BY_HASH = dict[str, tuple[type[nn.Module], str]]() +"""Caches the outputs of `_get_model_architecture`.""" + + +def _get_model_architecture( model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -209,6 +213,17 @@ def get_model_architecture( return model_cls, arch +def get_model_architecture( + model_config: ModelConfig) -> tuple[type[nn.Module], str]: + key = model_config.compute_hash() + if key in _MODEL_ARCH_BY_HASH: + return _MODEL_ARCH_BY_HASH[key] + + model_arch = _get_model_architecture(model_config) + _MODEL_ARCH_BY_HASH[key] = model_arch + return model_arch + + def get_model_cls(model_config: ModelConfig) -> type[nn.Module]: return get_model_architecture(model_config)[0] From 9607d5eb449711b349d4c2bee0a9c94afcc7ed14 Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Fri, 19 Sep 2025 23:43:59 -0700 Subject: [PATCH 504/584] [Hybrid Allocator] Support full attention with different hidden size (#25101) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- tests/v1/core/test_kv_cache_utils.py | 118 +++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 144 ++++++++++++++++++++------- vllm/v1/engine/core.py | 16 ++- vllm/v1/kv_cache_interface.py | 70 +++++++++++++ vllm/v1/worker/gpu_model_runner.py | 65 ++++++------ vllm/v1/worker/utils.py | 3 +- 6 files changed, 324 insertions(+), 92 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 4bf6bbbfeae28..4cb7ed6ce3824 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,12 +18,14 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_configs, get_max_concurrency_for_kv_cache_config, - get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id) + generate_scheduler_kv_cache_config, get_kv_cache_configs, + get_max_concurrency_for_kv_cache_config, get_request_block_hasher, + hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform, + make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + 
UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -927,36 +929,36 @@ def test_merge_kv_cache_spec(): assert merged_layer_spec.sliding_window == 1 -def test_is_kv_cache_type_uniform(): +def test_is_kv_cache_spec_uniform(): kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) @pytest.mark.parametrize( @@ -1286,14 +1288,28 @@ def test_get_kv_cache_config_one_worker(): ], ) - # different hidden size, unimplemented + # different hidden size kv_cache_specs_hybrid = { 'layer_1': new_kv_cache_spec(head_size=128), - 'layer_2': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=64), } - with pytest.raises(NotImplementedError): - get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], - [mem_per_block_per_layer * 2 * 32])[0] + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 3 * 32])[0] + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor(size=mem_per_block_per_layer * 32 * 2, + shared_by=["layer_1"]), + KVCacheTensor(size=mem_per_block_per_layer * 32, + shared_by=["layer_2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1", "layer_2"], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs_hybrid)) + ]) # Test num_gpu_blocks_override vllm_config.cache_config.num_gpu_blocks_override = 16 @@ -1324,3 +1340,75 @@ def test_get_kv_cache_configs_attention_free(): kv_cache_groups=[], ) ] + + +def test_generate_uniform_type_kv_cache_specs(): + # All layers are full attention, can be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # Full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_sliding_window_spec(sliding_window=1), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # different order of full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_kv_cache_spec(), + } + uniform_spec = 
UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # Same-size sliding window, can be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # different block sizes, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(block_size=16), + 'layer_2': new_kv_cache_spec(block_size=32), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + +def test_generate_scheduler_kv_cache_config(): + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + kv_cache_configs = [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs)), + ], + ) + ] + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + assert scheduler_kv_cache_config == KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec()) + ], + ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bc2ec5e42ea20..3ccd00121f8ed 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" +import copy import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence @@ -15,7 +16,8 @@ from vllm.utils import GiB_bytes, cdiv, sha256_cbor from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -750,7 +752,7 @@ def create_kv_cache_group_specs( return kv_cache_groups -def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: +def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ Whether all layers in the given KVCacheSpec have the same KV cache spec. Note that we regard FullAttentionSpec with and without sliding window as @@ -793,6 +795,21 @@ def get_max_concurrency_for_kv_cache_config( return max_concurrency +def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int: + """ + Override the number of kv cache blocks if `num_gpu_blocks_override` is set. 
+ """ + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + return num_blocks + + def get_num_blocks(vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int) -> int: """ @@ -806,13 +823,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int, """ num_blocks = int(available_memory // page_size // num_layers) num_blocks = max(num_blocks, 0) - if vllm_config.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = \ - vllm_config.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) - num_blocks = num_gpu_blocks_override + num_blocks = may_override_num_blocks(vllm_config, num_blocks) return num_blocks @@ -825,11 +836,11 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_groups_uniform_type( +def _get_kv_cache_groups_uniform_spec( kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model with one type of KV cache. - Divide the available memory equally among all layers. + Generates the KV cache configuration for a model with the same KV cache + spec for all layers. Args: kv_cache_specs: The kv cache spec of each attention layer in the model @@ -842,6 +853,22 @@ def _get_kv_cache_groups_uniform_type( [list(kv_cache_specs.keys())]) +def _get_kv_cache_groups_uniform_type( + spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]: + """ + Generates the KV cache configuration for a model with one type of KV cache + but different hidden sizes. All layers are merged into one group. + + Args: + spec: The UniformTypeKVCacheSpecs of the model + + Returns: + The generated KVCacheGroupSpecs + """ + + return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)] + + def is_kv_cache_page_size_uniform( kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ @@ -1000,28 +1027,45 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig, ) # Determine how model runners should initialize the KV cache tensors. - # We will have group_size memory pools, each is shared by one layer from - # each group. As layers of different groups have different block table, - # they will use different parts of the shared Tensor. - # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), - # (sw.1, padding) will be: (group_size = 2) - # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 - # full.1, sw.2: share another Tensor with size=available_memory//2 - group_size = max(len(group.layer_names) for group in kv_cache_groups) + if len(kv_cache_groups) == 1 and \ + isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs): + # Special case: all layers have the same type of KV cache but with + # different hidden size. Allocate different amount of memory for each + # layer based on its hidden size. 
+ num_blocks = available_memory // kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes + num_blocks = may_override_num_blocks(vllm_config, num_blocks) + per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs + kv_cache_tensors = [ + KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes * + num_blocks, + shared_by=[layer_name]) + for layer_name in kv_cache_groups[0].layer_names + ] + else: + # General case: + # We will have group_size memory pools, each is shared by one layer from + # each group. As layers of different groups have different block table, + # they will use different parts of the shared Tensor. + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) - page_size = get_uniform_page_size(kv_cache_specs) - assert group_size > 0, "group_size must be greater than 0" - num_blocks = get_num_blocks(vllm_config, group_size, available_memory, - page_size) - per_memory_pool_size = page_size * num_blocks - kv_cache_tensors = [] - for i in range(group_size): - shared_by = [] - for j in range(len(kv_cache_groups)): - if i < len(kv_cache_groups[j].layer_names): - shared_by.append(kv_cache_groups[j].layer_names[i]) - kv_cache_tensors.append( - KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" + num_blocks = get_num_blocks(vllm_config, group_size, available_memory, + page_size) + kv_cache_tensors = [] + for i in range(group_size): + shared_by = [] + for j in range(len(kv_cache_groups)): + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) + kv_cache_tensors.append( + KVCacheTensor(size=page_size * num_blocks, + shared_by=shared_by)) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, @@ -1059,7 +1103,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): kv_cache_spec: The kv cache spec of each attention layer in the model """ - if is_kv_cache_type_uniform(kv_cache_spec): + if is_kv_cache_spec_uniform(kv_cache_spec): return logger.warning( @@ -1097,7 +1141,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): attention_chunk_size=spec.attention_chunk_size, ) - if not is_kv_cache_type_uniform(kv_cache_spec): + if not is_kv_cache_spec_uniform(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " "convert the KV cache specs to one unified type.") @@ -1122,11 +1166,16 @@ def get_kv_cache_groups( # This returns an empty list to allow for the KVCacheManager to handle # attention free models. return [] - elif is_kv_cache_type_uniform(kv_cache_spec): + elif is_kv_cache_spec_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_groups_uniform_type(kv_cache_spec) + return _get_kv_cache_groups_uniform_spec(kv_cache_spec) + elif uniform_spec := UniformTypeKVCacheSpecs.from_specs(kv_cache_spec): + # All layers need the same number of token slots (e.g., all layers are + # full attention, or all layers are sliding window attention with the + # same window size). Put all layers into one group. 
+ return _get_kv_cache_groups_uniform_type(uniform_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers @@ -1137,6 +1186,27 @@ def get_kv_cache_groups( raise NotImplementedError +def generate_scheduler_kv_cache_config( + kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig: + """ + Generate the KV cache configuration for the scheduler. + """ + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + # All workers have the same kv_cache_config except layer names, so use + # an arbitrary one to initialize the scheduler. + cfg = copy.deepcopy(kv_cache_configs[0]) + for group in cfg.kv_cache_groups: + if isinstance(group.kv_cache_spec, UniformTypeKVCacheSpecs): + # All layers in the UniformTypeKVCacheSpecs have the same type, + # so use an arbitrary one to initialize the scheduler. + group.kv_cache_spec = next( + iter(group.kv_cache_spec.kv_cache_specs.values())) + return cfg + + def get_kv_cache_configs(vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], available_memory: list[int]) -> list[KVCacheConfig]: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a022e9c0d7058..a43042a5510a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,7 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, +from vllm.v1.core.kv_cache_utils import (BlockHash, + generate_scheduler_kv_cache_config, + get_kv_cache_configs, get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface @@ -196,16 +198,10 @@ class EngineCore: kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, available_gpu_memory) - - # All workers have the same kv_cache_config except layer names, so use - # an arbitrary one to initialize the scheduler. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + num_gpu_blocks = scheduler_kv_cache_config.num_blocks num_cpu_blocks = 0 - scheduler_kv_cache_config = kv_cache_configs[0] # Initialize kv cache and warmup the execution self.model_executor.initialize_from_config(kv_cache_configs) diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 0cf92a680a689..f72cc8f93a6c2 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -234,6 +234,76 @@ class CrossAttentionSpec(AttentionSpec): return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes +@dataclass(frozen=True) +class UniformTypeKVCacheSpecs(KVCacheSpec): + """ + A KV cache spec for multiple layers with the same type of attention. Here, + same types means always need the same number of token slots. For example, + sliding window attentions with different window sizes are not the same type + and should not be merged into one UniformTypeKVCacheSpecs. 
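A worked example, with made-up numbers, of the size accounting this class defines just below: the merged page size sums over layers, while the worst-case memory follows the layer that needs the most pages.

def cdiv(a: int, b: int) -> int:
    # Ceiling division, a local stand-in for vLLM's `cdiv` utility.
    return -(-a // b)

page_size = {"layers.0": 4096, "layers.1": 8192}
max_usage_bytes = {"layers.0": 10_000_000, "layers.1": 6_000_000}

combined_page_size = sum(page_size.values())  # 12288
max_num_pages = max(cdiv(max_usage_bytes[n], page_size[n]) for n in page_size)
total_bytes = max_num_pages * combined_page_size  # 2442 * 12288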
+ """ + kv_cache_specs: dict[str, KVCacheSpec] + + @property + def page_size_bytes(self) -> int: + return sum(spec.page_size_bytes + for spec in self.kv_cache_specs.values()) + + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + max_num_pages = max( + cdiv(spec.max_memory_usage_bytes(vllm_config), + spec.page_size_bytes) + for spec in self.kv_cache_specs.values()) + return max_num_pages * self.page_size_bytes + + @classmethod + def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool: + """ + Whether all layers have the same type of KV cache spec. + """ + block_sizes = set(spec.block_size for spec in kv_cache_specs.values()) + if len(block_sizes) > 1: + # Different block sizes, not uniform. + return False + one_spec = next(iter(kv_cache_specs.values())) + if isinstance(one_spec, (FullAttentionSpec, CrossAttentionSpec)): + return all( + isinstance(spec, type(one_spec)) + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, SlidingWindowSpec): + return all( + isinstance(spec, SlidingWindowSpec) + and spec.sliding_window == one_spec.sliding_window + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, ChunkedLocalAttentionSpec): + return all( + isinstance(spec, ChunkedLocalAttentionSpec) + and spec.attention_chunk_size == one_spec.attention_chunk_size + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, MambaSpec): + return all( + isinstance(spec, MambaSpec) and spec.num_speculative_blocks == + one_spec.num_speculative_blocks + for spec in kv_cache_specs.values()) + else: + # NOTE(Chen): Please add new branches for new KV cache spec types. + raise NotImplementedError( + f"Unsupported KV cache spec type: {type(one_spec)}") + + @classmethod + def from_specs(cls, kv_cache_specs: dict[str, + KVCacheSpec]) -> Optional[Self]: + """ + Return a SameTypeKVCacheSpecs object if all layers have the same type + of KV cache spec. Return None if not. 
+ """ + if cls.is_uniform_type(kv_cache_specs): + block_size = next(iter(kv_cache_specs.values())).block_size + return cls(block_size=block_size, kv_cache_specs=kv_cache_specs) + else: + return None + + @dataclass class KVCacheTensor: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dffadd1d769b7..233df8f1b0e9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast import numpy as np import torch @@ -74,7 +74,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + MambaSpec, SlidingWindowSpec, + UniformTypeKVCacheSpecs) # yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsLists, LogprobsTensors, @@ -1187,7 +1188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, num_common_prefix_blocks, - kv_cache_group_spec.kv_cache_spec, + attn_group.kv_cache_spec, builder, ) @@ -3453,12 +3454,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(self.attn_groups) == 0, \ "Attention backends are already initialized" - def get_attn_backends_for_layers( - layer_names: list[str] - ) -> dict[type[AttentionBackend], list[str]]: - layers = get_layers_from_vllm_config(self.vllm_config, - AttentionLayerBase, - layer_names) + class AttentionGroupKey(NamedTuple): + attn_backend: type[AttentionBackend] + kv_cache_spec: KVCacheSpec + + def get_attn_backends_for_group( + kv_cache_group_spec: KVCacheGroupSpec, + ) -> dict[AttentionGroupKey, list[str]]: + layers = get_layers_from_vllm_config( + self.vllm_config, AttentionLayerBase, + kv_cache_group_spec.layer_names) attn_backends = {} attn_backend_layers = defaultdict(list) # Dedupe based on full class name; this is a bit safer than @@ -3466,7 +3471,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # attention backend subclasses (e.g. ChunkedLocalAttention) unless # they are cached correctly, there will be different objects per # layer. 
- for layer_name in layer_names: + for layer_name in kv_cache_group_spec.layer_names: attn_backend = layers[layer_name].get_attn_backend() if layer_name in self.kv_sharing_fast_prefill_eligible_layers: @@ -3475,8 +3480,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_backend, ) - key = attn_backend.full_cls_name() - attn_backends[key] = attn_backend + full_cls_name = attn_backend.full_cls_name() + layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs): + layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[ + layer_name] + key = (full_cls_name, layer_kv_cache_spec) + attn_backends[key] = AttentionGroupKey(attn_backend, + layer_kv_cache_spec) attn_backend_layers[key].append(layer_name) return { attn_backends[k]: v @@ -3484,11 +3495,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): } def create_attn_groups( - attn_backends_map: dict[AttentionBackend, list[str]], - kv_cache_spec: KVCacheSpec, + attn_backends_map: dict[AttentionGroupKey, list[str]], ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] - for attn_backend, layer_names in attn_backends_map.items(): + for (attn_backend, + kv_cache_spec), layer_names in attn_backends_map.items(): attn_metadata_builders = [] attn_metadata_builders.append(attn_backend.get_builder_cls()( kv_cache_spec, @@ -3506,16 +3517,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )) attn_group = AttentionGroup(attn_backend, attn_metadata_builders, - layer_names) + layer_names, kv_cache_spec) attn_groups.append(attn_group) return attn_groups for kv_cache_group_spec in kv_cache_config.kv_cache_groups: - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - attn_backends = get_attn_backends_for_layers( - kv_cache_group_spec.layer_names) - self.attn_groups.append( - create_attn_groups(attn_backends, kv_cache_spec)) + attn_backends = get_attn_backends_for_group(kv_cache_group_spec) + self.attn_groups.append(create_attn_groups(attn_backends)) # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() @@ -3680,14 +3688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _attn_group_iterator(self) -> Iterator[AttentionGroup]: return itertools.chain.from_iterable(self.attn_groups) - def _kv_cache_spec_attn_group_iterator( - self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]: if not self.kv_cache_config.kv_cache_groups: return - for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): - for attn_group in attn_groups: - yield self.kv_cache_config.kv_cache_groups[ - kv_cache_spec_id].kv_cache_spec, attn_group + for attn_groups in self.attn_groups: + yield from attn_groups def _reshape_kv_cache_tensors( self, @@ -3707,7 +3712,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec attn_backend = group.backend for layer_name in group.layer_names: if layer_name in self.runner_only_attn_layers: @@ -3787,7 +3793,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_caches: The KV cache buffer of each layer. 
""" - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec for layer_name in group.layer_names: kv_cache = kv_caches[layer_name] if (isinstance(kv_cache_spec, AttentionSpec) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b76ac633892f3..021d18b2500f0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -15,7 +15,7 @@ from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget -from vllm.v1.kv_cache_interface import KVCacheGroupSpec +from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec if TYPE_CHECKING: from vllm.attention.layer import Attention @@ -132,6 +132,7 @@ class AttentionGroup: backend: type[AttentionBackend] metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] + kv_cache_spec: KVCacheSpec def get_metadata_builder(self, ubatch_id: Optional[int] = None From be874c020196080305baf988ed8c1c82047323be Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Sat, 20 Sep 2025 00:04:05 -0700 Subject: [PATCH 505/584] [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/qwen3_vl_moe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 625f94cf7ad77..7912cf3ea52b0 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -122,9 +122,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): def load_fused_expert_weights(self, name: str, params_dict: dict, loaded_weight: torch.Tensor, shard_id: str, - num_experts: int): + num_experts: int) -> bool: param = params_dict[name] weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False for expert_id in range(num_experts): curr_expert_weight = loaded_weight[expert_id] success = weight_loader(param, @@ -133,9 +134,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): shard_id, expert_id, return_success=True) - if not success: - return False - return True + if success: + loaded_local_expert = True + + return loaded_local_expert def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -345,4 +347,4 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None self.visual_dim = config.vision_config.out_hidden_size - self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file + self.multiscale_dim = self.visual_dim * self.deepstack_num_level From 3d9a1d2de5091455bb2fbf6b21fc9188fd4612a4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 20 Sep 2025 15:14:35 +0800 Subject: [PATCH 506/584] [V1] Support `LLM.apply_model` (#18465) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/conftest.py | 12 +--- tests/kernels/moe/test_mxfp4_moe.py | 37 ++++++---- .../multimodal/generation/test_qwen2_vl.py | 46 ++++++------ tests/models/quantization/test_awq.py | 2 +- tests/quantization/test_compressed_tensors.py | 18 +++-- tests/quantization/test_fp8.py | 8 +-- tests/quantization/test_gptq_dynamic.py | 71 ++++++++++--------- 
tests/quantization/test_lm_head.py | 4 +- tests/quantization/test_modelopt.py | 10 +-- tests/quantization/test_ptpc_fp8.py | 47 +++++++----- tests/quantization/test_quark.py | 26 +++---- .../test_register_quantization_config.py | 17 +++-- vllm/engine/llm_engine.py | 7 +- vllm/entrypoints/llm.py | 9 ++- vllm/executor/executor_base.py | 33 +++++---- vllm/v1/engine/llm_engine.py | 7 +- vllm/worker/worker_base.py | 9 ++- 17 files changed, 194 insertions(+), 169 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3cd93f4ad3289..e8e95357ff5b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -987,17 +987,7 @@ class VllmRunner: return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - if hasattr(self.llm.llm_engine, "model_executor"): - # This works either in V0 or in V1 with - # VLLM_ENABLE_V1_MULTIPROCESSING=0 - executor = self.llm.llm_engine.model_executor - return executor.apply_model(func) - - # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1 - def _apply_model(self): - return func(self.get_model()) - - return self.llm.llm_engine.collective_rpc(_apply_model) + return self.llm.apply_model(func) def get_llm(self) -> LLM: return self.llm diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index a3b8f07638d9a..61d3311cc1624 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -1,21 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib import importlib.metadata from dataclasses import dataclass +from importlib.util import find_spec from typing import Optional import pytest import torch from packaging import version +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW4A4MXFP4) +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkW4A4MXFp4MoEMethod) from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda( ) and current_platform.is_device_capability(100) @@ -39,6 +42,12 @@ class ModelCase: tp: int +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + @pytest.mark.parametrize('model_case', [ ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), @@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): tensor_parallel_size=model_case.tp, load_format="dummy") as llm: - # TODO: llm.apply_model(check_model) currently relies on V0 internals. - # Re-enable this later. 
- # def check_model(model): - # layer = model.model.layers[0] + def check_model(model): + layer = model.model.layers[0] - # qkv_proj = layer.self_attn.qkv_proj + qkv_proj = layer.self_attn.qkv_proj - # assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - # assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) - # assert isinstance(layer.mlp.experts.quant_method, - # QuarkW4A4MXFp4MoEMethod) + assert isinstance(layer.mlp.experts.quant_method, + QuarkW4A4MXFp4MoEMethod) - # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": - # llm.apply_model(check_model) + if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": + llm.apply_model(check_model) output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a81f5e7ec8872..e56f4e4075be4 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,6 +10,7 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video +from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -17,11 +18,9 @@ from ...utils import check_logprobs_close @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - V1 Test: batch_make_xxxxx_embeddings calls a V0 internal - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") models = ["Qwen/Qwen2-VL-2B-Instruct"] @@ -126,9 +125,8 @@ def batch_make_image_embeddings( image_grid_thw_on_device = image_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + grid_thw=image_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -210,7 +208,7 @@ def batch_make_video_embeddings( video_grid_thw_on_device = video_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + grid_thw=video_grid_thw_on_device).cpu() # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -266,19 +264,22 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with vllm_runner(model, - runner="generate", - max_model_len=4000, - max_num_seqs=3, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: + with set_default_torch_num_threads(1): + vllm_model = vllm_runner( + model, + runner="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + with vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -329,9 +330,8 @@ def run_embedding_input_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + size_factors, dtype, max_tokens, + num_logprobs, monkeypatch) -> None: images = [asset.pil_image for asset in image_assets] inputs_per_case: list[tuple[ diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index bd696198931ff..7005e435ecf46 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model, monkeypatch) -> None: # Test V1: this test hangs during setup on single-scale input. - # TODO: fixure out why and re-enable this on V1. + # TODO: figure out why and re-enable this on V1. monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 484f53246f349..b7949a488ad05 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -43,12 +43,9 @@ ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. 
- """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize( @@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs( dtype = "bfloat16" - # skip language translation prompt for the static per tensor asym model - if (model_path == - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" - ): # noqa: E501 + # skip language translation prompt for the static per tensor models + if model_path in ( + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ): example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index d781f462b4ad7..db53061cf2d1a 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index aea50e99c1dd5..00a5946ed0154 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -31,41 +31,46 @@ MODEL_QUANT = [ @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch): - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - - vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
- driver_worker.model_runner.model.named_modules()): - if name == "lm_head": - assert isinstance(submodule.quant_method, linear_method_cls) - elif name == 'model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128 - # desc_act=True - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert config.weight_bits == 4 - assert config.group_size == 128 - assert config.desc_act - elif name == 'model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32 - # desc_act=False - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert get_dynamic_override(config, layer_name=name, - key="bits") == 8 - assert get_dynamic_override(config, - layer_name=name, - key="group_size") == 32 - assert not get_dynamic_override( - config, layer_name=name, key="desc_act") - elif (name == 'model.layers.2.self_attn.qkv_proj' - or name == 'model.layers.2.mlp.gate_up_proj'): - # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule.quant_method, UnquantizedLinearMethod) + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: - del vllm_model + def check_model(model): + for name, submodule in model.named_modules(): + if name == "lm_head": + assert isinstance(submodule.quant_method, + linear_method_cls) + elif name == 'model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128 + # desc_act=True + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32 + # desc_act=False + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert get_dynamic_override(config, + layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): + # All other layers (layer index >= 2) are not quantized + assert isinstance(submodule.quant_method, + UnquantizedLinearMethod) + + llm.apply_model(check_model) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index b24964a9d0a9f..e69d4ad349c38 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -29,8 +29,8 @@ def test_lm_head( lm_head_quantized: bool, monkeypatch, ) -> None: - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
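The shape shared by these converted tests, as a minimal pytest sketch; the model name is a stand-in and `vllm_runner` is the repo's fixture:

def test_quant_method(vllm_runner, monkeypatch):
    # `apply_model` pickles the callable and runs it inside each worker.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    with vllm_runner("facebook/opt-125m", max_model_len=2048) as llm:

        def check_model(model):
            # Inspect layers / quant_method here, on the worker's model.
            assert model is not None

        llm.apply_model(check_model)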
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index c60a03f44baec..e7174be73626a 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -11,16 +11,12 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.skipif(not is_quant_method_supported("modelopt"), diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 5f78bc30504c0..088b68510cffa 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -13,6 +13,16 @@ from vllm.model_executor.layers.quantization.ptpc_fp8 import ( PTPCFp8LinearMethod) from vllm.platforms import current_platform +UNSUPPORTED_STR = ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " + "support output dtype of bfloat16. torch.float16 is specified.") + + +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + @pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"), reason="PTPC FP8 is not supported on this GPU type.") @@ -21,14 +31,22 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - with vllm_runner("facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - kv_cache_dtype=kv_cache_dtype) as llm: + llm = vllm_runner("facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + kv_cache_dtype=kv_cache_dtype) + except AssertionError as e: + if str(e) == UNSUPPORTED_STR: + # If the error message matches, the test passes + return + else: + # If the error message does not match, re-raise the exception + raise - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + with llm: + + def check_model(model): fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) if kv_cache_dtype == "ptpc_fp8": @@ -40,17 +58,8 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: if current_platform.has_device_capability(94): # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fnuz - else: - pytest.skip() - output = llm.generate_greedy("Hello my name is", max_tokens=20) - assert output - except AssertionError as e: - if str( - e - ) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. 
torch.float16 is specified.": # noqa: E501 - # If the error message matches, the test passes - pass - else: - # If the error message does not match, re-raise the exception - raise + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index c09931971e6fb..930f4acb328fd 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -7,10 +7,10 @@ Run `pytest tests/quantization/test_quark.py`. See also `tests/kernels/moe/test_mxfp4_moe.py`. """ -import importlib import importlib.metadata import os from dataclasses import dataclass +from importlib.util import find_spec import huggingface_hub import lm_eval @@ -24,9 +24,8 @@ from vllm.platforms import current_platform from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') if QUARK_MXFP4_AVAILABLE: from quark.torch.export.nn.modules.realquantizer import ( @@ -43,11 +42,9 @@ except huggingface_hub.errors.RepositoryNotFoundError: @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8']) @@ -132,13 +129,12 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.llm.llm_engine.model_executor. - driver_worker.model_runner.model) - quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. - model_runner.model) - fp8_state_dict = fp8_model.state_dict() + def get_state_dict(model): + return {k: v.cpu() for k, v in model.state_dict().items()} + + quark_state_dict, = quark_handle.apply_model(get_state_dict) + fp8_state_dict, = fp8_handle.apply_model(get_state_dict) assert fp8_state_dict.keys() == quark_state_dict.keys() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 84705e92c85bb..03fe59d7e3bff 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -105,18 +105,21 @@ def test_register_quantization_config(): ]) def test_custom_quant(vllm_runner, model, monkeypatch): """Test infer with the custom quantization method.""" - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_name=model, quantization="custom_quant", enforce_eager=True) as llm: - model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj - # Check the quantization method is FakeQuantLinearMethod - assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 708f3bbeeff15..014bc56bc8ece 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,6 +13,7 @@ from typing import Sequence as GenericSequence from typing import Set, Type, Union, cast import torch +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -55,6 +56,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind from vllm.version import __version__ as VLLM_VERSION from vllm.worker.model_runner_base import InputProcessingError +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -1817,13 +1819,16 @@ class LLMEngine: return sampling_params def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e21bfce0ab085..f2282c40f7073 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -522,9 +522,14 @@ class LLM: """ Run a function directly on the model inside each worker, returning the result for each of them. + + !!! warning + To reduce the overhead of data transfer, avoid returning large + arrays or tensors from this method. If you must return them, + make sure you move them to CPU first to avoid taking up additional + VRAM! 
""" - executor = self.llm_engine.model_executor - return executor.apply_model(func) + return self.llm_engine.apply_model(func) def _get_beam_search_lora_requests( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 42aa8d14a21eb..b75b94ad0acc2 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -5,11 +5,10 @@ import asyncio import time from abc import ABC, abstractmethod from functools import cached_property -from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, - Union) +from typing import Any, Awaitable, Callable, List, Optional, Set, Union import torch.nn as nn -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.platforms from vllm.config import VllmConfig @@ -63,10 +62,10 @@ class ExecutorBase(ABC): @abstractmethod def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -91,7 +90,7 @@ class ExecutorBase(ABC): """ raise NotImplementedError - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. @@ -99,9 +98,10 @@ class ExecutorBase(ABC): ExecutorBase may require modification of the result, e.g. to ensure the selected cache sizes are compatible with all workers. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where + `num_gpu_blocks` are blocks that are "active" on the device and can be + appended to. + `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be appended to. """ results = self.collective_rpc("determine_num_available_blocks") @@ -127,16 +127,15 @@ class ExecutorBase(ABC): self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + @deprecated("`llm_engine.model_executor.apply_model` will no longer work " + "in V1 Engine. Please replace with `llm_engine.apply_model` " + "and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.") def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ Run a function directly on the model inside each worker, returning the result for each of them. 
""" - - def rpc_func(worker: WorkerBase) -> _R: - return func(worker.get_model()) - - return self.collective_rpc(rpc_func) + return self.collective_rpc("apply_model", args=(func, )) @cached_property # Avoid unnecessary RPC calls def supported_tasks(self) -> tuple[SupportedTask, ...]: @@ -308,8 +307,8 @@ class DistributedExecutorBase(ExecutorBase): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c93bfc35f0aeb..907656d1b24cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -5,6 +5,7 @@ from collections.abc import Mapping from copy import copy from typing import Any, Callable, Optional, Union +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -33,6 +34,7 @@ from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, StatLoggerFactory) from vllm.v1.metrics.reader import Metric, get_metrics_snapshot from vllm.v1.metrics.stats import IterationStats +from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -319,12 +321,15 @@ class LLMEngine: return self.engine_core.pin_lora(lora_id) def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.engine_core.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + def __del__(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index aa76d21f0fcaa..d0a56f6ff4637 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -5,7 +5,8 @@ import dataclasses import os import time from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import cloudpickle import torch @@ -28,6 +29,8 @@ from vllm.worker.model_runner_base import (BroadcastableModelInput, logger = init_logger(__name__) +_R = TypeVar("_R") + @warn_for_unimplemented_methods class WorkerBase: @@ -70,6 +73,10 @@ class WorkerBase: def get_model(self) -> nn.Module: raise NotImplementedError + def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R: + """Apply a function on the model inside this worker.""" + return fn(self.get_model()) + def load_model(self) -> None: """Load model onto target device.""" raise NotImplementedError From e08a3a3fdbdb5408f904a237b31ff2447a336b2f Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Sat, 20 Sep 2025 04:16:56 -0400 Subject: [PATCH 507/584] [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299) Signed-off-by: mgoin <mgoin64@gmail.com> --- .../model_executor/layers/rotary_embedding/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 3dc249ae9adb9..1c3576bee5392 100644 --- 
a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,8 +6,6 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -32,13 +30,15 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # TODO(mgoin): disabled for now due to failures # Flashinfer only supports head_size=64, 128, 256, 512. # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 - self.use_flashinfer = (self.enabled() - and dtype in (torch.float16, torch.bfloat16) - and current_platform.is_cuda() - and has_flashinfer() - and self.head_size in [64, 128, 256, 512]) + # self.use_flashinfer = (self.enabled() + # and dtype in (torch.float16, torch.bfloat16) + # and current_platform.is_cuda() + # and has_flashinfer() + # and self.head_size in [64, 128, 256, 512]) + self.use_flashinfer = False cache = self._compute_cos_sin_cache() if not self.use_flashinfer: From 032d661d27db873ddd71ffb609c79c581b3f27b9 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Sat, 20 Sep 2025 04:45:18 -0700 Subject: [PATCH 508/584] [Docs] Fix warnings in mkdocs build (continued) (#25042) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> --- vllm/multimodal/__init__.py | 2 +- vllm/utils/__init__.py | 6 +++--- vllm/v1/core/kv_cache_utils.py | 4 ++-- vllm/v1/sample/rejection_sampler.py | 10 +++++----- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/utils.py | 3 ++- vllm/worker/model_runner.py | 12 ++++++++++-- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index b7d4cd298e24f..7ffa732cf3708 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -15,7 +15,7 @@ is used by model runners to dispatch data processing according to the target model. Info: - [mm_processing](../../../design/mm_processing.html) + [mm_processing](../../../design/mm_processing.md) """ __all__ = [ diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index fd1c0af31269c..968bba664f0a9 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3216,7 +3216,7 @@ def cprofile_context(save_file: Optional[str] = None): Args: save_file: path to save the profile result. "1" or - None will result in printing to stdout. + None will result in printing to stdout. """ import cProfile @@ -3273,7 +3273,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool: and getattr(cfg.attn_config, "alibi", False))))) -def sha256(input) -> bytes: +def sha256(input: Any) -> bytes: """Hash any picklable Python object using SHA-256. The input is serialized using pickle before hashing, which allows @@ -3290,7 +3290,7 @@ def sha256(input) -> bytes: return hashlib.sha256(input_bytes).digest() -def sha256_cbor(input) -> bytes: +def sha256_cbor(input: Any) -> bytes: """ Hash objects using CBOR serialization and SHA-256. diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3ccd00121f8ed..47a41322c423c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1229,8 +1229,8 @@ def get_kv_cache_configs(vllm_config: VllmConfig, Args: vllm_config: The global VllmConfig kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker. 
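For reference, the `sha256` helper annotated above hashes by pickling first; a minimal equivalent (treating the pickle protocol as unspecified):

import hashlib
import pickle

def sha256(obj) -> bytes:
    # Any picklable object can be hashed; objects that pickle to the
    # same bytes produce the same digest.
    return hashlib.sha256(pickle.dumps(obj)).digest()

assert len(sha256(("block", (1, 2, 3)))) == 32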
- available_memory: Memory available for KV cache in bytes for each - worker. + available_memory: Memory available for KV cache in bytes for each + worker. Returns: The generated KVCacheConfigs for each worker. diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 3d5e59addfcfa..ced5c7a970388 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -351,17 +351,17 @@ def generate_uniform_probs( without a seed. Args: - num_tokens : int + num_tokens: int Total number of tokens. - num_draft_tokens : List[List[int]] + num_draft_tokens: List[List[int]] Number of draft tokens per request. - generators : Optional[Dict[int, torch.Generator]] + generators: Optional[Dict[int, torch.Generator]] A dictionary mapping indices in the batch to `torch.Generator` objects. - device : torch.device + device: torch.device The device on which to allocate the tensor. Returns: - uniform_rand : torch.Tensor + uniform_rand: torch.Tensor A tensor of shape `(num_tokens, )` containing uniform random values in the range [0, 1). """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 233df8f1b0e9b..d0946e8c5d7d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1479,7 +1479,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: scheduler_output: The scheduler output containing scheduled encoder - inputs. + inputs. Returns: A tuple of (mm_kwargs, req_ids_pos) where: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 021d18b2500f0..af922f9979d1a 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -205,7 +205,8 @@ def gather_mm_placeholders( """ Reconstructs the embeddings from the placeholder tokens. - This is the operation of [scatter_mm_placeholders][]. + This is the operation of [`scatter_mm_placeholders`] + [vllm.v1.worker.utils.scatter_mm_placeholders]. """ if is_embed is None: return placeholders diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f662f5a85eff6..bab89586b0f2c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1810,7 +1810,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): return [output] - def need_recv_kv(self, model_input, kv_caches) -> bool: + def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor]) -> bool: """Check if we need to receive kv-cache from the other worker. We need to receive KV when 1. current vLLM instance is KV cache consumer/decode vLLM instance @@ -1825,6 +1826,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): if self.vllm_config.kv_transfer_config is None: return False + if model_input.attn_metadata is None: + raise ValueError("model_input.attn_metadata cannot be None") + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling @@ -1835,7 +1839,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): return self.vllm_config.kv_transfer_config.is_kv_consumer and ( not is_profile_run) and is_prefill_run - def need_send_kv(self, model_input, kv_caches) -> bool: + def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor]) -> bool: """Check if we need to send kv-cache to the other worker. We need to send KV when 1. 
current vLLM instance is KV cache producer/prefill vLLM instance @@ -1850,6 +1855,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): if self.vllm_config.kv_transfer_config is None: return False + if model_input.attn_metadata is None: + raise ValueError("model_input.attn_metadata cannot be None") + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling From bf8b26cad17f21f142de6c06ee657f01ccb2b816 Mon Sep 17 00:00:00 2001 From: Manoel Marques <manoelmrqs@gmail.com> Date: Sat, 20 Sep 2025 07:51:13 -0400 Subject: [PATCH 509/584] Generate _ModelInfo properties file when loading to improve loading speed (#23558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Manoel Marques <manoel.marques@ibm.com> Signed-off-by: Manoel Marques <manoelmrqs@gmail.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/log_time.py | 32 +++++++ .../model_loader/weight_utils.py | 44 +++++++++ vllm/model_executor/models/registry.py | 92 ++++++++++++++++++- 4 files changed, 167 insertions(+), 3 deletions(-) create mode 100644 vllm/logging_utils/log_time.py diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index cf690a89ae9bc..7202259ca21aa 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "logtime", ] diff --git a/vllm/logging_utils/log_time.py b/vllm/logging_utils/log_time.py new file mode 100644 index 0000000000000..013dd144beaf8 --- /dev/null +++ b/vllm/logging_utils/log_time.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Provides a timeslice logging decorator +""" + +import functools +import time + + +def logtime(logger, msg=None): + """ + Logs the execution time of the decorated function. + Always place it beneath other decorators. + """ + + def _inner(func): + + @functools.wraps(func) + def _wrapper(*args, **kwargs): + start = time.perf_counter() + result = func(*args, **kwargs) + elapsed = time.perf_counter() - start + + prefix = f"Function '{func.__module__}.{func.__qualname__}'" \ + if msg is None else msg + logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed) + return result + + return _wrapper + + return _inner diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f2c66763d0816..a72086da18c4d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -11,6 +11,7 @@ import tempfile import time from collections import defaultdict from collections.abc import Generator +from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, Optional, Union @@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path], return lock +@contextmanager +def atomic_writer(filepath: Union[str, Path], + mode: str = 'w', + encoding: Optional[str] = None): + """ + Context manager that provides an atomic file writing routine. 
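Usage of the `logtime` decorator added above, assuming a standard logging setup:

import logging

from vllm.logging_utils import logtime

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("demo")

@logtime(logger, msg="expensive_step")
def expensive_step(n: int) -> int:
    return sum(range(n))

expensive_step(1_000_000)
# DEBUG:demo:expensive_step: Elapsed time 0.0123456 secs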
+ + The context manager writes to a temporary file and, if successful, + atomically replaces the original file. + + Args: + filepath (str or Path): The path to the file to write. + mode (str): The file mode for the temporary file (e.g., 'w', 'wb'). + encoding (str): The encoding for text mode. + + Yields: + file object: A handle to the temporary file. + """ + # Create a temporary file in the same directory as the target file + # to ensure it's on the same filesystem for an atomic replace. + temp_dir = os.path.dirname(filepath) + temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir) + + try: + # Open the temporary file for writing + with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file: + yield temp_file + + # If the 'with' block completes successfully, + # perform the atomic replace. + os.replace(temp_path, filepath) + + except Exception: + logger.exception( + "Error during atomic write. Original file '%s' not modified", + filepath) + raise + finally: + # Clean up the temporary file if it still exists. + if os.path.exists(temp_path): + os.remove(temp_path) + + def maybe_download_from_modelscope( model: str, revision: Optional[str] = None, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 76f2bd087624c..5dc5d545bb9c5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -4,7 +4,9 @@ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. """ +import hashlib import importlib +import json import os import pickle import subprocess @@ -12,16 +14,19 @@ import sys import tempfile from abc import ABC, abstractmethod from collections.abc import Set -from dataclasses import dataclass, field +from dataclasses import asdict, dataclass, field from functools import lru_cache +from pathlib import Path from typing import Callable, Optional, TypeVar, Union import torch.nn as nn import transformers +from vllm import envs from vllm.config import (ModelConfig, iter_architecture_defaults, try_match_architecture_defaults) from vllm.logger import init_logger +from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) @@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel): module_name: str class_name: str - # Performed in another process to avoid initializing CUDA + @staticmethod + def _get_cache_dir() -> Path: + return Path(envs.VLLM_CACHE_ROOT) / "modelinfos" + + def _get_cache_filename(self) -> str: + cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-") + return f"{cls_name}.json" + + def _load_modelinfo_from_cache(self, + module_hash: str) -> _ModelInfo | None: + try: + try: + modelinfo_path = self._get_cache_dir( + ) / self._get_cache_filename() + with open(modelinfo_path, encoding="utf-8") as file: + mi_dict = json.load(file) + except FileNotFoundError: + logger.debug(("Cached model info file " + "for class %s.%s not found"), self.module_name, + self.class_name) + return None + + if mi_dict["hash"] != module_hash: + logger.debug(("Cached model info file " + "for class %s.%s is stale"), self.module_name, + self.class_name) + return None + + # file not changed, use cached _ModelInfo properties + return _ModelInfo(**mi_dict["modelinfo"]) + except Exception: + logger.exception(("Cached model info " + "for class %s.%s error. 
"), self.module_name, + self.class_name) + return None + + def _save_modelinfo_to_cache(self, mi: _ModelInfo, + module_hash: str) -> None: + """save dictionary json file to cache""" + from vllm.model_executor.model_loader.weight_utils import atomic_writer + try: + modelinfo_dict = { + "hash": module_hash, + "modelinfo": asdict(mi), + } + cache_dir = self._get_cache_dir() + cache_dir.mkdir(parents=True, exist_ok=True) + modelinfo_path = cache_dir / self._get_cache_filename() + with atomic_writer(modelinfo_path, encoding='utf-8') as f: + json.dump(modelinfo_dict, f, indent=2) + except Exception: + logger.exception("Error saving model info cache.") + + @logtime(logger=logger, msg="Registry inspect model class") def inspect_model_cls(self) -> _ModelInfo: - return _run_in_subprocess( + model_path = Path( + __file__).parent / f"{self.module_name.split('.')[-1]}.py" + + assert model_path.exists(), \ + f"Model {self.module_name} expected to be on path {model_path}" + with open(model_path, "rb") as f: + module_hash = hashlib.md5(f.read()).hexdigest() + + mi = self._load_modelinfo_from_cache(module_hash) + if mi is not None: + logger.debug(("Loaded model info " + "for class %s.%s from cache"), self.module_name, + self.class_name) + return mi + else: + logger.debug(("Cache model info " + "for class %s.%s miss. " + "Loading model instead."), self.module_name, + self.class_name) + + # Performed in another process to avoid initializing CUDA + mi = _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + logger.debug("Loaded model info for class %s.%s", self.module_name, + self.class_name) + + # save cache file + self._save_modelinfo_to_cache(mi, module_hash) + + return mi def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) From 3c713a97116f34ffdc75ba10c9dfbb2850437984 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Sat, 20 Sep 2025 20:46:24 +0800 Subject: [PATCH 510/584] [Model] Cleanup InternViT's data parallel implementation (#25306) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/intern_vit.py | 158 ++++++----------------- 1 file changed, 37 insertions(+), 121 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 118cce810a1f2..892188c047228 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -164,23 +163,15 @@ class InternParallelAttention(nn.Module): self.tp_size) self.scale = self.head_dim**-0.5 - if use_data_parallel: - self.qkv = ReplicatedLinear( - self.embed_dim, - 3 * self.head_dim * self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) - else: - self.qkv = QKVParallelLinear( - self.embed_dim, - self.head_dim, - num_dummy_heads + self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + num_dummy_heads + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + 
disable_tp=use_data_parallel, + ) self.qk_normalization = config.qk_normalization @@ -192,20 +183,13 @@ class InternParallelAttention(nn.Module): eps=config.layer_norm_eps, var_hidden_size=self.embed_dim) - if use_data_parallel: - self.proj = ReplicatedLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) - else: - self.proj = RowParallelLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + self.proj = RowParallelLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel, + ) self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -236,72 +220,6 @@ class InternParallelAttention(nn.Module): return out -class InternSdpaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - config: PretrainedConfig, - *, - num_dummy_heads: int = 0, - ) -> None: - super().__init__() - - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f'embed_dim must be divisible by num_heads ' - f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' - f' {self.num_heads}).') - - # Additional dummy heads are used to enable TP for common GPU counts. - self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim - - self.scale = self.head_dim**-0.5 - self.qkv = nn.Linear(self.embed_dim, - 3 * self.dummy_dim, - bias=config.qkv_bias) - - self.qk_normalization = config.qk_normalization - - if self.qk_normalization: - self.q_norm = RMSNorm(self.dummy_dim, - eps=config.layer_norm_eps, - var_hidden_size=self.embed_dim) - self.k_norm = RMSNorm(self.dummy_dim, - eps=config.layer_norm_eps, - var_hidden_size=self.embed_dim) - - self.proj = nn.Linear(self.dummy_dim, self.embed_dim) - - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_heads, self.head_dim, - self.scale) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - B, N, C = x.shape - qkv = self.qkv(x) - q, k, v = qkv.chunk(3, dim=-1) - - q = q.view(B, N, self.num_heads, self.head_dim) - k = k.view(B, N, self.num_heads, self.head_dim) - v = v.view(B, N, self.num_heads, self.head_dim) - - if self.qk_normalization: - B_, N_, H_, D_ = q.shape - q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_) - k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_) - - # Use unified MultiHeadAttention with automatic backend selection - x = self.attn(q, k, v) - - x = self.proj(x) - return x - - class InternMLP(nn.Module): def __init__( @@ -315,20 +233,18 @@ class InternMLP(nn.Module): self.config = config self.activation_fn = get_act_fn(config.hidden_act) - cls_fc1 = (ReplicatedLinear - if use_data_parallel else ColumnParallelLinear) - self.fc1 = cls_fc1(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - cls_fc2 = (ReplicatedLinear - if use_data_parallel else RowParallelLinear) - self.fc2 = cls_fc2(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc2") + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel) + self.fc2 = 
RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -385,19 +301,19 @@ class InternVisionEncoderLayer(nn.Module): use_data_parallel: bool = False, ): # fallback to sdpa attention if tp unavailable - # tp_size = get_tensor_model_parallel_world_size() tp_size = (1 if use_data_parallel else get_tensor_model_parallel_world_size()) num_heads = config.num_attention_heads - if (num_heads + num_dummy_heads) % tp_size == 0: - return InternParallelAttention(config, - quant_config=quant_config, - num_dummy_heads=num_dummy_heads, - prefix=prefix, - use_data_parallel=use_data_parallel) - - return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) + # if the number of heads is not divisible by tp_size, + # we also disable Attention's TP + use_data_parallel = (use_data_parallel + or (num_heads + num_dummy_heads) % tp_size != 0) + return InternParallelAttention(config, + quant_config=quant_config, + num_dummy_heads=num_dummy_heads, + prefix=prefix, + use_data_parallel=use_data_parallel) def forward( self, From d88918e4c2d189eb25724a6cff9a9028c67daa07 Mon Sep 17 00:00:00 2001 From: lirong <56789630+lirong-lirong@users.noreply.github.com> Date: Sat, 20 Sep 2025 21:15:22 +0800 Subject: [PATCH 511/584] [Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308) Signed-off-by: pengdrumli <pengdrumli@tencent.com> --- tests/test_sharded_state_loader.py | 20 ++++++++++++-------- vllm/engine/arg_utils.py | 6 ------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 42afdfa3c7468..fd5b5fad0999c 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -57,10 +57,19 @@ def llama_3p2_1b_files(): def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): llm_sharded_writer = LLM(model=input_dir, **kwargs) - + # Check which engine version is being used + is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core") # Dump worker states to output directory - llm_sharded_writer.llm_engine.model_executor.save_sharded_state( - path=output_dir) + if is_v1_engine: + # For V1 engine, we need to use engine_core.save_sharded_state + print("Using V1 engine save path") + llm_sharded_writer.llm_engine.engine_core.save_sharded_state( + path=output_dir) + else: + # For V0 engine + print("Using V0 engine save path") + model_executor = llm_sharded_writer.llm_engine.model_executor + model_executor.save_sharded_state(path=output_dir) # Copy metadata files to output directory for file in os.listdir(input_dir): @@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, gpu_memory_utilization = 0.8 input_dir = llama_3p2_1b_files ctx = mp.get_context("spawn") - # The interface in v1 engine has changed, run in v1 engine will hang. 
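+    # The sharded state loader works with the default V1 engine, so no
+    # engine-version pin is required for this test.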
- monkeypatch.setenv("VLLM_USE_V1", "0") # Run in separate processes for memory & CUDA isolation with TemporaryDirectory() as output_dir: @@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, args=(input_dir, output_dir, weights_patterns), kwargs=dict( tensor_parallel_size=tp_size, - distributed_executor_backend="mp", gpu_memory_utilization=gpu_memory_utilization, enforce_eager=True, )) @@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, p = ctx.Process(target=_run_generate, args=(input_dir, queue), kwargs=dict( - distributed_executor_backend="mp", enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, @@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, p = ctx.Process(target=_run_generate, args=(output_dir, queue), kwargs=dict( - distributed_executor_backend="mp", enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7a4bb0d41d236..8912ff8bad429 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1486,12 +1486,6 @@ class EngineArgs: ############################################################# # Unsupported Feature Flags on V1. - if self.load_format == "sharded_state": - _raise_or_fallback( - feature_name=f"--load_format {self.load_format}", - recommend_to_remove=False) - return False - if (self.logits_processor_pattern != EngineArgs.logits_processor_pattern): _raise_or_fallback(feature_name="--logits-processor-pattern", From bef180f00978186fcc84f7ca6328a6ae6c39676d Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sun, 21 Sep 2025 01:50:58 +0800 Subject: [PATCH 512/584] [V0 Deprecation] Enable the remaining multimodal tests in V1 (#25307) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/conftest.py | 76 ++++++--- .../multimodal/generation/test_common.py | 158 +++++++++--------- .../multimodal/generation/test_pixtral.py | 49 +----- .../multimodal/generation/test_qwen2_vl.py | 9 +- .../multimodal/pooling/test_prithvi_mae.py | 26 ++- tests/models/quantization/test_awq.py | 38 +++-- .../models/quantization/test_bitsandbytes.py | 28 ++-- tests/models/test_terratorch.py | 25 ++- 8 files changed, 195 insertions(+), 214 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e8e95357ff5b9..ce9de3bf94b59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,7 @@ import socket import tempfile import threading from collections.abc import Generator +from contextlib import nullcontext from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast @@ -45,14 +46,14 @@ from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) -from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) +from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.sequence import Logprob from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.utils import set_default_torch_num_threads logger = init_logger(__name__) @@ -306,6 +307,35 @@ class HfRunner: is_cross_encoder: bool = False, 
skip_tokenizer_init: bool = False, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, + # Set this to avoid hanging issue + default_torch_num_threads: Optional[int] = None, + ) -> None: + init_ctx = (nullcontext() if default_torch_num_threads is None else + set_default_torch_num_threads(default_torch_num_threads)) + + with init_ctx: + self._init( + model_name=model_name, + dtype=dtype, + model_kwargs=model_kwargs, + trust_remote_code=trust_remote_code, + is_sentence_transformer=is_sentence_transformer, + is_cross_encoder=is_cross_encoder, + skip_tokenizer_init=skip_tokenizer_init, + auto_cls=auto_cls, + ) + + def _init( + self, + model_name: str, + dtype: str = "auto", + *, + model_kwargs: Optional[dict[str, Any]] = None, + trust_remote_code: bool = True, + is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, + skip_tokenizer_init: bool = False, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, ) -> None: model_name = maybe_model_redirect(model_name) self.model_name = model_name @@ -714,26 +744,32 @@ class VllmRunner: enable_chunked_prefill: Optional[bool] = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, + # Set this to avoid hanging issue + default_torch_num_threads: Optional[int] = None, **kwargs, ) -> None: - self.llm = LLM( - model=model_name, - runner=runner, - convert=convert, - tokenizer=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - dtype=dtype, - seed=seed, - swap_space=swap_space, - enforce_eager=enforce_eager, - disable_log_stats=disable_log_stats, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - block_size=block_size, - enable_chunked_prefill=enable_chunked_prefill, - **kwargs, - ) + init_ctx = (nullcontext() if default_torch_num_threads is None else + set_default_torch_num_threads(default_torch_num_threads)) + + with init_ctx: + self.llm = LLM( + model=model_name, + runner=runner, + convert=convert, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + dtype=dtype, + seed=seed, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + **kwargs, + ) def get_inputs( self, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 79f9d607f3386..e76b58e61ec16 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -32,13 +32,6 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs, if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" -REQUIRES_V0_MODELS = [ - # V1 Test: not enough KV cache space in C1. 
- "fuyu", - # V1 Test: Deadlock issue when processing mm_inputs - "llava-onevision-transformers", -] - # yapf: disable COMMON_BROADCAST_SETTINGS = { "test_type": VLMTestType.IMAGE, @@ -186,8 +179,11 @@ VLM_TEST_SETTINGS = { image_size_factors=[(0.25, 0.5, 1.0)], vllm_runner_kwargs={ "model_impl": "transformers", + "default_torch_num_threads": 1, }, - marks=[pytest.mark.core_model], + # FIXME: Investigate why the test hangs + # when processing the 3rd prompt in vLLM + marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")], ), "idefics3-transformers": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], @@ -320,6 +316,7 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[large_gpu_mark(min_gb=32)], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], @@ -861,13 +858,14 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) test_type=VLMTestType.IMAGE, create_new_process_for_each_test=False, )) -def test_single_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_single_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -886,13 +884,14 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, test_type=VLMTestType.MULTI_IMAGE, create_new_process_for_each_test=False, )) -def test_multi_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_multi_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -911,13 +910,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_type=VLMTestType.EMBEDDING, create_new_process_for_each_test=False, )) -def test_image_embedding_models(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_image_embedding_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -935,11 +934,13 @@ def test_image_embedding_models(model_type: str, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=False, )) -def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - 
monkeypatch.setenv("VLLM_USE_V1", "0") +def test_video_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -957,11 +958,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=False, )) -def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_audio_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_audio_test( model_test_info=model_test_info, @@ -984,10 +987,7 @@ def test_custom_inputs_models( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - monkeypatch, ): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, @@ -1006,13 +1006,14 @@ def test_custom_inputs_models( create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_single_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -1032,13 +1033,14 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_multi_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -1058,14 +1060,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, create_new_process_for_each_test=True, )) @create_new_process_for_each_test() -def test_image_embedding_models_heavy(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_image_embedding_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: 
ImageTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -1083,12 +1084,13 @@ def test_image_embedding_models_heavy(model_type: str, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=True, )) -def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_video_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -1106,12 +1108,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=True, )) -def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") +def test_audio_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, +): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_audio_test( model_test_info=model_test_info, @@ -1135,10 +1138,7 @@ def test_custom_inputs_models_heavy( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - monkeypatch, ): - if model_type in REQUIRES_V0_MODELS: - monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index a4e21aface41f..cb3cc1d3d330d 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -12,13 +12,12 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from transformers import AutoProcessor -from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt +from vllm import SamplingParams, TextPrompt, TokensPrompt from vllm.multimodal import MultiModalDataBuiltins -from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test -from ...utils import check_logprobs_close, dummy_hf_overrides +from ...utils import check_logprobs_close if TYPE_CHECKING: from _typeshed import StrPath @@ -185,47 +184,3 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str, outputs_1_lst=logprobs, name_0="h100_ref", name_1="output") - - -@pytest.mark.parametrize( - "image_urls,expected_ranges", - [(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), - (IMG_URLS[1:4], [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ])]) -def test_multi_modal_placeholders(vllm_runner, image_urls: list[str], - expected_ranges: list[PlaceholderRange], - local_asset_server, monkeypatch) -> None: - local_image_urls = [local_asset_server.url_for(u) for u in 
image_urls] - prompt = _create_engine_inputs_hf(local_image_urls) - - # This placeholder checking test only works with V0 engine - # where `multi_modal_placeholders` is returned with `RequestOutput` - monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner( - "mistral-community/pixtral-12b", - max_model_len=8192, - limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, - load_format="dummy", - hf_overrides=dummy_hf_overrides, - ) as vllm_model: - outputs = vllm_model.llm.generate(prompt) - - assert len(outputs) == 1, f"{len(outputs)=}" - output: RequestOutput = outputs[0] - assert hasattr(output, - "multi_modal_placeholders"), f"{output.__dict__=}" - assert "image" in output.multi_modal_placeholders, \ - f"{output.multi_modal_placeholders.keys()=}" - image_placeholder_ranges: list[ - PlaceholderRange] = output.multi_modal_placeholders["image"] - assert len(image_placeholder_ranges) == len( - expected_ranges), f"{image_placeholder_ranges=}" - for real_range, expected_range in zip(image_placeholder_ranges, - expected_ranges): - assert real_range.offset == expected_range.offset, \ - f"{real_range=} {expected_range=}" - assert real_range.length == expected_range.length, \ - f"{real_range=} {expected_range=}" diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e56f4e4075be4..8336ebc0d59cb 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,7 +10,6 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video -from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -264,8 +263,7 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with set_default_torch_num_threads(1): - vllm_model = vllm_runner( + with vllm_runner( model, runner="generate", max_model_len=4000, @@ -277,9 +275,8 @@ def run_embedding_input_test( }, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, - ) - - with vllm_model: + default_torch_num_threads=1, + ) as vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py index b503d42567022..7309660ea5261 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -4,8 +4,6 @@ import pytest import torch -from vllm.utils import set_default_torch_num_threads - from ....conftest import VllmRunner @@ -30,19 +28,17 @@ def _run_test( } for _ in range(10) ] - with ( - set_default_torch_num_threads(1), - vllm_runner( - model, - runner="pooling", - dtype=torch.float16, - enforce_eager=True, - skip_tokenizer_init=True, - # Limit the maximum number of sequences to avoid the - # test going OOM during the warmup run - max_num_seqs=32, - ) as vllm_model, - ): + with vllm_runner( + model, + runner="pooling", + dtype="half", + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + default_torch_num_threads=1, + ) as vllm_model: vllm_model.encode(prompt) diff --git 
a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index 7005e435ecf46..e741e4ad90a09 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -45,12 +45,15 @@ def run_awq_test( # will hurt multiprocessing backend with fork method (the default method). # max_model_len should be greater than image_feature_size - with vllm_runner(source_model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + source_model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + default_torch_num_threads=1, + ) as vllm_model: source_outputs_per_image = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -59,13 +62,16 @@ def run_awq_test( for prompts, images in inputs_per_image ] - with vllm_runner(quant_model, - quantization="awq", - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + quant_model, + quantization="awq", + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + default_torch_num_threads=1, + ) as vllm_model: quant_outputs_per_image = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -108,12 +114,8 @@ def run_awq_test( @pytest.mark.parametrize("num_logprobs", [5]) @torch.inference_mode() def test_awq_models(vllm_runner, image_assets, source_model, quant_model, - size_factors, dtype, max_tokens, num_logprobs, - monkeypatch) -> None: + size_factors, dtype, max_tokens, num_logprobs) -> None: - # Test V1: this test hangs during setup on single-scale input. - # TODO: figure out why and re-enable this on V1. - monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, image_assets, diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index e0e919b62b217..25fc44fee90dd 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -5,10 +5,7 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. 
''' -import gc - import pytest -import torch from transformers import BitsAndBytesConfig from tests.quantization.utils import is_quant_method_supported @@ -131,12 +128,15 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts, )) with vllm_runner(model_name, quantization='bitsandbytes', - enforce_eager=False) as llm: + enforce_eager=False, + default_torch_num_threads=1) as llm: vllm_outputs = llm.generate_greedy_logprobs(example_prompts, max_tokens=32, num_logprobs=5) - with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm: + with hf_runner(model_name, + model_kwargs=hf_model_kwargs, + default_torch_num_threads=1) as llm: transformers_outputs = llm.generate_greedy_logprobs_limit( example_prompts, max_tokens=32, num_logprobs=5) check_logprobs_close( @@ -174,7 +174,8 @@ def test_4bit_bnb_embedding_model( runner="pooling", dtype=dtype, gpu_memory_utilization=0.5, - quantization="bitsandbytes") as vllm_model: + quantization="bitsandbytes", + default_torch_num_threads=1) as vllm_model: vllm_outputs = vllm_model.embed(example_prompts) hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig( @@ -184,6 +185,7 @@ def test_4bit_bnb_embedding_model( dtype=dtype, model_kwargs=hf_model_kwargs, is_sentence_transformer=True, + default_torch_num_threads=1, ) as hf_model: hf_outputs = hf_model.encode(example_prompts) @@ -222,26 +224,22 @@ def validate_generated_texts(hf_runner, with vllm_runner(model_name, quantization=None if pre_quant else 'bitsandbytes', tensor_parallel_size=vllm_tp_size, - enforce_eager=False) as llm: + enforce_eager=False, + default_torch_num_threads=1) as llm: vllm_outputs = llm.generate_greedy(prompts, max_tokens) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") - # Clean up the GPU memory for the next test - gc.collect() - torch.cuda.empty_cache() - if hf_model_kwargs is None: hf_model_kwargs = {} # Run with HF runner - with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm: + with hf_runner(model_name, + model_kwargs=hf_model_kwargs, + default_torch_num_threads=1) as llm: hf_outputs = llm.generate_greedy(prompts, max_tokens) hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner") - # Clean up the GPU memory for the next test - gc.collect() - torch.cuda.empty_cache() # Compare the generated strings for hf_log, vllm_log in zip(hf_logs, vllm_logs): hf_str = hf_log["generated_text"] diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index d6d43ca2f7e15..842e37ea26f67 100644 --- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -5,7 +5,6 @@ import pytest import torch from tests.conftest import VllmRunner -from vllm.utils import set_default_torch_num_threads @pytest.mark.parametrize( @@ -25,19 +24,17 @@ def test_inference( prompt = dict(prompt_token_ids=[1], multi_modal_data=dict(pixel_values=pixel_values, location_coords=location_coords)) - with ( - set_default_torch_num_threads(1), - vllm_runner( - model, - runner="pooling", - dtype=torch.float16, - enforce_eager=True, - skip_tokenizer_init=True, - # Limit the maximum number of sequences to avoid the - # test going OOM during the warmup run - max_num_seqs=32, - ) as vllm_model, - ): + with vllm_runner( + model, + runner="pooling", + dtype="half", + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + default_torch_num_threads=1, + ) as vllm_model: vllm_output = vllm_model.llm.encode(prompt) assert 
torch.equal( From 367a480bd3534edf27a8dac3c6f7ea8af9d1ed45 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Sun, 21 Sep 2025 07:39:47 +0800 Subject: [PATCH 513/584] [Docs] Fix warnings in vllm/profiler and vllm/transformers_utils (#25220) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- mkdocs.yaml | 1 + vllm/profiler/layerwise_profile.py | 4 ++-- vllm/transformers_utils/configs/jais.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index 6f2be65a18af8..1535fcc622cdc 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -102,6 +102,7 @@ plugins: - https://numpy.org/doc/stable/objects.inv - https://pytorch.org/docs/stable/objects.inv - https://psutil.readthedocs.io/en/stable/objects.inv + - https://huggingface.co/docs/transformers/main/en/objects.inv markdown_extensions: - attr_list diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 2f9ebe531cbb1..41136f738c286 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -353,8 +353,8 @@ class layerwise_profile(profile): Args: num_running_seqs (Optional[int], optional): When given, - num_running_seqs will be passed to LayerProfileResults for metadata - update. Defaults to None. + num_running_seqs will be passed to LayerProfileResults + for metadata update. Defaults to None. """ super().__init__( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index d5ca2c7b4751a..3f50638f16b53 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -74,8 +74,7 @@ class JAISConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx - (`bool`, *optional*, defaults to `False`): + scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`): Whether to additionally scale attention weights by `1 / layer_idx + 1`. 
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): From 52c2a8d4adccf72c2cf59a18da31e44a61c43b41 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 17:56:30 -0700 Subject: [PATCH 514/584] [V0 Deprecation] Remove LLMEngine (#25033) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .../scripts/hardware_ci/run-amd-test.sh | 4 - .buildkite/test-pipeline.yaml | 8 +- .github/CODEOWNERS | 1 - examples/offline_inference/profiling.py | 510 ----- .../test_basic_correctness.py | 16 +- tests/basic_correctness/test_cumem.py | 5 +- tests/compile/test_fusion_attn.py | 3 +- tests/conftest.py | 20 - tests/entrypoints/llm/test_generate.py | 6 - .../entrypoints/llm/test_prompt_validation.py | 8 - tests/entrypoints/openai/test_metrics.py | 2 +- .../attention/test_attention_selector.py | 91 +- tests/lora/test_lora_functions.py | 2 +- .../models/language/generation/test_common.py | 6 +- .../models/language/generation/test_hybrid.py | 122 +- tests/models/language/pooling/test_reward.py | 3 +- tests/models/quantization/test_fp8.py | 12 - tests/models/test_initialization.py | 13 +- tests/models/test_oot_registration.py | 1 + tests/plugins_tests/test_platform_plugins.py | 9 - tests/plugins_tests/test_scheduler_plugins.py | 37 +- tests/samplers/test_beam_search.py | 7 - tests/samplers/test_ignore_eos.py | 7 - tests/samplers/test_ranks.py | 6 - tests/tokenization/test_detokenize.py | 55 - vllm/engine/arg_utils.py | 8 - vllm/engine/llm_engine.py | 1833 +---------------- vllm/entrypoints/llm.py | 11 +- .../model_executor/model_loader/tensorizer.py | 22 +- 29 files changed, 65 insertions(+), 2763 deletions(-) delete mode 100644 examples/offline_inference/profiling.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 7f90181048d0f..aa4cc7b35a543 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} fi -if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then - commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} -fi - if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c42ec4f2503d0..1e7ce6ef0a665 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -110,7 +110,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to 
avoid interference with other tests + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Integration Test (API Server) # 100min timeout_in_minutes: 130 @@ -163,7 +163,6 @@ steps: - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -314,12 +313,11 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 @@ -894,7 +892,7 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. 
# TODO: investigate and fix - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s models/multimodal/generation/test_maverick.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 323675993467a..f58256d38b9d1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,6 @@ /vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py deleted file mode 100644 index 392fba8fc5ead..0000000000000 --- a/examples/offline_inference/profiling.py +++ /dev/null @@ -1,510 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import inspect -import json -import os -import sys -from argparse import RawTextHelpFormatter -from collections.abc import Generator -from dataclasses import asdict, dataclass -from typing import Any, Optional, TypeAlias - -import torch -import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.profiler.layerwise_profile import layerwise_profile -from vllm.utils import FlexibleArgumentParser - -BATCH_SIZE_DEFAULT = 1 -PROMPT_LEN_DEFAULT = 256 - - -@dataclass -class ProfileContext: - engine_args: EngineArgs - prompt_len: int - batch_size: int - - # The profiler can run in 2 modes, - # 1. Run profiler for user specified num_steps - num_steps: Optional[int] = None - # 2. Run profiler until all requests complete - complete_num_requests_per_step: Optional[int] = None - - save_chrome_traces_folder: Optional[str] = None - - -def get_dtype(dtype: str): - if dtype == "torch.float": - return torch.float - else: - return dtype - - -OutputLen_NumReqs_Map: TypeAlias = dict[int, int] - - -def compute_request_output_lengths( - batch_size: int, step_requests: list[int] -) -> OutputLen_NumReqs_Map: - """ - Given the number of requests, batch_size, and the number of requests - that each engine-step should process, step_requests, determine the - output lengths of the requests such that step_request is honoured. - - Example: - if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1] - then return, - {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning, - 32 requests should have output length 2, - 32 requests should have output length 3, - 32 requests should have output length 4, - 31 requests should have output length 5, - 1 request should have output length 6. - - Args: - batch_size (int): Number of requests submitted for profile. This is - args.batch_size. - step_requests (list[int]): step_requests[i] is the number of requests - that the ith engine step should process. - - Returns: - OutputLen_NumReqs_Map : A dictionary with output-length as keys and the - number of requests required to have that output-length as values. 
- """ - ol_nr: OutputLen_NumReqs_Map = {} - - # Number of request that are assigned an output-length - num_reqs_assigned: int = 0 - num_steps: int = len(step_requests) - - # sanity check. The first step (prefill-step), must process all requests. - assert step_requests[0] == batch_size - - # Begin assignments from the last step. - output_length: int = num_steps - for num_requests_at_step in reversed(step_requests): - if num_reqs_assigned == batch_size: - break - - assert num_reqs_assigned < batch_size - - # Remove the number of requests that have been determined - # to participate in this step and beyond. - num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned - assert num_reqs_unassigned_at_step >= 0 - - if num_reqs_unassigned_at_step > 0: - ol_nr[output_length] = num_reqs_unassigned_at_step - num_reqs_assigned += num_reqs_unassigned_at_step - - output_length -= 1 - - # sanity checks. - assert sum(ol_nr.values()) == batch_size, ( - "Number of requests in output-length assignment does not match " - f"batch-size.\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}" - ) - - # Check that the output-length is in [1, num-steps]. Output length must be - # at least 1 as all requests must participate in the prefill-step. - assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), ( - "Output lengths of requests should be in range " - f"[1, num-engine-steps].\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}" - ) - - return ol_nr - - -def determine_requests_per_step(context: ProfileContext) -> list[int]: - """ - Determine number of requests each engine step should process. - If context.num_steps is set, then all engine steps process the - same number of requests and the output list is of length - context.num_steps. - - If context.complete_num_requests_per_step is set, then each decode step - processes fewer and fewer requests until there are no requests to process. - In this case, the output list is as big as the number of steps - required to process all requests. - - Args: - context: ProfileContext object. - - Returns: - list[int]: Number of requests to process for all engine-steps. - output[i], contains the number of requests that the ith step - should process. - """ - if context.num_steps: - # All requests must run until num_engine_steps. This implies - # that their output lengths must be equal to num_engine_steps. - return [context.batch_size] * context.num_steps - - assert ( - context.complete_num_requests_per_step - and context.complete_num_requests_per_step > 0 - ), ( - f"Expected a positive complete_num_requests_per_step argument." - f"Instead got {context.complete_num_requests_per_step}" - ) - - # We start dropping after the first decode step. - step_requests = [ - context.batch_size, # prefill - context.batch_size, # decode - ] - - num_running_requests = context.batch_size - num_running_requests -= context.complete_num_requests_per_step - while num_running_requests > 0: - step_requests.append(num_running_requests) - num_running_requests -= context.complete_num_requests_per_step - - if step_requests[-1] != 1: - # have 1 request running at the last step. 
This is often - # useful - step_requests.append(1) - - return step_requests - - -def run_profile( - context: ProfileContext, csv_output: Optional[str], json_output: Optional[str] -): - print("Run profile with:") - for key, value in asdict(context).items(): - print(f" {key} = {value}") - - requests_per_step: list[int] = determine_requests_per_step(context) - - ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( - context.batch_size, requests_per_step - ) - - num_steps_to_profile: int = len(requests_per_step) - max_output_len: int = max(ol_nr.keys()) - assert max_output_len >= 1 - - # Create sampling params - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - # max_tokens is set on a per-request basis. - max_tokens=None, - ignore_eos=True, - ) - - # Create LLM - llm = LLM(**asdict(context.engine_args)) - batch_size = context.batch_size - prompt_len = context.prompt_len - - scheduler_config = llm.llm_engine.vllm_config.scheduler_config - max_model_len = llm.llm_engine.model_config.max_model_len - max_num_batched_tokens = scheduler_config.max_num_batched_tokens - max_num_seqs = scheduler_config.max_num_seqs - - if batch_size * prompt_len > max_num_batched_tokens: - print( - f"ERROR: chosen batch_size * prompt_len " - f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " - f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " - f"and therefore cannot be run in a single profile step, please " - f"choose a smaller batch size or prompt length, or increase " - f"--max-num-batched-tokens" - ) - sys.exit(-1) - if batch_size > max_num_seqs: - print( - f"ERROR: chosen batch_size ({batch_size}) is larger than " - f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " - f"single profile step, please choose a smaller batch size" - ) - sys.exit(-1) - print( - "llm.llm_engine.model_config.max_model_len: ", - llm.llm_engine.model_config.max_model_len, - ) - if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len: - print( - f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " - f"{max_output_len} = {prompt_len + max_output_len}) is larger " - f"than the model's max_model_len ({max_model_len}), please " - f"choose a smaller prompt_len or max_output_len, or increase " - f"--max-model-len" - ) - sys.exit(-1) - - def add_requests(): - def get_output_len_generator() -> Generator[int, Any, Any]: - for output_len, num_reqs in ol_nr.items(): - for _ in range(num_reqs): - yield output_len - - output_len_generator = get_output_len_generator() - for i in range(batch_size): - sampling_params.max_tokens = next(output_len_generator) - assert isinstance(sampling_params.max_tokens, int) - - prompt_token_ids = torch.randint( - llm.get_tokenizer().vocab_size, size=(prompt_len,) - ).tolist() - - llm.llm_engine.add_request( - request_id=f"seq{i}", - prompt={"prompt_token_ids": prompt_token_ids}, - params=sampling_params, - ) - - def abort_requests(): - for i in range(batch_size): - llm.llm_engine.abort_request(f"seq{i}") - - # Warm up run - print("Warm up run ...") - add_requests() - llm.llm_engine.step() # Prefill - llm.llm_engine.step() # Decode - abort_requests() - - print("Profile run ...") - add_requests() - - with layerwise_profile() as prefill_prof: - llm.llm_engine.step() # First step is prefill - - decode_profs = [] - for _ in tqdm.tqdm(range(num_steps_to_profile - 1)): - num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups() - with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof: - 
llm.llm_engine.step() - decode_profs.append(decode_prof) - - decode_results_list = [prof.results for prof in decode_profs] - prefill_results = prefill_prof.results - has_decode = len(decode_results_list) > 0 - - LINE_WIDTH = 80 - print("=" * LINE_WIDTH) - print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * LINE_WIDTH) - print() - prefill_results.print_model_table() - - if has_decode: - print() - print("=" * LINE_WIDTH) - print( - f"= First Decode Step Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})" - ) - print("=" * LINE_WIDTH) - print() - decode_results_list[0].print_model_table() - - print() - print("=" * LINE_WIDTH) - print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * LINE_WIDTH) - print() - prefill_results.print_summary_table() - - if has_decode: - print() - print("=" * LINE_WIDTH) - print( - f"= First Decode Step Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})" - ) - print("=" * LINE_WIDTH) - print() - decode_results_list[0].print_summary_table() - - if csv_output: - csv_filename_base = ( - csv_output[:-4] if csv_output.endswith(".csv") else csv_output - ) - prefill_results.export_model_stats_table_csv( - csv_filename_base + "_prefill_model_table.csv" - ) - prefill_results.export_summary_stats_table_csv( - csv_filename_base + "_prefill_summary_table.csv" - ) - - if has_decode: - decode_results_list[0].export_model_stats_table_csv( - csv_filename_base + "_decode_model_table.csv" - ) - decode_results_list[0].export_summary_stats_table_csv( - csv_filename_base + "_decode_summary_table.csv" - ) - - if json_output: - cuda_devices = [ - torch.cuda.get_device_properties(dev_idx) - for dev_idx in range(torch.cuda.device_count()) - ] - - json_dict = { - "context": { - "python_version": f"{sys.version}", - "torch_version": f"{torch.__version__}", - "torch_cuda_version": f"{torch.version.cuda}", - "cuda_devices": f"{cuda_devices}", - **asdict(context), - }, - "prefill": prefill_results.convert_stats_to_dict(), - } - - if has_decode: - for idx, dr in enumerate(decode_results_list): - json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() - - # Add .json to json_output filename if it doesn't exist already. - json_output_file = ( - json_output if json_output.endswith(".json") else json_output + ".json" - ) - with open(json_output_file, "w+") as f: - json.dump(json_dict, f, indent=2) - pass - - if context.save_chrome_traces_folder is not None: - os.makedirs(context.save_chrome_traces_folder, exist_ok=True) - prefill_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + "/prefill.json" - ) - for idx, decode_prof in enumerate(decode_profs): - decode_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + f"/decode_{idx + 1}.json" - ) - print( - "Traces saved as prefill.json and decode_1.json, etc." 
- f" in folder {context.save_chrome_traces_folder}" - ) - - -def parse_args(): - parser = FlexibleArgumentParser( - description=""" -Profile a model - - example: - ``` - python examples/offline_inference/profiling.py \\ - --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ - --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ - --enforce-eager run_num_steps -n 2 - ``` - - then you can use various tools to analyze the json output - terminal ascii tables: - ``` - python tools/profiler/print_layerwise_table.py \\ - --json-trace Llama31-8b-FP8.json --phase prefill --table summary - ``` - or create matplotlib stacked bar charts: - ``` - python tools/profiler/visualize_layerwise_profile.py \\ - --json-trace Llama31-8b-FP8.json \\ - --output-directory profile_breakdown --plot-metric pct_cuda_time - ``` -""", - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--csv", - type=str, - default=None, - help="Export the results as multiple csv file. This should be the root " - "filename, will create <filename>_prefill_model_table.csv, " - "<filename>_prefill_summary_table.csv, " - "<filename>_decode_model_table.csv, and " - "<filename>_decode_summary_table.csv", - ) - parser.add_argument( - "--json", - type=str, - default=None, - help="Export the results as a json file. This should be the filename", - ) - parser.add_argument( - "--save-chrome-traces-folder", - type=str, - help="Save chrome traces for the prefill and decode " - "will save traces as prefill.json and decode_1.json, " - "etc. inside this folder", - ) - parser.add_argument( - "--prompt-len", - type=int, - default=PROMPT_LEN_DEFAULT, - help=f"Length of the random prompt to use when profiling, all batched " - f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}", - ) - parser.add_argument( - "--batch-size", - type=int, - default=BATCH_SIZE_DEFAULT, - help=f"Number of requests to run as a single batch, " - f"default={BATCH_SIZE_DEFAULT}", - ) - - subparsers = parser.add_subparsers(dest="cmd") - - run_num_steps_parser = subparsers.add_parser( - "run_num_steps", help="This variation profiles n engine.step() invocations." - ) - run_num_steps_parser.add_argument( - "-n", - "--num-steps", - type=int, - help="Number of engine steps to profile.\n" - "Setting it to 1, profiles only the prefill step.\n" - "Setting it to 2, profiles the prefill and first decode step\n" - "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n" - "and so on ...", - ) - - run_to_completion_parser = subparsers.add_parser( - "run_to_completion", - help="This variation profiles all the engine.step() invocations" - "until the engine exhausts all submitted requests.", - ) - run_to_completion_parser.add_argument( - "-n", - "--complete-num-requests-per-step", - type=int, - help="Complete complete_num_requests_per_step requests every decode step." 
- "For e.g., with batch_size 128 and complete_num_requests_per_step 32," - "the profiler is run for 6 engine steps, with the steps processing, " - "128, 128, 96, 64, 32, 1 requests respectively.\n" - "Note that we tack-on a one-request step at the end as it is often " - "useful.", - ) - - EngineArgs.add_cli_args(parser) - - return parser.parse_args() - - -def main(args): - context = ProfileContext( - engine_args=EngineArgs.from_cli_args(args), - **{ - k: v - for k, v in vars(args).items() - if k in inspect.signature(ProfileContext).parameters - }, - ) - run_profile(context, csv_output=args.csv, json_output=args.json) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 24b1c9a93126c..411f3e01bc2cd 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ from unittest.mock import Mock import pytest import torch -from vllm import LLM, envs +from vllm import LLM from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 from ..conftest import HfRunner, VllmRunner @@ -26,14 +26,6 @@ MODELS = [ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_vllm_gc_ed(): """Verify vllm instance is GC'ed when it is deleted""" llm = LLM("distilbert/distilgpt2") @@ -76,12 +68,6 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - if not envs.VLLM_USE_V1: - if async_scheduling: - pytest.skip("async_scheduling only supported in v1.") - if model_executor != "uni": - pytest.skip("only test uniproc executor for v0.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": pytest.skip( f"{backend} does not support gemma2 with full context length.") diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index f3ad680b72b55..508740ab29389 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -122,11 +122,12 @@ def test_cumem_with_cudagraph(): # sleep mode with safetensors ("meta-llama/Llama-3.2-1B", True), # sleep mode with pytorch checkpoint - ("facebook/opt-125m", False), + ("facebook/opt-125m", True), ]) def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + assert use_v1 + m.setenv("VLLM_USE_V1", "1") free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running llm = LLM(model, enable_sleep_mode=True) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 022f183b31932..b6bebbba915be 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str, # Use global backends global backend, backend_unfused - use_v1 = False # can be made a param once V1 support added - monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1))) + monkeypatch.setenv("VLLM_USE_V1", "1") monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa))) # Prompt 4 seems too open-ended, differs between fused and unfused diff --git a/tests/conftest.py b/tests/conftest.py index 
ce9de3bf94b59..f14b1e8780ad9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -160,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch): monkeypatch.delenv("VLLM_USE_V1") -@pytest.fixture(params=[True, False]) -def run_with_both_engines(request, monkeypatch): - # Automatically runs tests twice, once with V1 and once without - use_v1 = request.param - # Tests decorated with `@skip_v1` are only run without v1 - skip_v0 = request.node.get_closest_marker("skip_v0") - skip_v1 = request.node.get_closest_marker("skip_v1") - - if use_v1: - if skip_v1: - pytest.skip("Skipping test on vllm V1") - monkeypatch.setenv('VLLM_USE_V1', '1') - else: - if skip_v0: - pytest.skip("Skipping test on vllm V0") - monkeypatch.setenv('VLLM_USE_V1', '0') - - yield - - @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 3bbbcc755d134..e0ecb02d4f563 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -25,12 +25,6 @@ TOKEN_IDS = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 1b7be15d5d691..b219b33d1760e 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -6,14 +6,6 @@ import pytest from vllm import LLM -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_empty_prompt(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) with pytest.raises(ValueError, match='decoder prompt cannot be empty'): diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 8917aa5a5efb9..f0b61902eb568 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool): "--port", port, ], - env={"VLLM_USE_V1": "1" if use_v1 else "0"}) + env={"VLLM_USE_V1": "1"}) def is_server_up(url): try: diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 190c92e1251c2..f8454ad0a4c4d 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -69,28 +69,20 @@ def generate_params(): @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) -@pytest.mark.parametrize("use_v1", [True, False]) def test_env( device: str, name: str, use_mla: bool, block_size: int, - use_v1: bool, monkeypatch: pytest.MonkeyPatch, ): """Test attention backend selection with valid device-backend pairs.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, name) m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") - if name == "FLASHINFER" and not use_v1: - pytest.skip("FlashInfer backend is only available on V1 engine") - if device == "cpu": - if not use_v1: - pytest.skip("CPU backend only supports V1") - with 
patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float16, None, block_size, @@ -137,7 +129,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name + expected = f"{name}_VLLM_V1" assert backend.get_name() == expected else: backend = get_attn_backend(16, @@ -146,7 +138,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + expected = "TRITON_ATTN_VLLM_V1" assert backend.get_name() == expected elif device == "cuda": @@ -163,11 +155,7 @@ def test_env( # - TRITON_MLA: fallback for other cases if name == "CUTLASS_MLA": - if not use_v1: - # CUTLASS_MLA only supported on V1 engine - pytest.skip( - "CUTLASS_MLA only supported on V1 engine") - elif block_size != 128: + if block_size != 128: # CUTLASS_MLA only supports block_size == 128 pytest.skip( "CUTLASS_MLA only supports block_size 128") @@ -181,11 +169,7 @@ def test_env( expected = "CUTLASS_MLA_VLLM_V1" assert backend.get_name() == expected elif name == "FLASHINFER_MLA": - if not use_v1: - # FlashInfer MLA only supported on V1 engine - pytest.skip( - "FlashInfer MLA only supported on V1 engine") - elif block_size not in [32, 64]: + if block_size not in [32, 64]: # FlashInfer MLA only supports block_size 32 or 64 pytest.skip( "FlashInfer MLA only supports block_size 32 " @@ -217,23 +201,17 @@ def test_env( block_size, False, use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name + expected = f"{name}_VLLM_V1" assert backend.get_name() == expected elif name == "FLASH_ATTN_MLA": - if not use_v1: - # FlashAttention MLA only supported on V1 engine - pytest.skip( - "FlashAttention MLA only supported on V1 engine" - ) - else: - backend = get_attn_backend(16, - torch.float16, - None, - block_size, - False, - use_mla=use_mla) - expected = "FLASH_ATTN_MLA" - assert backend.get_name() == expected + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) + expected = "FLASH_ATTN_MLA" + assert backend.get_name() == expected else: # TRITON_MLA or other fallback backend = get_attn_backend(16, @@ -242,8 +220,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = ("TRITON_MLA_VLLM_V1" - if use_v1 else "TRITON_MLA") + expected = "TRITON_MLA_VLLM_V1" assert backend.get_name() == expected elif name == "FLASHINFER": backend = get_attn_backend(16, @@ -252,7 +229,7 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "FLASHINFER_VLLM_V1" if use_v1 else name + expected = "FLASHINFER_VLLM_V1" assert backend.get_name() == expected else: backend = get_attn_backend(32, @@ -261,36 +238,30 @@ def test_env( block_size, False, use_mla=use_mla) - expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name + expected = "FLASH_ATTN_VLLM_V1" assert backend.get_name() == expected - if use_v1: - backend = get_attn_backend(16, - torch.float16, - None, - block_size, - False, - use_mla=use_mla) - assert backend.get_name() == "FLEX_ATTENTION", ( - "Should fallback to FlexAttention if head size is " - "not supported by FlashAttention") + backend = get_attn_backend(16, + torch.float16, + None, + block_size, + False, + use_mla=use_mla) + assert backend.get_name() == "FLEX_ATTENTION", ( + "Should fallback to FlexAttention if head size is " + "not supported by FlashAttention") @pytest.mark.parametrize("device", ["cpu", "cuda"]) -@pytest.mark.parametrize("use_v1", [True, False]) def test_fp32_fallback( device: str, - use_v1: bool, monkeypatch: 
pytest.MonkeyPatch, ): """Test attention backend selection with fp32.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") if device == "cpu": - if not use_v1: - pytest.skip("CPU backend only supports V1") - with patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float32, None, 16, False) @@ -300,8 +271,7 @@ def test_fp32_fallback( with patch("vllm.attention.selector.current_platform", CudaPlatform()): backend = get_attn_backend(16, torch.float32, None, 16, False) - assert (backend.get_name() == "FLEX_ATTENTION" - if use_v1 else "XFORMERS") + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): @@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != STR_FLASH_ATTN_VAL -@pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): +def test_invalid_env(monkeypatch: pytest.MonkeyPatch): """Test that invalid attention backend names raise ValueError.""" with monkeypatch.context() as m, patch( "vllm.attention.selector.current_platform", CudaPlatform()): - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv("VLLM_USE_V1", "1") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) # Should raise ValueError for invalid backend diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 50c60341f0d88..221d5237823ca 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions. import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) from vllm.lora.request import LoRARequest +from vllm.v1.engine.llm_engine import LLMEngine MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index c14e71cbdb96d..39c4dd735b725 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -15,7 +15,8 @@ from ...utils import check_logprobs_close # have a clean way to fall back, so we fail with # a clear msg when it happens. # https://github.com/vllm-project/vllm/issues/14524 -REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] +# NOTE(woosuk): Skipping these tests until V1 supports them. +# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] # This list contains the model that are using AITER kernel. # Skip model that are not using AITER tests. 
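The attention-selector tests above drop the `use_v1` parameter and pin the V1 engine through environment variables instead. Below is a minimal sketch of that pinning pattern, not part of the patch: `fake_resolve_backend` is a hypothetical stand-in for vLLM's real selector (which consults the current platform), and it assumes `STR_BACKEND_ENV_VAR` corresponds to `VLLM_ATTENTION_BACKEND` and that V1 backend names carry a `_VLLM_V1` suffix, as the expected values in the diff show.

```
# Sketch of the env-pinning test pattern; `fake_resolve_backend` is a
# hypothetical stand-in, not vLLM's actual selector.
import os

import pytest


def fake_resolve_backend() -> str:
    name = os.environ.get("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
    # V1 backends are reported with a _VLLM_V1 suffix in the tests above.
    return f"{name}_VLLM_V1" if os.environ.get("VLLM_USE_V1") == "1" else name


def test_backend_pinned_to_v1(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")  # always V1 after this patch series
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
        assert fake_resolve_backend() == "FLASHINFER_VLLM_V1"
```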
@@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - if model in REQUIRES_V0: - monkeypatch.setenv("VLLM_USE_V1", "0") - if use_rocm_aiter and (model in AITER_MODEL_LIST): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") elif use_rocm_aiter and model not in AITER_MODEL_LIST: diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 206ad1352e06e..0b1f90e27db82 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import SamplingParams -from ...utils import check_logprobs_close, check_outputs_equal +from ...utils import check_logprobs_close # Mark all tests as hybrid pytestmark = pytest.mark.hybrid_model @@ -88,15 +88,6 @@ def test_models( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None - if model in V1_SUPPORTED_MODELS: with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( @@ -104,14 +95,6 @@ def test_models( else: vllm_v1_outputs = None - if vllm_v0_outputs is not None: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - if model in V1_SUPPORTED_MODELS: check_logprobs_close( outputs_0_lst=hf_outputs, @@ -157,45 +140,6 @@ def test_batching( ) -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) -def test_chunked_prefill( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - num_logprobs: int, - chunked_prefill_token_size: int, - monkeypatch, -) -> None: - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, - enable_chunked_prefill=False, - max_num_seqs=max_num_seqs) as vllm_model: - non_chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) - - @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [10]) def test_chunked_prefill_with_parallel_sampling( @@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding( "Could be related to mamba cache not padded correctly") -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_models_preemption_recompute( - vllm_runner, - 
example_prompts, - model: str, - max_tokens: int, - monkeypatch, -) -> None: - """ - Tests that outputs are identical with and w/o preemptions (recompute). - """ - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.llm.llm_engine.scheduler[0] - scheduler.ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) - - scheduler.ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) - - @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( vllm_runner, @@ -386,27 +298,10 @@ def test_full_cuda_graph( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - if model not in V0_UNSUPPORTED_MODELS: - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - else: - vllm_v0_outputs = None - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - if vllm_v0_outputs is not None: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, @@ -442,27 +337,12 @@ def test_fp32_cache_state( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - **{cache_dtype_param: "float32"}) as vllm_model: - vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index 08722ac98b7ed..4ac91b5aed506 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import pytest import torch @@ -82,7 +81,7 @@ def test_prm_models( check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2") - if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": + if current_platform.is_cpu(): pytest.skip("CPU only supports V1") if current_platform.is_rocm(): diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index afc27b6e0566e..97dd4d6135ac4 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -36,9 +36,6 @@ from 
..utils import check_logprobs_close # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) -# Due to low-precision numerical divergence, this test is too sensitive for -# the async postprocessor -@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models( vllm_runner, example_prompts, @@ -49,7 +46,6 @@ def test_models( enforce_eager: bool, backend: str, tensor_parallel_size: int, - disable_async_output_proc: bool, monkeypatch: pytest.MonkeyPatch, ) -> None: """ @@ -74,7 +70,6 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -85,7 +80,6 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -110,9 +104,6 @@ def test_models( ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -# Due to low-precision numerical divergence, this test is too sensitive for -# the async postprocessor -@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_cpu_models( vllm_runner, example_prompts, @@ -120,7 +111,6 @@ def test_cpu_models( base_model: str, test_model: str, max_tokens: int, - disable_async_output_proc: bool, monkeypatch: pytest.MonkeyPatch, ) -> None: """ @@ -138,7 +128,6 @@ def test_cpu_models( max_model_len=MAX_MODEL_LEN, dtype="bfloat16", kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -148,7 +137,6 @@ def test_cpu_models( max_model_len=MAX_MODEL_LEN, dtype="bfloat16", kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 9281579b71e74..b9601114a3183 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,6 @@ from unittest.mock import patch import pytest from vllm import LLM -from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, False)) # Avoid calling model.forward() - def _initialize_kv_caches_v0(self) -> None: - self.cache_config.num_gpu_blocks = 0 - self.cache_config.num_cpu_blocks = 0 - def _initialize_kv_caches_v1(self, vllm_config): kv_cache_specs = self.model_executor.get_kv_cache_specs() scheduler_kv_cache_config = get_kv_cache_configs( @@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config - with (patch.object(V0LLMEngine, "_initialize_kv_caches", - 
_initialize_kv_caches_v0), - patch.object(V1EngineCore, "_initialize_kv_caches", + with (patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m): if model_info.v0_only: - m.setenv("VLLM_USE_V1", "0") + # NOTE(woosuk): skip the test for V0-only models + return + if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"): # Phi4FlashForCausalLM and MotifForCausalLM # only supports DIFFERENTIAL_FLASH_ATTN backend diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 4aa7bb7297893..cb30d77c4f0ea 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -42,6 +42,7 @@ def test_oot_registration_text_generation( assert rest == "" +@pytest.mark.skip(reason="This test is skipped because it failed on V1.") @create_new_process_for_each_test() def test_oot_registration_embedding( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 6e2089ea2e0e2..1d7e4475011d0 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -7,15 +7,6 @@ import torch from vllm.plugins import load_general_plugins -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - def test_platform_plugins(): # simulate workload by running an example import runpy diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 8c21216108685..099869a82ad21 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -3,47 +3,18 @@ import pytest -from vllm.core.scheduler import Scheduler from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler -from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.engine.llm_engine import LLMEngine -class DummyV0Scheduler(Scheduler): - - def schedule(self): - raise Exception("Exception raised by DummyV0Scheduler") - - -class DummyV1Scheduler(V1Scheduler): +class DummyV1Scheduler(Scheduler): def schedule(self): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: - - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) - - engine = LLMEngine.from_engine_args(engine_args=engine_args) - - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert str( - exception_info.value) == "Exception raised by DummyV0Scheduler" - - def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): scheduler_cls=DummyV1Scheduler, ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) sampling_params = SamplingParams(max_tokens=1) 
engine.add_request("0", "foo", sampling_params) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 0320a5ef31a65..2960ffcbd9eab 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM from vllm.assets.audio import AudioAsset - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index ea4a17dd2306f..1d77d37a5d581 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -9,13 +9,6 @@ import pytest from vllm import SamplingParams - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - # We also test with llama because it has generation_config to specify EOS # (past regression). MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 86fc14dc85f80..220a4a53f4671 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -8,12 +8,6 @@ from vllm import SamplingParams MODELS = ["distilbert/distilgpt2"] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 15ea55afe963b..bd2b91073d568 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str, logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) ]) - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1]) -def test_decode_prompt_logprobs_chunked_prefill( - vllm_runner, - model, - chunked_prefill_token_size: int, - example_prompts, - monkeypatch, -): - # VLLM V1 does not use incremental detokenization for - # prompt logprobs, so this test strategy is irrelevant. - monkeypatch.setenv("VLLM_USE_V1", "0") - - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - with vllm_runner(model, - dtype="half", - max_logprobs=5, - gpu_memory_utilization=0.5, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - - vllm_sampling_params = SamplingParams(max_tokens=10, - logprobs=5, - prompt_logprobs=5, - temperature=0.0) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - for idx, result in enumerate(vllm_results): - assert result.prompt_logprobs is not None - assert result.prompt_logprobs[0] is None - - # Compared detokenized prompts ids to original prompt. 
- generated_string = "" - for (prompt_token, - prompt_logprobs) in zip(result.prompt_token_ids[1:], - result.prompt_logprobs[1:]): - # prompt_logprobs is a dict of the token_id: logprob - # We select the token_id corresponding to the actual prompt - # Decoded token in the detokenized string corresponding to this - # prompt token. - generated_string += prompt_logprobs[prompt_token].decoded_token - - assert generated_string == example_prompts[idx], ( - "Detokenized prompt logprobs do not match original prompt") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8912ff8bad429..242fcf501bfc4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1508,14 +1508,6 @@ class EngineArgs: recommend_to_remove=True) return False - if self.kv_cache_dtype != "auto": - supported = current_platform.is_kv_cache_dtype_supported( - self.kv_cache_dtype, model_config) - if not supported: - _raise_or_fallback(feature_name="--kv-cache-dtype", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 014bc56bc8ece..a0fe38eb320d6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,1835 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time -from collections import Counter as collectionsCounter -from collections import deque -from contextlib import contextmanager -from dataclasses import dataclass -from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Literal, Mapping, NamedTuple, Optional) -from typing import Sequence as GenericSequence -from typing import Set, Type, Union, cast +from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine -import torch -import torch.nn as nn -from typing_extensions import TypeVar - -import vllm.envs as envs -from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics_types import StatLoggerBase, Stats -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.entrypoints.openai.logits_processors import ( - get_logits_processors as get_openai_logits_processors) -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.logits_process import get_bad_words_logits_processors -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.cache import processor_only_cache_from_config -from vllm.multimodal.processing import EncDecMultiModalProcessor -from vllm.outputs import (PoolingRequestOutput, RequestOutput, - RequestOutputFactory) -from vllm.reasoning import ReasoningParser, ReasoningParserManager -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import (ExecuteModelRequest, 
ParallelSampleSequenceGroup, - Sequence, SequenceGroup, SequenceGroupBase, - SequenceGroupMetadata, SequenceGroupOutput, - SequenceStatus) -from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, - init_tracer) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import (AnyTokenizer, - init_tokenizer_from_configs) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind -from vllm.version import __version__ as VLLM_VERSION -from vllm.worker.model_runner_base import InputProcessingError -from vllm.worker.worker_base import WorkerBase - -logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5 - -_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) -_R = TypeVar("_R", default=Any) - - -@dataclass -class SchedulerOutputState: - """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - allow_async_output_proc: bool = False - last_output: Optional[SamplerOutput] = None - - -class OutputData(NamedTuple): - outputs: List[SamplerOutput] - seq_group_metadata_list: List[SequenceGroupMetadata] - scheduler_outputs: SchedulerOutputs - is_async: bool - is_last_step: bool - # Indicates if this output is from the first step of the - # multi-step. When multi-step is disabled, this is always - # set to True. - # is_first_step_output is invalid when `outputs` has - # outputs from multiple steps. - is_first_step_output: Optional[bool] - skip: List[int] - - -class SchedulerContext: - - def __init__(self) -> None: - self.output_queue: Deque[OutputData] = deque() - self.request_outputs: List[RequestOutput] = [] - self.seq_group_metadata_list: Optional[ - List[SequenceGroupMetadata]] = None - self.scheduler_outputs: Optional[SchedulerOutputs] = None - - def append_output(self, outputs: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, is_async: bool, - is_last_step: bool, - is_first_step_output: Optional[bool]): - self.output_queue.append( - OutputData(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=is_async, - is_last_step=is_last_step, - is_first_step_output=is_first_step_output, - skip=[])) - - -class LLMEngine: - """An LLM engine that receives requests and generates texts. - - This is the main class for the vLLM engine. It receives requests - from clients and generates texts from the LLM. It includes a tokenizer, a - language model (possibly distributed across multiple GPUs), and GPU memory - space allocated for intermediate states (aka KV cache). This class utilizes - iteration-level scheduling and efficient memory management to maximize the - serving throughput. - - The [`LLM`][vllm.LLM] class wraps this class for offline batched inference - and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine] - class wraps this class for online serving. - - The config arguments are derived from [`EngineArgs`][vllm.EngineArgs]. - - Args: - vllm_config: The configuration for initializing and running vLLM. - executor_class: The model executor class for managing distributed - execution. - log_stats: Whether to log statistics. - usage_context: Specified entry point, used for usage info collection. 
- """ - - DO_VALIDATE_OUTPUT: ClassVar[bool] = False - """A flag to toggle whether to validate the type of request output.""" - - @classmethod - @contextmanager - def enable_output_validation(cls): - cls.DO_VALIDATE_OUTPUT = True - - yield - - cls.DO_VALIDATE_OUTPUT = False - - @classmethod - def validate_output( - cls, - output: object, - output_type: Type[_O], - ) -> _O: - do_validate = cls.DO_VALIDATE_OUTPUT - - if ((TYPE_CHECKING or do_validate) - and not isinstance(output, output_type)): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - return cast(_O, output) - - @classmethod - def validate_outputs( - cls, - outputs: GenericSequence[object], - output_type: Type[_O], - ) -> List[_O]: - do_validate = cls.DO_VALIDATE_OUTPUT - - outputs_: List[_O] - if TYPE_CHECKING or do_validate: - outputs_ = [] - for output in outputs: - if not isinstance(output, output_type): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - outputs_.append(output) - else: - outputs_ = outputs - - return outputs_ - - tokenizer: Optional[AnyTokenizer] - - def __init__( - self, - vllm_config: VllmConfig, - executor_class: Type[ExecutorBase], - log_stats: bool, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - use_cached_outputs: bool = False, - ) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "LLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config # noqa - self.load_config = vllm_config.load_config - self.structured_outputs_config = vllm_config.structured_outputs_config - self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa - ) - - logger.info( - "Initializing a V0 LLM engine (v%s) with config: %s, " - "use_cached_outputs=%s, ", - VLLM_VERSION, - vllm_config, - use_cached_outputs, - ) - - self.log_stats = log_stats - self.use_cached_outputs = use_cached_outputs - - if self.model_config.skip_tokenizer_init: - self.tokenizer = None - self.detokenizer = None - else: - self.tokenizer = self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) - - self.seq_counter = Counter() - self.generation_config_fields = ( - self.model_config.try_get_generation_config()) - - self.input_preprocessor = InputPreprocessor( - self.model_config, - self.tokenizer, - mm_registry, - mm_processor_cache=processor_only_cache_from_config( - self.model_config, mm_registry), - ) - - self.model_executor = executor_class(vllm_config=vllm_config) - - self._initialize_kv_caches() - - # If usage stat is enabled, collect relevant info. 
- if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) - usage_message.report_usage( - get_architecture_class_name(self.model_config), - usage_context, - extra_kvs={ - # Common configuration - "dtype": - str(self.model_config.dtype), - "tensor_parallel_size": - self.parallel_config.tensor_parallel_size, - "block_size": - self.cache_config.block_size, - "gpu_memory_utilization": - self.cache_config.gpu_memory_utilization, - "kv_cache_memory_bytes": - self.cache_config.kv_cache_memory_bytes, - # Quantization - "quantization": - self.model_config.quantization, - "kv_cache_dtype": - str(self.cache_config.cache_dtype), - - # Feature flags - "enable_lora": - bool(self.lora_config), - "enable_prefix_caching": - self.cache_config.enable_prefix_caching, - "enforce_eager": - self.model_config.enforce_eager, - "disable_custom_all_reduce": - self.parallel_config.disable_custom_all_reduce, - }) - - self.cached_scheduler_outputs = [ - SchedulerOutputState() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - self.scheduler_contexts = [ - SchedulerContext() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - if self.model_config.use_async_output_proc: - process_model_outputs = weak_bind(self._process_model_outputs) - - self.async_callbacks = [ - partial(process_model_outputs, - ctx=self.scheduler_contexts[v_id]) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - else: - self.async_callbacks = [] - - # Currently used by AsyncLLMEngine to ensure quick append - # of request outputs to asyncio queues - self.process_request_outputs_callback: Optional[Callable] = None - - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. - if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str): - Scheduler = resolve_obj_by_qualname( - self.vllm_config.scheduler_config.scheduler_cls) - else: - Scheduler = self.vllm_config.scheduler_config.scheduler_cls - self.scheduler = [ - Scheduler( - self.scheduler_config, self.cache_config, self.lora_config, - self.parallel_config.pipeline_parallel_size, - self.async_callbacks[v_id] - if self.model_config.use_async_output_proc else None) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - - # Metric Logging. - if self.log_stats: - if stat_loggers is not None: - self.stat_loggers = stat_loggers - else: - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. - # See https://prometheus.github.io/client_python/multiprocess/ - from vllm.engine.metrics import (LoggingStatLogger, - PrometheusStatLogger) - - self.stat_loggers = { - "logging": - LoggingStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - vllm_config=vllm_config), - "prometheus": - PrometheusStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict( - model_name=self.model_config.served_model_name), - vllm_config=vllm_config), - } - self.stat_loggers["prometheus"].info("cache_config", - self.cache_config) - - self.tracer = None - if self.observability_config.otlp_traces_endpoint: - self.tracer = init_tracer( - "vllm.llm_engine", - self.observability_config.otlp_traces_endpoint) - - # Initialize reasoning parser if reasoning backend is set. 
- if self.structured_outputs_config.reasoning_parser and self.tokenizer: - reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.structured_outputs_config.reasoning_parser) - self.reasoner: ReasoningParser = reasoner_class( - self.tokenizer.get_lora_tokenizer()) - - # Create sequence output processor, e.g. for beam search or - # speculative decoding. - self.output_processor = ( - SequenceGroupOutputProcessor.create_output_processor( - self.scheduler_config, - self.detokenizer, - self.scheduler, - self.seq_counter, - stop_checker=StopChecker( - self.scheduler_config.max_model_len, - self.reasoner - if self.structured_outputs_config.reasoning_parser - and self.tokenizer else None, - ), - )) - - self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} - - # Flag to set when an input fails to process and the engine should run - # the next step without re-scheduling. - self._skip_scheduling_next_step = False - - # Don't keep the dummy data in memory - self.reset_mm_cache() - - def _initialize_kv_caches(self) -> None: - """Initialize the KV cache in the worker(s). - - The workers will determine the number of blocks in both the GPU cache - and the swap CPU cache. - """ - start = time.time() - num_gpu_blocks, num_cpu_blocks = ( - self.model_executor.determine_num_available_blocks()) - - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) - elapsed = time.time() - start - logger.info(("init engine (profile, create kv cache, " - "warmup model) took %.2f seconds"), elapsed) - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - # distributed_executor_backend must be set in VllmConfig.__post_init__ - distributed_executor_backend = ( - engine_config.parallel_config.distributed_executor_backend) - # Initialize the cluster and specify the executor class. - if isinstance(distributed_executor_backend, type): - if not issubclass(distributed_executor_backend, ExecutorBase): - raise TypeError( - "distributed_executor_backend must be a subclass of " - f"ExecutorBase. Got {distributed_executor_backend}.") - executor_class = distributed_executor_backend - elif distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( - RayDistributedExecutor) - executor_class = RayDistributedExecutor - elif distributed_executor_backend == "mp": - from vllm.executor.mp_distributed_executor import ( - MultiprocessingDistributedExecutor) - assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( - "multiprocessing distributed executor backend does not " - "support VLLM_USE_RAY_SPMD_WORKER=1") - executor_class = MultiprocessingDistributedExecutor - elif distributed_executor_backend == "uni": - # JAX-style, single-process, multi-device executor. 
- from vllm.executor.uniproc_executor import UniProcExecutor - executor_class = UniProcExecutor - elif distributed_executor_backend == "external_launcher": - # executor with external launcher - from vllm.executor.uniproc_executor import ( # noqa - ExecutorWithExternalLauncher) - executor_class = ExecutorWithExternalLauncher - else: - raise ValueError("unrecognized distributed_executor_backend: " - f"{distributed_executor_backend}") - return executor_class - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - vllm_config = engine_args.create_engine_config(usage_context) - - engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - engine_cls = V1LLMEngine - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - def __reduce__(self): - # This is to ensure that the LLMEngine is not referenced in - # the closure used to initialize Ray worker actors - raise RuntimeError("LLMEngine should not be pickled!") - - def __del__(self): - # Shutdown model executor when engine is garbage collected - # Use getattr since __init__ can fail before the field is set - if model_executor := getattr(self, "model_executor", None): - model_executor.shutdown() - - def get_tokenizer(self) -> AnyTokenizer: - if self.tokenizer is None: - raise ValueError("Unable to get tokenizer because " - "skip_tokenizer_init is True") - - return self.tokenizer - - def _init_tokenizer(self) -> AnyTokenizer: - return init_tokenizer_from_configs(model_config=self.model_config) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> Optional[SequenceGroup]: - """Add a processed request to the engine's request pool. - return the created sequence group. - """ - if isinstance(params, SamplingParams) and params.n > 1: - ParallelSampleSequenceGroup.add_request( - request_id, - self, - params, - processed_inputs=processed_inputs, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - return None - - self._validate_model_inputs(processed_inputs) - # Create the sequences. 
- block_size = self.cache_config.block_size - seq_id = next(self.seq_counter) - eos_token_id = self.input_preprocessor.get_eos_token_id() - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, - lora_request) - - encoder_seq = (None if encoder_inputs is None else Sequence( - seq_id, encoder_inputs, block_size, eos_token_id, lora_request)) - - # Create a SequenceGroup based on SamplingParams - if isinstance(params, SamplingParams): - seq_group = self._create_sequence_group_with_sampling( - request_id, - seq, - params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority) - else: - raise ValueError("SamplingParams must be provided.") - - # Add the sequence group to the scheduler with least unfinished seqs. - costs = [ - scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler - ] - min_cost_scheduler = self.scheduler[costs.index(min(costs))] - min_cost_scheduler.add_seq_group(seq_group) - - return seq_group - - def stop_remote_worker_execution_loop(self) -> None: - self.model_executor.stop_remote_worker_execution_loop() - - def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - """Add a request to the engine's request pool. - - The request is added to the request pool and will be processed by the - scheduler as `engine.step()` is called. The exact scheduling policy is - determined by the scheduler. - - Args: - request_id: The unique ID of the request. - prompt: The prompt to the LLM. See - [PromptType][vllm.inputs.PromptType] - for more details about the format of each input. - params: Parameters for sampling. - [SamplingParams][vllm.SamplingParams] for text generation. - arrival_time: The arrival time of the request. If None, we use - the current monotonic time. - lora_request: The LoRA request to add. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - - Details: - - Set arrival_time to the current time if it is None. - - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of [Sequence][vllm.sequence.Sequence] objects. - - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object - from the list of [Sequence][vllm.sequence.Sequence]. - - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the - scheduler. - - Example: - >>> # initialize engine - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> # set request arguments - >>> example_prompt = "Who is the president of the United States?" - >>> sampling_params = SamplingParams(temperature=0.0) - >>> request_id = 0 - >>> - >>> # add the request to the engine - >>> engine.add_request( - >>> str(request_id), - >>> example_prompt, - >>> SamplingParams(temperature=0.0)) - >>> # continue the request processing - >>> ... 
- """ - if not isinstance(request_id, str): - raise TypeError( - f"request_id must be a string, got {type(request_id)}") - - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - if isinstance(params, SamplingParams) \ - and params.logits_processors: - raise ValueError( - "Logits processors are not supported in multi-step decoding") - - if arrival_time is None: - arrival_time = time.time() - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None): - if not prompt.get("prompt_token_ids", None): - seq_len = prompt["prompt_embeds"].shape[0] - prompt["prompt_token_ids"] = [0] * seq_len - if params.prompt_logprobs is not None: - raise ValueError( - "prompt_logprobs is not compatible with prompt embeds.") - - processed_inputs = self.input_preprocessor.preprocess( - prompt, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - def _create_sequence_group_with_sampling( - self, - request_id: str, - seq: Sequence, - sampling_params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - encoder_seq: Optional[Sequence] = None, - priority: int = 0, - ) -> SequenceGroup: - """Creates a SequenceGroup with SamplingParams.""" - max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.logprobs - and sampling_params.logprobs > max_logprobs) or ( - sampling_params.prompt_logprobs - and sampling_params.prompt_logprobs > max_logprobs): - raise ValueError(f"Cannot request more than " - f"{max_logprobs} logprobs.") - - sampling_params = self._build_logits_processors( - sampling_params, lora_request) - - # Defensive copy of SamplingParams, which are used by the sampler, - # this doesn't deep-copy LogitsProcessor objects - sampling_params = sampling_params.clone() - - sampling_params.update_from_generation_config( - self.generation_config_fields, seq.eos_token_id) - - # Create the sequence group. - draft_size = 1 - if self.vllm_config.speculative_config is not None: - draft_size = \ - self.vllm_config.speculative_config.num_speculative_tokens + 1 - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority, - draft_size=draft_size) - - return seq_group - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - """Aborts a request(s) with the given ID. - - Args: - request_id: The ID(s) of the request to abort. - - Details: - - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. 
- - Example: - >>> # initialize engine and add a request with request_id - >>> request_id = str(0) - >>> # abort the request - >>> engine.abort_request(request_id) - """ - for scheduler in self.scheduler: - scheduler.abort_seq_group( - request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) - - def get_vllm_config(self) -> VllmConfig: - """Gets the vllm configuration.""" - return self.vllm_config - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - - def get_parallel_config(self) -> ParallelConfig: - """Gets the parallel configuration.""" - return self.parallel_config - - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config - - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return sum(scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler) - - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return any(scheduler.has_unfinished_seqs() - for scheduler in self.scheduler) - - def has_unfinished_requests_for_virtual_engine( - self, virtual_engine: int) -> bool: - """ - Returns True if there are unfinished requests for the virtual engine. - """ - return self.scheduler[virtual_engine].has_unfinished_seqs() - - def reset_mm_cache(self) -> bool: - """Reset the multi-modal cache.""" - self.input_preprocessor.clear_cache() - return True - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for all devices.""" - - success = True - for scheduler in self.scheduler: - success = success and scheduler.reset_prefix_cache(device) - return success - - def _process_model_outputs(self, - ctx: SchedulerContext, - request_id: Optional[str] = None) -> None: - """Apply the model output to the sequences in the scheduled seq groups - and return responses. 
- - ctx: The virtual engine context to work on - request_id: If provided, then only this request is going to be processed - """ - - now = time.time() - - if len(ctx.output_queue) == 0: - return None - - # Get pending async postprocessor - if request_id: - # When we process only one request, no pop is required - # (since later we will process all of the rest) - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, skip) = ctx.output_queue[0] - else: - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, - skip) = ctx.output_queue.popleft() - - # Sanity check - assert len(seq_group_metadata_list) == len( - scheduler_outputs.scheduled_seq_groups) - - has_multiple_outputs: bool = len(outputs) > 1 - outputs_by_sequence_group: List[List[SequenceGroupOutput]] - assert not has_multiple_outputs - outputs_by_sequence_group = outputs - - # Determine the requests we need to operate on - if request_id: - indices = [] - for i, seq_group_meta in enumerate(seq_group_metadata_list): - if seq_group_meta.request_id == request_id: - assert i not in skip # Cannot be called twice - indices.append(i) - break - - # If the request_id was not found, then it means that - # this is a new request that has no pending async - # postprocessor - if not indices: - return - else: - indices = range(len(seq_group_metadata_list)) # type: ignore - - finished_before: List[int] = [] - finished_now: List[int] = [] - for i in indices: - if i in skip: - continue - - seq_group_meta = seq_group_metadata_list[i] - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group: SequenceGroup = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - finished_before.append(i) - continue - - output: List[SequenceGroupOutput] - if has_multiple_outputs: - output = outputs_by_sequence_group[i] - else: - output = [outputs_by_sequence_group[0][i]] - - if not is_async: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) - - if outputs: - for o in outputs: - if (isinstance(o, SamplerOutput) - and seq_group.metrics is not None): - if seq_group.metrics.model_forward_time is not None: - seq_group.metrics.model_forward_time += ( - o.model_forward_time or 0) - else: - seq_group.metrics.model_forward_time = ( - o.model_forward_time) - if seq_group.metrics.model_execute_time is not None: - seq_group.metrics.model_execute_time += ( - o.model_execute_time or 0) - else: - seq_group.metrics.model_execute_time = ( - o.model_execute_time) - - self.output_processor.process_prompt_logprob(seq_group, output) - if seq_group_meta.do_sample: - self.output_processor.process_outputs(seq_group, output, - is_async) - - if seq_group.is_finished(): - finished_now.append(i) - - # Generate outputs for the requests that finished this iteration - for i in finished_now: - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # When we process a single request, we skip it for the next time, - # and invoke the request output callback (if there was final output) - if request_id: - assert len(indices) == 1 - skip.append(indices[0]) - - if (finished_now - and 
self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - - # Free currently finished requests - if finished_now: - for scheduler in self.scheduler: - scheduler.free_finished_seq_groups() - - # Create the outputs - for i in indices: - if i in skip or i in finished_before or i in finished_now: - continue # Avoids double processing - - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # Create outputs only after processing the scheduler's results - - for seq_group in scheduler_outputs.ignored_seq_groups: - params = seq_group.sampling_params - if params is not None and params.output_kind == ( - RequestOutputKind.DELTA) and not seq_group.is_finished(): - continue - - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs, - ) - if request_output: - ctx.request_outputs.append(request_output) - - # Immediately process request outputs here (if callback is given) - if (ctx.request_outputs - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - - # For async case, we need to record the stats here. - # For non-async case, the stats are done in the - # LLMEngine/AsyncLLMEngine directly - if is_async: - # Log stats. - self.do_log_stats(scheduler_outputs, outputs, finished_before, - skip) - - # Tracing - self.do_tracing(scheduler_outputs, finished_before) - - return None - - def _advance_to_next_step( - self, output: SamplerOutput, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done inside output processor, but it is - required if the worker is to perform async forward pass to next step. - """ - for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \ - zip(seq_group_metadata_list, output, scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - continue - - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) - - if seq_group_metadata.do_sample: - assert len(sequence_group_outputs.samples) == 1, ( - "Async output processor expects a single sample" - " (i.e sampling_params.n == 1)") - sample = sequence_group_outputs.samples[0] - - assert len(seq_group.seqs) == 1 - seq = seq_group.seqs[0] - - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - - def step(self) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - - <figure markdown="span"> - ![Overview of the step function](https://i.imgur.com/sv2HssD.png) - <figcaption>Overview of the step function</figcaption> - </figure> - - Details: - - Step 1: Schedules the sequences to be executed in the next - iteration and the token blocks to be swapped in/out/copy. 
- - - Depending on the scheduling policy, - sequences may be `preempted/reordered`. - A Sequence Group (SG) refers to a group of sequences - that are generated from the same prompt. - - - Step 2: Calls the distributed executor to execute the model. - - Step 3: Processes the model output. This mainly includes: - - - Decodes the relevant outputs. - - Updates the scheduled sequence groups with model outputs - based on its `sampling parameters` (`use_beam_search` or not). - - Frees the finished sequence groups. - - - Finally, it creates and returns the newly generated results. - - Example: - ``` - # Please see the example/ folder for more detailed examples. - - # initialize engine and request arguments - engine = LLMEngine.from_engine_args(engine_args) - example_inputs = [(0, "What is LLM?", - SamplingParams(temperature=0.0))] - - # Start the engine with an event loop - while True: - if example_inputs: - req_id, prompt, sampling_params = example_inputs.pop(0) - engine.add_request(str(req_id), prompt, sampling_params) - - # continue the request processing - request_outputs = engine.step() - for request_output in request_outputs: - if request_output.finished: - # return or show the request output - - if not (engine.has_unfinished_requests() or example_inputs): - break - ``` - """ - if self.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is only supported through AsyncLLMEngine " - "as performance will be severely degraded otherwise.") - - # For llm_engine, there is no pipeline parallel support, so the engine - # used is always 0. - virtual_engine = 0 - - # These are cached outputs from previous iterations. None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # Skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - # The scheduler is also skipped if a single request caused the last - # engine step to fail, and the previous schedule needs to be rerun. - if not self._has_remaining_steps( - seq_group_metadata_list - ) and not self._skip_scheduling_next_step: - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - # When n>1, elements in self.seq_id_to_seq_group should be deleted - # here, otherwise memory leaks. - for finished_request_id in finished_requests_ids: - if finished_request_id in self.seq_id_to_seq_group: - del self.seq_id_to_seq_group[finished_request_id] - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration.
- # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - try: - outputs = self.model_executor.execute_model( - execute_model_req=execute_model_req) - self._skip_scheduling_next_step = False - except InputProcessingError as e: - # The input for this request cannot be processed, so we must - # abort it. If there are remaining requests in the batch that - # have been scheduled, they will be retried on the next step. - invalid_request_id = e.request_id - self._abort_and_cache_schedule( - request_id=invalid_request_id, - virtual_engine=virtual_engine, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - allow_async_output_proc=allow_async_output_proc) - # Raise so the caller is notified that this request failed - raise - - else: - # Nothing scheduled => If there is pending async postprocessor, - # then finish it here. - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - # No outputs in this case - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. - is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - # Add results to the output_queue - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len(outputs) == 1, ( - "Async postprocessor expects only a single output set") - - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - # Check if need to run the usual non-async path - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - # Stop the execute model loop in parallel workers until there are - # more requests to process. This avoids waiting indefinitely in - # torch.distributed ops which may otherwise time out, and unblocks - # the RPC thread in the workers so that they can process any other - # queued control plane messages, such as add/remove lora adapters. 
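The `InputProcessingError` path above isolates a single bad request: it is aborted and pruned from the cached schedule, the engine flags itself to reuse that schedule, and the error is re-raised to the caller of `step()`. A self-contained toy of the same abort-and-reuse pattern; every name here is illustrative rather than a vLLM API:

```python
# Toy version of the abort-and-reuse pattern: drop the failing request,
# keep the rest of the scheduled batch for the next step.
class InputError(Exception):
    def __init__(self, request_id: str):
        self.request_id = request_id

def run_batch(batch: list[str]) -> list[str]:
    if "bad" in batch:
        raise InputError("bad")
    return [f"out::{req}" for req in batch]

batch = ["a", "bad", "b"]
skip_scheduling_next_step = False
try:
    run_batch(batch)
except InputError as e:
    # Abort only the offending request; the real method re-raises here.
    batch = [req for req in batch if req != e.request_id]
    skip_scheduling_next_step = bool(batch)

assert batch == ["a", "b"] and skip_scheduling_next_step
assert run_batch(batch) == ["out::a", "out::b"]  # retried, not re-scheduled
```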
- logger.debug("Stopping remote worker execution loop.") - self.model_executor.stop_remote_worker_execution_loop() - - return ctx.request_outputs - - def _abort_and_cache_schedule( - self, request_id: str, virtual_engine: int, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - """Aborts a single request, and caches the scheduler outputs minus that - request. This allows the next step to continue processing the remaining - requests without having to re-run the scheduler.""" - - # Abort the request and remove its sequence group from the current - # schedule - self.abort_request(request_id) - for i, metadata in enumerate(seq_group_metadata_list): - if metadata.request_id == request_id: - del seq_group_metadata_list[i] - break - for i, group in enumerate(scheduler_outputs.scheduled_seq_groups): - if group.seq_group.request_id == request_id: - del scheduler_outputs.scheduled_seq_groups[i] - break - - # If there are still other sequence groups left in the schedule, cache - # them and flag the engine to reuse the schedule. - if len(seq_group_metadata_list) > 0: - self._skip_scheduling_next_step = True - # Reuse multi-step caching logic - self._cache_scheduler_outputs_for_multi_step( - virtual_engine=virtual_engine, - scheduler_outputs=scheduler_outputs, - seq_group_metadata_list=seq_group_metadata_list, - allow_async_output_proc=allow_async_output_proc) - - def _has_remaining_steps( - self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] - ) -> bool: - return False - - def _cache_scheduler_outputs_for_multi_step( - self, virtual_engine: int, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - co = self.cached_scheduler_outputs[virtual_engine] - - co.seq_group_metadata_list = seq_group_metadata_list - co.scheduler_outputs = scheduler_outputs - co.allow_async_output_proc = allow_async_output_proc - co.last_output = None - - def _update_cached_scheduler_output( - self, virtual_engine: int, - output: List[Optional[SamplerOutput]]) -> None: - if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 - and output[0] is not None): - last_output = output[-1] - assert last_output is not None - assert last_output.sampled_token_ids_cpu is not None - assert last_output.sampled_token_ids is None - assert last_output.sampled_token_probs is None - self.cached_scheduler_outputs[ - virtual_engine].last_output = last_output - - def _get_last_sampled_token_ids( - self, virtual_engine: int) -> Optional[torch.Tensor]: - return None - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. Set `disable_log_stats=False` " - "argument to enable.") - if logger_name in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} already exists.") - self.stat_loggers[logger_name] = logger - - def remove_logger(self, logger_name: str) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. 
Set `disable_log_stats=False` " - "argument to enable.") - if logger_name not in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} does not exist.") - del self.stat_loggers[logger_name] - - def do_log_stats(self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> None: - """Forced log when no requests active.""" - if self.log_stats: - stats = self._get_stats(scheduler_outputs, model_output, - finished_before, skip) - for logger in self.stat_loggers.values(): - logger.log(stats) - - def _get_stats(self, - scheduler_outputs: Optional[SchedulerOutputs], - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> Stats: - """Get Stats to be Logged to Prometheus. - - Args: - scheduler_outputs: Optional, used to populate metrics related to - the scheduled batch, - model_output: Optional, used to emit speculative decoding metrics - which are created by the workers. - finished_before: Optional, indices of sequences that were finished - before. These sequences will be ignored. - skip: Optional, indices of sequences that were preempted. These - sequences will be ignored. - """ - now = time.time() - - # System State - # Scheduler State - num_running_sys = sum( - len(scheduler.running) for scheduler in self.scheduler) - num_swapped_sys = sum( - len(scheduler.swapped) for scheduler in self.scheduler) - num_waiting_sys = sum( - len(scheduler.waiting) for scheduler in self.scheduler) - - # KV Cache Usage in % - num_total_gpu = self.cache_config.num_gpu_blocks - gpu_cache_usage_sys = 0. - if num_total_gpu: # Guard against both None and 0 - num_free_gpu = sum( - scheduler.block_manager.get_num_free_gpu_blocks() - for scheduler in self.scheduler) - gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) - - num_total_cpu = self.cache_config.num_cpu_blocks - cpu_cache_usage_sys = 0. - if num_total_cpu: # Guard against both None and 0 - num_free_cpu = sum( - scheduler.block_manager.get_num_free_cpu_blocks() - for scheduler in self.scheduler) - cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) - - # Prefix Cache Hit Rate. Note that we always use - # the cache hit rate of the first virtual engine. - cpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.CPU) - gpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.GPU) - - # Exchange the usage and cache hit stats between gpu and cpu when - # running on cpu because the cpu_worker.py intentionally reports the - # number of cpu blocks as gpu blocks in favor of cache management.
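The gauge math above reduces to `1 - free/total`, with the truthiness check covering both an unprofiled cache (`None`) and a zero block count. A compact, runnable restatement of that guard:

```python
# Sketch of the KV-cache usage gauge computed above.
from typing import Optional

def cache_usage(num_total_blocks: Optional[int],
                num_free_blocks: int) -> float:
    if not num_total_blocks:  # guards against both None and 0
        return 0.0
    return 1.0 - (num_free_blocks / num_total_blocks)

assert cache_usage(None, 0) == 0.0     # cache not profiled yet
assert cache_usage(1000, 250) == 0.75  # 75% of blocks in use
```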
- if self.device_config.device_type == "cpu": - num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu - gpu_cache_usage_sys, cpu_cache_usage_sys = ( - cpu_cache_usage_sys, - gpu_cache_usage_sys, - ) - gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = ( - cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate, - ) - - # Iteration stats - num_prompt_tokens_iter = 0 - num_generation_tokens_iter = 0 - num_tokens_iter = 0 - time_to_first_tokens_iter: List[float] = [] - inter_token_latencies_iter: List[float] = [] - num_preemption_iter = (0 if scheduler_outputs is None else - scheduler_outputs.preempted) - - # Request stats - # Latency - time_e2e_requests: List[float] = [] - time_queue_requests: List[float] = [] - time_inference_requests: List[float] = [] - time_prefill_requests: List[float] = [] - time_decode_requests: List[float] = [] - # Metadata - num_prompt_tokens_requests: List[int] = [] - num_generation_tokens_requests: List[int] = [] - n_requests: List[int] = [] - max_num_generation_tokens_requests: List[int] = [] - max_tokens_requests: List[int] = [] - finished_reason_requests: List[str] = [] - - # LoRA requests - running_lora_adapters = dict( - collectionsCounter([ - running_request.lora_request.lora_name - for scheduler in self.scheduler - for running_request in scheduler.running - if running_request.lora_request - ])) - waiting_lora_adapters = dict( - collectionsCounter([ - waiting_request.lora_request.lora_name - for scheduler in self.scheduler - for waiting_request in scheduler.waiting - if waiting_request.lora_request - ])) - max_lora_stat = "0" - if self.lora_config: - max_lora_stat = str(self.lora_config.max_loras) - - # NOTE: This loop assumes prefill seq_groups are before - # decode seq_groups in scheduled_seq_groups. - if scheduler_outputs is not None: - # For async postprocessor, already finished sequences need to be - # not counted (to avoid double counting) - actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore - - num_generation_tokens_from_prefill_groups = 0 - # NOTE: if scheduler_outputs.num_prefill_groups > 0 and - # the len of scheduler_outputs.scheduled_seq_groups is != - # scheduler_outputs.num_prefill_groups, this means that - # chunked prefills have been detected. - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double logging when using async output proc - if finished_before and idx in finished_before: - actual_num_batched_tokens -= 1 - continue - - # Currently, skip == preempted sequences, so we need to skip - # their log stats - if skip and idx in skip: - continue - - group_was_prefill = idx < scheduler_outputs.num_prefill_groups - seq_group = scheduled_seq_group.seq_group - - # NOTE: a seq_group that completed all of its prefill tokens - # in the last iteration will have seq_group.is_prefill() = False - # with group_was_prefill = True - if group_was_prefill: - # Number of prompt tokens. - num_prompt_tokens_iter += ( - scheduled_seq_group.token_chunk_size) - - # If the seq_group just finished the prefill state - # get TTFT. - if not seq_group.is_prefill(): - latency = seq_group.get_last_token_latency() - time_to_first_tokens_iter.append(latency) - - # One generation token per finished prefill. 
- num_generation_tokens_from_prefill_groups += ( - seq_group.num_seqs()) - else: - # ITLs - latency = seq_group.get_last_token_latency() - inter_token_latencies_iter.append(latency) - if seq_group.state.current_step == 0: - # For async_output_proc, the do_log_stats() - # is called following init_multi_step(), which - # sets the current_step to zero. - actual_num_batched_tokens +=\ - seq_group.state.num_steps - 1 - else: - actual_num_batched_tokens +=\ - seq_group.state.current_step - 1 - - # Because of chunked prefill, we can have a single sequence - # group that does multiple prompt_runs. To prevent logging - # the same metadata more than once per request, we standardize - # on logging request level information for finished requests, - # which can only happen once. - if seq_group.is_finished(): - # Latency timings - time_e2e_requests.append(now - - seq_group.metrics.arrival_time) - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): - time_queue_requests.append( - seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) - time_prefill_requests.append( - seq_group.metrics.first_token_time - - seq_group.metrics.first_scheduled_time) - time_decode_requests.append( - now - seq_group.metrics.first_token_time) - time_inference_requests.append( - now - seq_group.metrics.first_scheduled_time) - # Metadata - num_prompt_tokens_requests.append( - len(seq_group.prompt_token_ids)) - num_generation_tokens_requests.extend([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ]) - max_num_generation_tokens_requests.append( - max(seq.get_output_len() - for seq in seq_group.get_seqs())) - if seq_group.sampling_params is not None: - n_requests.append(seq_group.sampling_params.n) - max_tokens_requests.append( - seq_group.sampling_params.max_tokens) - finished_reason_requests.extend([ - SequenceStatus.get_finished_reason(seq.status) - for seq in seq_group.get_finished_seqs() - ]) - - # Number of generation tokens. - # num_batched_tokens equals the number of prompt_tokens plus the - # number of decode_tokens in a single iteration. So, - # num_generation_tokens = num_batched_tokens - num_prompt_tokens - # + num_generation_tokens_from_prefill_groups (since we generate - # one token on prefills on iters where the prefill finishes). 
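A worked instance of the identity in the comment above, with illustrative numbers: suppose an iteration batches 90 prompt tokens plus 10 decode tokens, and two prefills finish this step (each sampling one token):

```python
# num_generation_tokens = num_batched_tokens - num_prompt_tokens
#                         + num_generation_tokens_from_prefill_groups
actual_num_batched_tokens = 100                # 90 prompt + 10 decode
num_prompt_tokens_iter = 90
num_generation_tokens_from_prefill_groups = 2  # finished prefills sample too

num_generation_tokens_iter = (actual_num_batched_tokens -
                              num_prompt_tokens_iter +
                              num_generation_tokens_from_prefill_groups)
assert num_generation_tokens_iter == 12  # 10 decodes + 2 prefill samples
assert num_generation_tokens_iter + num_prompt_tokens_iter == 102
```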
- num_generation_tokens_iter = ( - actual_num_batched_tokens - num_prompt_tokens_iter + - num_generation_tokens_from_prefill_groups) - num_tokens_iter = (num_generation_tokens_iter + - num_prompt_tokens_iter) - - return Stats( - now=now, - # System stats - # Scheduler State - num_running_sys=num_running_sys, - num_swapped_sys=num_swapped_sys, - num_waiting_sys=num_waiting_sys, - # KV Cache Usage in % - gpu_cache_usage_sys=gpu_cache_usage_sys, - cpu_cache_usage_sys=cpu_cache_usage_sys, - # Prefix Cache Hit Rate - cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate, - - # Iteration stats - num_prompt_tokens_iter=num_prompt_tokens_iter, - num_generation_tokens_iter=num_generation_tokens_iter, - num_tokens_iter=num_tokens_iter, - time_to_first_tokens_iter=time_to_first_tokens_iter, - inter_token_latencies_iter=inter_token_latencies_iter, - num_preemption_iter=num_preemption_iter, - - # Request stats - # Latency - time_e2e_requests=time_e2e_requests, - time_queue_requests=time_queue_requests, - time_inference_requests=time_inference_requests, - time_prefill_requests=time_prefill_requests, - time_decode_requests=time_decode_requests, - # Metadata - num_prompt_tokens_requests=num_prompt_tokens_requests, - num_generation_tokens_requests=num_generation_tokens_requests, - max_num_generation_tokens_requests= - max_num_generation_tokens_requests, - n_requests=n_requests, - max_tokens_requests=max_tokens_requests, - finished_reason_requests=finished_reason_requests, - max_lora=str(max_lora_stat), - waiting_lora_adapters=list(waiting_lora_adapters.keys()), - running_lora_adapters=list(running_lora_adapters.keys())) - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_executor.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_executor.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_executor.list_loras() - - def pin_lora(self, lora_id: int) -> bool: - return self.model_executor.pin_lora(lora_id) - - def start_profile(self) -> None: - self.model_executor.start_profile() - - def stop_profile(self) -> None: - self.model_executor.stop_profile() - - def sleep(self, level: int = 1) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.sleep(level=level) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.wake_up(tags) - - def is_sleeping(self) -> bool: - return self.model_executor.is_sleeping - - def check_health(self) -> None: - self.model_executor.check_health() - - def is_tracing_enabled(self) -> bool: - return self.tracer is not None - - def do_tracing(self, - scheduler_outputs: SchedulerOutputs, - finished_before: Optional[List[int]] = None) -> None: - if self.tracer is None: - return - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double tracing when using async output proc - if finished_before and idx in finished_before: - continue - - seq_group = scheduled_seq_group.seq_group - if seq_group.is_finished(): - self.create_trace_span(seq_group) - - def create_trace_span(self, seq_group: SequenceGroup) -> None: - if self.tracer is None or seq_group.sampling_params is None: - return - arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) - - trace_context = 
extract_trace_context(seq_group.trace_headers) - - with self.tracer.start_as_current_span( - "llm_request", - kind=SpanKind.SERVER, - context=trace_context, - start_time=arrival_time_nano_seconds) as seq_span: - metrics = seq_group.metrics - - # Handle potential None values for cancelled/aborted requests - ttft = (metrics.first_token_time - metrics.arrival_time - if metrics.first_token_time is not None else None) - - e2e_time = (metrics.finished_time - metrics.arrival_time - if metrics.finished_time is not None else None) - - seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, - self.model_config.model) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, - seq_group.request_id) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, - seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, - seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, - seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, - seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, - seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, - len(seq_group.prompt_token_ids)) - seq_span.set_attribute( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, - sum([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ])) - - # Only set timing attributes if the values are available - if metrics.time_in_queue is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, - metrics.time_in_queue) - if ttft is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - if e2e_time is not None: - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, - e2e_time) - if metrics.scheduler_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, - metrics.scheduler_time) - if metrics.model_forward_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, - metrics.model_forward_time / 1000.0) - if metrics.model_execute_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, - metrics.model_execute_time) - - def _validate_model_inputs(self, inputs: ProcessorInputs): - encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) - - if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, prompt_type="encoder") - - self._validate_model_input(decoder_inputs, prompt_type="decoder") - - def _validate_model_input( - self, - prompt_inputs: SingletonInputs, - *, - prompt_type: Literal["encoder", "decoder"], - ): - model_config = self.model_config - tokenizer = self.tokenizer - - prompt_ids = prompt_inputs.get("prompt_token_ids", []) - if not prompt_ids: - if prompt_type == "encoder" and model_config.is_multimodal_model: - pass # Mllama may have empty encoder inputs for text-only data - elif prompt_inputs["type"] == "embeds": - pass - else: - raise ValueError(f"The {prompt_type} prompt cannot be empty") - - if tokenizer is not None: - max_input_id = max(prompt_ids, default=0) - if max_input_id > tokenizer.max_token_id: - raise ValueError( - f"Token id {max_input_id} is out of vocabulary") - - max_prompt_len = self.model_config.max_model_len - if len(prompt_ids) > max_prompt_len: - if prompt_type == "encoder" and model_config.is_multimodal_model: - mm_registry = self.input_preprocessor.mm_registry 
- mm_processor = mm_registry.create_processor( - model_config, - tokenizer=tokenizer or object(), # Dummy if no tokenizer - ) - assert isinstance(mm_processor, EncDecMultiModalProcessor) - - if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper - - if model_config.is_multimodal_model: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens plus multimodal tokens. For image " - "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") - else: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens.") - - raise ValueError( - f"The {prompt_type} prompt (length {len(prompt_ids)}) is " - f"longer than the maximum model length of {max_prompt_len}. " - f"{suggestion}") - - # TODO: Find out how many placeholder tokens are there so we can - # check that chunked prefill does not truncate them - # max_batch_len = self.scheduler_config.max_num_batched_tokens - - def _build_logits_processors( - self, sampling_params: SamplingParams, - lora_request: Optional[LoRARequest]) -> SamplingParams: - """Constructs logits processors based on the logits_bias, and - allowed_token_ids fields in sampling_params. Deletes those fields and - adds the constructed logits processors to the logits_processors field. - Returns the modified sampling params.""" - - logits_processors = [] - - if (sampling_params.logit_bias or sampling_params.allowed_token_ids): - tokenizer = self.get_tokenizer() - - processors = get_openai_logits_processors( - logit_bias=sampling_params.logit_bias, - allowed_token_ids=sampling_params.allowed_token_ids, - tokenizer=tokenizer) - logits_processors.extend(processors) - - # Unset so these don't get passed down to the model - sampling_params.logit_bias = None - sampling_params.allowed_token_ids = None - - if len(sampling_params.bad_words) > 0: - tokenizer = self.get_tokenizer() - processors = get_bad_words_logits_processors( - bad_words=sampling_params.bad_words, tokenizer=tokenizer) - logits_processors.extend(processors) - - if logits_processors: - if sampling_params.logits_processors is None: - sampling_params.logits_processors = logits_processors - else: - sampling_params.logits_processors.extend(logits_processors) - - return sampling_params - - def collective_rpc(self, - method: Union[str, Callable[[WorkerBase], _R]], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None) -> list[_R]: - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) - - def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - return self.collective_rpc("apply_model", args=(func, )) - - -if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - LLMEngine = V1LLMEngine # type: ignore +LLMEngine = V1LLMEngine # type: ignore diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f2282c40f7073..0ab806fcb8b5c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -11,7 +11,6 @@ from pydantic import ValidationError from tqdm.auto import tqdm from typing_extensions import TypeVar -import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) @@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType, StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import 
(ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) -from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption, apply_hf_chat_template, @@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, as_iter, is_list_of +from vllm.v1.engine.llm_engine import LLMEngine from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: @@ -309,11 +308,7 @@ class LLM: self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None - if envs.VLLM_USE_V1: - supported_tasks = self.llm_engine \ - .get_supported_tasks() # type: ignore - else: - supported_tasks = self.llm_engine.model_config.supported_tasks + supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore logger.info("Supported_tasks: %s", supported_tasks) @@ -1473,8 +1468,6 @@ class LLM: Note: This method is only available with the V1 LLM engine. """ - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - assert isinstance(self.llm_engine, V1LLMEngine) return self.llm_engine.get_metrics() def _validate_and_add_requests( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 58296131fadb9..13f4eebf1038e 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs", ) as stream: stream.write(encryption_params.key) - from vllm import LLMEngine - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + assert envs.VLLM_USE_V1 - if not envs.VLLM_USE_V1: - engine = LLMEngine.from_engine_args(engine_args) - engine.model_executor.collective_rpc( - "save_tensorized_model", - kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, - ) - else: - engine = V1LLMEngine.from_vllm_config(engine_config) - engine.collective_rpc( - "save_tensorized_model", - kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, - ) + from vllm.v1.engine.llm_engine import LLMEngine + + engine = LLMEngine.from_vllm_config(engine_config) + engine.collective_rpc( + "save_tensorized_model", + kwargs={"tensorizer_config": tensorizer_config.to_serializable()}, + ) def tensorize_lora_adapter(lora_path: str, From 86647d1cd0f3c82c7d678324db7e925654ac5665 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 17:57:20 -0700 Subject: [PATCH 515/584] [V0 Deprecation] Remove V0 Output Processor (#25320) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/build_cython.py | 39 ----- vllm/engine/output_processor/__init__.py | 0 vllm/engine/output_processor/interfaces.py | 59 -------- vllm/engine/output_processor/single_step.py | 145 ------------------- vllm/engine/output_processor/stop_checker.py | 139 ------------------ vllm/v1/engine/detokenizer.py | 42 +++++- 6 files changed, 40 insertions(+), 384 deletions(-) delete mode 100644 tests/build_cython.py delete mode 100644 vllm/engine/output_processor/__init__.py delete mode 100644 vllm/engine/output_processor/interfaces.py delete mode 100644 vllm/engine/output_processor/single_step.py delete mode 100644 vllm/engine/output_processor/stop_checker.py diff --git a/tests/build_cython.py b/tests/build_cython.py deleted 
file mode 100644 index 444434e8f0a79..0000000000000 --- a/tests/build_cython.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import Cython.Compiler.Options -from Cython.Build import cythonize -from setuptools import setup - -Cython.Compiler.Options.annotate = True - -infiles = [] - -infiles += [ - "vllm/engine/llm_engine.py", - "vllm/transformers_utils/detokenizer.py", - "vllm/engine/output_processor/single_step.py", - "vllm/outputs.py", - "vllm/engine/output_processor/stop_checker.py", -] - -infiles += [ - "vllm/core/scheduler.py", - "vllm/sequence.py", - "vllm/core/block_manager.py", -] - -infiles += [ - "vllm/model_executor/layers/sampler.py", - "vllm/sampling_params.py", - "vllm/utils/__init__.py", -] - -setup(ext_modules=cythonize(infiles, - annotate=False, - force=True, - compiler_directives={ - 'language_level': "3", - 'infer_types': True - })) - -# example usage: python3 build_cython.py build_ext --inplace diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py deleted file mode 100644 index 587a9221e32c8..0000000000000 --- a/vllm/engine/output_processor/interfaces.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sequence import SequenceGroup, SequenceGroupOutput -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - - -class SequenceGroupOutputProcessor(ABC): - """Interface for logic that processes new token ids in sequence groups, - managing detokenization, stop checking, and freeing/forking sequences with - the scheduler. - - This is highly coupled with the LLMEngine and should be seen as an extension - of it. The logic is separated to simplify the LLMEngine class and allow - separate implementations for single-step decoding (which supports beam - search sequence forking) and multi-step decoding (which does not support - beam search, but does support speculative decoding). - """ - - @staticmethod - def create_output_processor( - scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - stop_checker: "StopChecker", - ): - """Create an output processor. - - Multi-step scheduling is no longer supported. Always return a - single-step output processor. - """ - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, stop_checker) - - @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Process new token ids for the sequence group. Handles logic such as - detokenization, stop checking, and freeing/forking sequences in the - scheduler. 
- """ - pass - - @abstractmethod - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Update prompt logprobs received from outputs to seq_group.""" - pass diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py deleted file mode 100644 index dbf6a371d050a..0000000000000 --- a/vllm/engine/output_processor/single_step.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sequence import (CompletionSequenceGroupOutput, SequenceGroup, - SequenceGroupOutput) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -def single_step_process_prompt_logprob( - sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, - output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step. - - Do nothing if the output has no prompt logprobs. - - Account for the fact that transformers do not compute first-token logprobs. - - Args: - sg_output_proc: - [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor] - instance - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - prompt_logprobs = output.prompt_logprobs - - # If this is the first (or only) "chunk" of the prefill, we need - # to prepend None to the list of prompt logprobs. The reason for this - # is that for N prompt tokens, the Sampler will generate N-1 total - # prompt logprobs during prefill since the token at idx 0 will not - # have a logprob associated with it. - if prompt_logprobs is not None: - if not seq_group.prompt_logprobs: - prompt_logprobs = [None] + prompt_logprobs - seq_group.prompt_logprobs = [] - - assert hasattr(sg_output_proc, 'detokenizer') - if (seq_group.sampling_params.detokenize - and sg_output_proc.detokenizer): - sg_output_proc.detokenizer.decode_prompt_logprobs_inplace( - seq_group, - prompt_logprobs, - position_offset=len(seq_group.prompt_logprobs)) - - seq_group.prompt_logprobs.extend(prompt_logprobs) - - -class SingleStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles "output processing" logic, - which happens after the model returns generated token ids and before - scheduling of the next batch. Output processing logic includes - detokenization, and determining if a sequence is finished (e.g. via max len - or eos token). - - The SingleStepOutputProcessor is specialized to the case where the model - emits at most a single token per invocation, which precludes configurations - such as speculative decoding or multi-step decoding. This enables beam - search sampling, which requires forking/finishing/freeing sequences in a way - that is currently difficult to schedule multiple steps ahead of time. 
- """ - - def __init__(self, scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, scheduler: List[Scheduler], - seq_counter: Counter, stop_checker: StopChecker): - self.scheduler_config = scheduler_config - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.stop_checker = stop_checker - - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Append all new tokens to sequences in the sequence group. Fork any - surviving beam candidates; free any unsurviving ones. - - Invokes detokenizer to detokenize new tokens, and also marks sequences - as finished if they meet stop conditions. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - assert (len(outputs) == 1 - ), f"{type(self)} does not support multiple outputs per step" - return self._process_sequence_group_outputs(sequence_group, outputs[0], - is_async) - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with one step of a single-step- - scheduled computation. - - Args: - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - assert len(outputs) == 1, "Single step should only have 1 output." - output = outputs[0] - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput, - is_async: bool) -> None: - sampling_params = seq_group.sampling_params - - sample = outputs.samples[0] - seq = seq_group.first_seq - if not is_async: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - else: - new_char_count = 0 - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count, - sampling_params, - lora_req=seq_group.lora_request, - ) - if seq.is_finished(): - for scheduler in self.scheduler: - scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py deleted file mode 100644 index 0916f1c918c85..0000000000000 --- a/vllm/engine/output_processor/stop_checker.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -from vllm.lora.request import LoRARequest -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus - - -class StopChecker: - """LLMEngine helper class which separates out the logic involving stop - checking. This checks things such as: whether the eos token was emitted, - whether the max_tokens has been consumed, whether a stop string has been - emitted, or if we have exceeded the max model len. 
- """ - - def __init__( - self, - max_model_len: int, - reasoner: Optional[ReasoningParser] = None, - ): - # Do not use it directly, but use `self._get_max_model_len`. - self._max_model_len = max_model_len - self.reasoner = reasoner - - def _get_max_model_len(self, lora_req: Optional[LoRARequest]): - if lora_req and lora_req.long_lora_max_len: - return lora_req.long_lora_max_len - else: - return self._max_model_len - - def maybe_stop_sequence( - self, - seq: Sequence, - new_char_count: int, - sampling_params: SamplingParams, - lora_req: Optional[LoRARequest] = None, - ) -> None: - """Stop the finished sequences. - - new_char_count is the number of chars added to the - sequence's output text for the newly generated token - """ - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - # Remove the last EOS token unless explicitly specified - # This prevents unintended exposure of the EOS token - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - return - - # Skip stop string/token checks if in reasoning content generation - if self.reasoner is not None and \ - not self.reasoner.is_reasoning_end(seq.get_token_ids()): - return - - # Check if a stop token was encountered. - # This assumes a single token produced per step. - last_token_id = seq.get_last_token_id() - if last_token_id in (sampling_params.stop_token_ids or ()): - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - # Remove last token - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if any stop strings are matched. - stop = self.check_stop_strings( - seq.output_text, new_char_count, sampling_params.stop, - sampling_params.include_stop_str_in_output) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - seq.output_text = seq.output_text[:truncate_to] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - - # Check if the sequence has reached max_model_len. - if seq.get_len() >= self._get_max_model_len(lora_req): - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() == sampling_params.max_tokens: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - @staticmethod - def check_stop_strings( - output_text: str, - new_char_count: int, - stop: List[str], - include_in_output: bool, - ) -> Optional[Tuple[str, int]]: - """Check if any stop strings are matched and truncate sequence - output text accordingly. - - Returns tuple (stop_string, offset) if matched or else None. - - Where stop_string is the matched stop string and offset is the - length to which output_text should be truncated, or -1 for no - truncation. - """ - if not new_char_count or not stop: - return None - - for stop_str in stop: - stop_string_len = len(stop_str) - # Avoid searching already-searched text. - stop_index = output_text.find(stop_str, - 1 - new_char_count - stop_string_len) - if stop_index == -1: - continue - - if include_in_output: - # Truncate to end of stop string. 
- stop_index += stop_string_len - if stop_index >= len(output_text): - # No truncation required. - return stop_str, -1 - - # Truncate the output text to either the beginning - # or end of the stop string. - return stop_str, stop_index - return None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8aa36d6a439c1..0f993a74c8103 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -9,7 +9,6 @@ from tokenizers import Tokenizer from tokenizers.decoders import DecodeStream from transformers import PreTrainedTokenizerFast -from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) @@ -129,7 +128,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 2) Evaluate stop strings. stop_string = None if self.stop and len(self.output_token_ids) > self.min_tokens: - stop = StopChecker.check_stop_strings( + stop = check_stop_strings( output_text=self.output_text, new_char_count=len(self.output_text) - stop_check_offset, stop=self.stop, @@ -309,3 +308,42 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): self.read_offset = read_offset return decoded_text + + +def check_stop_strings( + output_text: str, + new_char_count: int, + stop: list[str], + include_in_output: bool, +) -> Optional[tuple[str, int]]: + """Check if any stop strings are matched and truncate sequence + output text accordingly. + + Returns tuple (stop_string, offset) if matched or else None. + + Where stop_string is the matched stop string and offset is the + length to which output_text should be truncated, or -1 for no + truncation. + """ + if not new_char_count or not stop: + return None + + for stop_str in stop: + stop_string_len = len(stop_str) + # Avoid searching already-searched text. + stop_index = output_text.find(stop_str, + 1 - new_char_count - stop_string_len) + if stop_index == -1: + continue + + if include_in_output: + # Truncate to end of stop string. + stop_index += stop_string_len + if stop_index >= len(output_text): + # No truncation required. + return stop_str, -1 + + # Truncate the output text to either the beginning + # or end of the stop string. 
+ return stop_str, stop_index + return None From 572ddf83ce1fe8d52670e01a7a6cc0d8a99fa90c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 19:53:20 -0700 Subject: [PATCH 516/584] [Chore] Remove unused sampler in models (#25324) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/lora/conftest.py | 3 --- vllm/model_executor/models/ernie_mtp.py | 10 ---------- vllm/model_executor/models/plamo2.py | 10 ---------- vllm/model_executor/models/step3_text.py | 10 ---------- vllm/model_executor/models/step3_vl.py | 16 ---------------- 5 files changed, 49 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 3475993ff8f07..b539a7bf5d76c 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -17,7 +17,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.platforms import current_platform @@ -97,7 +96,6 @@ def dummy_model() -> nn.Module: # Special handling for lm_head & sampler ("lm_head", ParallelLMHead(512, 10)), ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) ])) model.config = MagicMock() model.embedding_modules = {"lm_head": "lm_head"} @@ -125,7 +123,6 @@ def dummy_model_gate_up() -> nn.Module: # Special handling for lm_head & sampler ("lm_head", ParallelLMHead(512, 10)), ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) ])) model.config = MagicMock() model.packed_modules_mapping = { diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index 57c5348874378..c446265230313 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -33,7 +33,6 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -160,7 +159,6 @@ class ErnieMTP(nn.Module, SupportsPP): self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size, prefix=maybe_prefix(prefix, "lm_head")) - self.sampler = get_sampler() if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight @@ -188,14 +186,6 @@ class ErnieMTP(nn.Module, SupportsPP): return self.model.compute_logits(hidden_states, self.lm_head, sampling_metadata, spec_step_idx) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index ef96d272adfb5..9f1ee36366fdd 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -41,7 +41,6 @@ from 
vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -932,7 +931,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -1030,14 +1028,6 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index b8733fa5e6129..6a5b540fc8174 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -26,7 +26,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -391,7 +390,6 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() @@ -413,14 +411,6 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: qkv_params_mapping = [ diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 2ba5f94ea3b88..c2940f8e44450 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from itertools import product from math import ceil, sqrt from typing import Any, Literal, Optional, TypedDict, Union @@ -24,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import 
SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -897,13 +895,6 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - @property def device(self): return next(self.parameters()).device @@ -1069,13 +1060,6 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): skip_prefixes = [] From 72dd1595b466383c8c9c9e39cb519814df1a105f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 19:57:46 -0700 Subject: [PATCH 517/584] [CI] Skip tests failing on main (#25326) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .../entrypoints/openai/test_completion_with_prompt_embeds.py | 1 + tests/models/quantization/test_fp8.py | 5 ++++- tests/models/test_oot_registration.py | 1 + tests/quantization/test_compressed_tensors.py | 5 ++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 176c1825530e4..9c62595ad280b 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -60,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str: return base64.b64encode(buffer.getvalue()).decode('utf-8') +@pytest.mark.skip("This test is skipped because it is flaky.") @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_prompt_embeds( diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index 97dd4d6135ac4..bb8ae741b6149 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -32,7 +32,7 @@ from ..utils import check_logprobs_close # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) @@ -57,6 +57,9 @@ def test_models( pytest.skip( f"{kv_cache_dtype} is currently not supported on ROCm/HIP.") + if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None): + pytest.skip(f"{kv_cache_dtype} is not supported on this platform.") + with monkeypatch.context() as m: m.setenv("TOKENIZERS_PARALLELISM", 'true') m.setenv(STR_BACKEND_ENV_VAR, backend) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index cb30d77c4f0ea..9b376f2a260ac 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -63,6 +63,7 @@ def test_oot_registration_embedding( image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") +@pytest.mark.skip(reason="This test is skipped because it failed on V1.") @create_new_process_for_each_test() def test_oot_registration_multimodal( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b7949a488ad05..c0ab3fbb10622 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -357,6 +357,9 @@ def test_compressed_tensors_fp8(vllm_runner): assert output +@pytest.mark.skipif( + not current_platform.is_kv_cache_dtype_supported("fp8", None), + reason="FP8 KV cache is not supported on this device.") @pytest.mark.skipif(not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform.") def test_compressed_tensors_kv_cache(vllm_runner): @@ -738,4 +741,4 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt, with vllm_runner(model, enforce_eager=True) as llm: perplexity = llm.generate_prompt_perplexity([prompt])[0] print(perplexity) - assert perplexity <= exp_perplexity \ No newline at end of file + assert perplexity <= exp_perplexity From c99db8c8ddd50c0d30c14057012bd27b42d32b2d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 19:58:26 -0700 Subject: [PATCH 518/584] [V0 Deprecation] Remove V0 core (#25321) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- .buildkite/test-pipeline.yaml | 3 - .github/CODEOWNERS | 2 - pyproject.toml | 2 - .../backends/differential_flash_attn.py | 12 +- .../backends/dual_chunk_flash_attn.py | 10 +- vllm/attention/backends/flash_attn.py | 12 +- vllm/attention/backends/mla/common.py | 13 +- vllm/attention/backends/placeholder_attn.py | 11 +- vllm/attention/backends/rocm_aiter_mla.py | 7 +- vllm/attention/backends/utils.py | 9 +- vllm/core/__init__.py | 0 vllm/core/block/__init__.py | 0 vllm/core/block/block_table.py | 399 ---- vllm/core/block/common.py | 371 --- vllm/core/block/cpu_gpu_block_allocator.py | 439 ---- vllm/core/block/interfaces.py | 319 --- vllm/core/block/naive_block.py | 466 ---- vllm/core/block/prefix_caching_block.py | 1135 --------- vllm/core/block/utils.py | 28 - vllm/core/block_manager.py | 523 ----- vllm/core/evictor.py | 157 -- vllm/core/interfaces.py | 139 -- vllm/core/placeholder_block_space_manager.py | 103 - vllm/core/scheduler.py | 2028 ---------------- vllm/engine/protocol.py | 8 +- vllm/v1/engine/async_llm.py | 6 +- vllm/worker/cache_engine.py | 145 -- vllm/worker/model_runner.py | 2031 ----------------- vllm/worker/worker.py | 666 ------ 29 files changed, 24 insertions(+), 9020 deletions(-) delete mode 100644 vllm/core/__init__.py delete mode 100644 
vllm/core/block/__init__.py delete mode 100644 vllm/core/block/block_table.py delete mode 100644 vllm/core/block/common.py delete mode 100644 vllm/core/block/cpu_gpu_block_allocator.py delete mode 100644 vllm/core/block/interfaces.py delete mode 100644 vllm/core/block/naive_block.py delete mode 100644 vllm/core/block/prefix_caching_block.py delete mode 100644 vllm/core/block/utils.py delete mode 100644 vllm/core/block_manager.py delete mode 100644 vllm/core/evictor.py delete mode 100644 vllm/core/interfaces.py delete mode 100644 vllm/core/placeholder_block_space_manager.py delete mode 100644 vllm/core/scheduler.py delete mode 100644 vllm/worker/cache_engine.py delete mode 100644 vllm/worker/model_runner.py delete mode 100644 vllm/worker/worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1e7ce6ef0a665..9d38e571324b4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -148,7 +148,6 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/distributed/ - - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events @@ -867,8 +866,6 @@ steps: - tests/distributed/ - vllm/compilation - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py - tests/v1/test_external_lb_dp.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f58256d38b9d1..37bd0ace98a97 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,10 +4,8 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 diff --git a/pyproject.toml b/pyproject.toml index fe55461db00be..f43ae69e00bdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,6 @@ line-length = 80 "vllm/_version.py" = ["ALL"] # Python 3.8 typing - skip V0 code "vllm/attention/**/*.py" = ["UP006", "UP035"] -"vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] @@ -117,7 +116,6 @@ files = [ "vllm/*.py", "vllm/assets", "vllm/entrypoints", - "vllm/core", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index a7d0e3afb517f..87a4558e377d0 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch from einops import rearrange @@ -34,9 +34,6 @@ from 
vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -329,7 +326,7 @@ class DifferentialFlashAttentionMetadata(AttentionMetadata): class DifferentialFlashAttentionMetadataBuilder( AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window = input_builder.sliding_window @@ -350,9 +347,8 @@ class DifferentialFlashAttentionMetadataBuilder( self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 85957bea1e26d..de47bb8ebd8fb 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -4,7 +4,7 @@ """ import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch import torch.distributed @@ -22,9 +22,6 @@ from vllm.utils import async_tensor_h2d from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache, sparse_attn_func) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -224,9 +221,8 @@ class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder): super().prepare() self.orig_seq_lens: List[int] = [] - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): super()._add_seq_group(inter_data, chunked_prefill_enabled, prefix_cache_hit) for prompt_len, seq_len in zip(inter_data.prompt_lens, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 78c768f92d3c2..edb3afb4aa075 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch @@ -31,9 +31,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - logger = init_logger(__name__) @@ -312,7 +309,7 @@ class FlashAttentionMetadata(AttentionMetadata): class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window 
= input_builder.sliding_window @@ -332,9 +329,8 @@ class FlashAttentionMetadataBuilder( self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 789393eb39a73..826b63e1ccda8 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -193,8 +193,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, - Type, TypeVar) +from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar import torch @@ -233,9 +232,6 @@ except ImportError: except ImportError: flash_attn_varlen_func = None -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - is_hip = current_platform.is_rocm() @@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): """ BLOCK_TABLE_EXTENDER: list[list[int]] = [] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner self.sliding_window = input_builder.sliding_window @@ -668,9 +664,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, + prefix_cache_hit: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. 
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index e630a6c6de8c4..f82d28938f45c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch @@ -13,9 +13,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState from vllm.multimodal import MultiModalPlaceholderMap - -if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder) from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that @@ -204,7 +201,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata): class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner @@ -220,9 +217,7 @@ class PlaceholderAttentionMetadataBuilder( self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. """ diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index a2e9710437d95..587d08858b92b 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Type, Union +from typing import Optional, Type, Union import torch @@ -19,9 +19,6 @@ from vllm.attention.backends.utils import (compute_slot_mapping, from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd, get_aiter_mla_metadata) -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - def is_aiter_mla_enabled() -> bool: return envs.VLLM_ROCM_USE_AITER \ @@ -110,7 +107,7 @@ class AiterMLAMetadata(MLACommonMetadata): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): BLOCK_TABLE_EXTENDER: list[list[int]] = [[]] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): super().__init__(input_builder) assert self.block_size == 1, "AITER MLA requires only block size 1." diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 7b6c426b0f851..289cfa2177438 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -35,9 +35,6 @@ PAD_SLOT_ID = -1 # if we have at least this many elements. Could be tuned further. 
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256 -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder - def is_block_tables_empty(block_tables: Union[None, Dict]): """ @@ -129,7 +126,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): _metadata_cls: Type[TAttentionMetadata] - def __init__(self, input_builder: "ModelInputForGPUBuilder"): + def __init__(self, input_builder): self.input_builder = input_builder self.runner = input_builder.runner @@ -149,9 +146,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool): + def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool): is_prompt = inter_data.is_prompt block_tables = inter_data.block_tables diff --git a/vllm/core/__init__.py b/vllm/core/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/core/block/__init__.py b/vllm/core/block/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py deleted file mode 100644 index 444bb25f2830a..0000000000000 --- a/vllm/core/block/block_table.py +++ /dev/null @@ -1,399 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from typing import List, Optional - -from vllm.core.block.common import BlockList -from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator -from vllm.utils import Device, cdiv, chunk_list - - -class BlockTable: - """A class to manage blocks for a specific sequence. - - The BlockTable maps a sequence of tokens to a list of blocks, where each - block represents a contiguous memory allocation for a portion of the - sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is - responsible for allocating and freeing memory for the blocks. - - Args: - block_size (int): The maximum number of tokens that can be stored in a - single block. - block_allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]], optional): An optional list of existing - blocks to initialize the BlockTable with. If not provided, an empty - BlockTable is created. - max_block_sliding_window (Optional[int], optional): The number of - blocks to keep around for each sequence. If None, all blocks - are kept (eg., when sliding window is not used). - It should at least fit the sliding window size of the model. - - Attributes: - _block_size (int): The maximum number of tokens that can be stored in a - single block. - _allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]]): The list of blocks managed by this - BlockTable. - _num_full_slots (int): The number of tokens currently stored in the - blocks. 
- """ - - def __init__( - self, - block_size: int, - block_allocator: DeviceAwareBlockAllocator, - _blocks: Optional[List[Block]] = None, - max_block_sliding_window: Optional[int] = None, - ): - self._block_size = block_size - self._allocator = block_allocator - if _blocks is None: - _blocks = [] - self._blocks: BlockList = BlockList(_blocks) - - self._max_block_sliding_window = max_block_sliding_window - self._num_full_slots = self._get_num_token_ids() - - @staticmethod - def get_num_required_blocks(token_ids: List[int], - block_size: int, - num_lookahead_slots: int = 0) -> int: - """Calculates the minimum number of blocks required to store a given - sequence of token IDs along with any look-ahead slots that may be - required (like in multi-step + chunked-prefill). - - This assumes worst-case scenario, where every block requires a new - allocation (e.g. ignoring prefix caching). - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - block_size (int): The maximum number of tokens that can be stored in - a single block. - num_lookahead_slots (int): look-ahead slots that the sequence may - require. - - Returns: - int: The minimum number of blocks required to store the given - sequence of token IDs along with any required look-ahead slots. - """ - return cdiv(len(token_ids) + num_lookahead_slots, block_size) - - def allocate(self, - token_ids: List[int], - device: Device = Device.GPU, - extra_hash: Optional[int] = None) -> None: - """Allocates memory blocks for storing the given sequence of token IDs. - - This method allocates the required number of blocks to store the given - sequence of token IDs. - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - device (Device, optional): The device on which the blocks should be - allocated. Defaults to Device.GPU. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefixcaching block. - """ - assert not self._is_allocated - assert token_ids - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - self.update(blocks) - self._num_full_slots = len(token_ids) - - def update(self, blocks: List[Block]) -> None: - """Resets the table to the newly provided blocks - (with their corresponding block ids) - """ - self._blocks.update(blocks) - - def append_token_ids(self, - token_ids: List[int], - num_lookahead_slots: int = 0, - num_computed_slots: Optional[int] = None, - extra_hash: Optional[int] = None) -> None: - """Appends a sequence of token IDs to the existing blocks in the - BlockTable. - - This method appends the given sequence of token IDs to the existing - blocks in the BlockTable. If there is not enough space in the existing - blocks, new blocks are allocated using the `ensure_num_empty_slots` - method to accommodate the additional tokens. - - The token IDs are divided into chunks of size `block_size` (except for - the first chunk, which may be smaller), and each chunk is appended to a - separate block. - - Args: - token_ids (List[int]): The sequence of token IDs to be appended. - num_computed_slots (Optional[int]): The number of KV cache slots - that are already filled (computed). - When sliding window is enabled, this is used to compute how many - blocks to drop at the front of the sequence. - Without sliding window, None can be passed. - Without chunked prefill, it should be the same as - _num_full_slots. 
- extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. - """ - assert self._is_allocated, "no blocks have been allocated" - assert len(self._blocks) > 0 - - # Drop blocks that are no longer needed due to sliding window - if self._max_block_sliding_window is not None: - null_block = self._allocator.allocate_or_get_null_block() - assert num_computed_slots is not None - end_block_idx = (num_computed_slots // - self._block_size) - self._max_block_sliding_window - for idx in range(0, end_block_idx): - b = self._blocks[idx] - if b is not null_block: - self._allocator.free(b) - self._blocks[idx] = null_block - - # Ensure there are enough empty slots for the new tokens plus - # lookahead slots - self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + - num_lookahead_slots, - extra_hash=extra_hash) - - # Update the blocks with the new tokens - first_block_idx = self._num_full_slots // self._block_size - token_blocks = self._chunk_token_blocks_for_append(token_ids) - - for i, token_block in enumerate(token_blocks): - self._blocks.append_token_ids(first_block_idx + i, token_block) - - self._num_full_slots += len(token_ids) - - def ensure_num_empty_slots(self, - num_empty_slots: int, - extra_hash: Optional[int] = None) -> None: - """Ensures that the BlockTable has at least the specified number of - empty slots available. - - This method checks if the BlockTable has enough empty slots (i.e., - available space) to accommodate the requested number of tokens. If not, - it allocates additional blocks on the GPU to ensure that the required - number of empty slots is available. - - Args: - num_empty_slots (int): The minimum number of empty slots required. - extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. - """ - # Currently the block table only supports - # appending tokens to GPU blocks. - device = Device.GPU - assert self._is_allocated - - if self._num_empty_slots >= num_empty_slots: - return - - slots_to_allocate = num_empty_slots - self._num_empty_slots - blocks_to_allocate = cdiv(slots_to_allocate, self._block_size) - - for _ in range(blocks_to_allocate): - assert len(self._blocks) > 0 - self._blocks.append( - self._allocator.allocate_mutable_block( - prev_block=self._blocks[-1], - device=device, - extra_hash=extra_hash)) - - def fork(self) -> "BlockTable": - """Creates a new BlockTable instance with a copy of the blocks from the - current instance. - - This method creates a new BlockTable instance with the same block size, - block allocator, and a copy of the blocks from the current instance. The - new BlockTable has its own independent set of blocks, but shares the - same underlying memory allocation with the original BlockTable. - - Returns: - BlockTable: A new BlockTable instance with a copy of the blocks from - the current instance. - """ - assert self._is_allocated - assert len(self._blocks) > 0 - forked_blocks = self._allocator.fork(self._blocks[-1]) - return BlockTable( - block_size=self._block_size, - block_allocator=self._allocator, - _blocks=forked_blocks, - max_block_sliding_window=self._max_block_sliding_window, - ) - - def free(self) -> None: - """Frees the memory occupied by the blocks in the BlockTable. - - This method iterates over all the blocks in the `_blocks` list and calls - the `free` method of the `_allocator` object to release the memory - occupied by each block. 
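The sliding-window bookkeeping in append_token_ids above reduces to one index computation; a standalone sketch with hypothetical sizes:

    block_size = 16
    max_block_sliding_window = 2    # keep at most the last 2 blocks live
    num_computed_slots = 70         # 4 full blocks plus 6 slots computed
    end_block_idx = (num_computed_slots // block_size
                     ) - max_block_sliding_window
    assert end_block_idx == 2       # blocks 0 and 1 fall out of the window
    # In the real code those blocks are freed and replaced by the shared
    # null block, so the table keeps its length but releases the memory.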
After freeing all the blocks, the `_blocks` list - is set to `None`. - """ - for block in self.blocks: - self._allocator.free(block) - self._blocks.reset() - - @property - def physical_block_ids(self) -> List[int]: - """Returns a list of physical block indices for the blocks in the - BlockTable. - - This property returns a list of integers, where each integer represents - the physical block index of a corresponding block in the `_blocks` list. - The physical block index is a unique identifier for the memory location - occupied by the block. - - Returns: - List[int]: A list of physical block indices for the blocks in the - BlockTable. - """ - return self._blocks.ids() - - def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: - """Get the number of "unseen" tokens in the sequence. - - Unseen tokens are tokens in the sequence corresponding to this block - table, but are not yet appended to this block table. - - Args: - sequence_token_ids (List[int]): The list of token ids in the - sequence. - - Returns: - List[int]: The postfix of sequence_token_ids that has not yet been - appended to the block table. - """ - - # Since the block table is append-only, the unseen token ids are the - # ones after the appended ones. - return sequence_token_ids[self.num_full_slots:] - - def _allocate_blocks_for_token_ids( - self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - blocks: List[Block] = [] - - block_token_ids = [] - tail_token_ids = [] - for cur_token_ids in chunk_list(token_ids, self._block_size): - if len(cur_token_ids) == self._block_size: - block_token_ids.append(cur_token_ids) - else: - tail_token_ids.append(cur_token_ids) - - if block_token_ids: - blocks.extend( - self._allocator.allocate_immutable_blocks( - prev_block, - block_token_ids=block_token_ids, - device=device, - extra_hash=extra_hash)) - prev_block = blocks[-1] - - if tail_token_ids: - assert len(tail_token_ids) == 1 - cur_token_ids = tail_token_ids[0] - - block = self._allocator.allocate_mutable_block( - prev_block=prev_block, device=device, extra_hash=extra_hash) - block.append_token_ids(cur_token_ids) - - blocks.append(block) - - return blocks - - def _get_all_token_ids(self) -> List[int]: - # NOTE: This function is O(seq_len); use sparingly. - token_ids: List[int] = [] - - if not self._is_allocated: - return token_ids - - for block in self.blocks: - token_ids.extend(block.token_ids) - - return token_ids - - def _get_num_token_ids(self) -> int: - res = 0 - for block in self.blocks: - res += len(block.token_ids) - - return res - - @property - def _is_allocated(self) -> bool: - return len(self._blocks) > 0 - - @property - def blocks(self) -> List[Block]: - return self._blocks.list() - - @property - def _num_empty_slots(self) -> int: - assert self._is_allocated - return len(self._blocks) * self._block_size - self._num_full_slots - - @property - def num_full_slots(self) -> int: - """Returns the total number of tokens currently stored in the - BlockTable. - - Returns: - int: The total number of tokens currently stored in the BlockTable. - """ - return self._num_full_slots - - def get_num_blocks_touched_by_append_slots( - self, token_ids: List[int], num_lookahead_slots: int) -> int: - """Determine how many blocks will be "touched" by appending the token - ids. - - This is required for the scheduler to determine whether a sequence can - continue generation, or if it must be preempted. 
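The equivalent math spelled out in the comment below is easy to sanity-check in isolation; a standalone sketch (num_new_tokens stands in for token_ids plus lookahead slots):

    import math

    def blocks_touched(num_full_slots, num_new_tokens, block_size):
        first_chunk_size = block_size - (num_full_slots % block_size)
        return 1 + math.ceil(
            (num_new_tokens - first_chunk_size) / block_size)

    # 30 slots used with block_size=16 -> 2 free slots in the open block.
    assert blocks_touched(30, 2, 16) == 1   # fits in the open block's tail
    assert blocks_touched(30, 3, 16) == 2   # one token needs a new block
    assert blocks_touched(30, 19, 16) == 3  # 2 + 16 + 1 tokens, 3 blocks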
- """ - # Math below is equivalent to: - # all_token_ids = token_ids + [-1] * num_lookahead_slots - # token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - # return len(token_blocks) - - num_token_ids = len(token_ids) + num_lookahead_slots - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - num_token_blocks = (1 + math.ceil( - (num_token_ids - first_chunk_size) / self._block_size)) - return num_token_blocks - - def _chunk_token_blocks_for_append( - self, token_ids: List[int]) -> List[List[int]]: - """Split the token ids into block-sized chunks so they can be easily - appended to blocks. The first such "token block" may have less token ids - than the block size, since the last allocated block may be partially - full. - - If no token ids are provided, then no chunks are returned. - """ - - if not token_ids: - return [] - - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - token_blocks = [token_ids[:first_chunk_size]] - token_blocks.extend( - chunk_list(token_ids[first_chunk_size:], self._block_size)) - return token_blocks diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py deleted file mode 100644 index a337007a9eaa6..0000000000000 --- a/vllm/core/block/common.py +++ /dev/null @@ -1,371 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple - -from vllm.core.block.interfaces import Block, BlockAllocator - -BlockId = int -RefCount = int - - -class RefCounterProtocol(Protocol): - - def incr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def decr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def get(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - -class RefCounter(RefCounterProtocol): - """A class for managing reference counts for a set of block indices. - - The RefCounter class maintains a dictionary that maps block indices to their - corresponding reference counts. It provides methods to increment, decrement, - and retrieve the reference count for a given block index. - - Args: - all_block_indices (Iterable[BlockId]): An iterable of block indices - to initialize the reference counter with. - """ - - def __init__(self, all_block_indices: Iterable[BlockId]): - deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, RefCount] = { - index: 0 - for index in deduped - } - - def incr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - pre_incr_refcount = self._refcounts[block_id] - - assert pre_incr_refcount >= 0 - - post_incr_refcount = pre_incr_refcount + 1 - self._refcounts[block_id] = post_incr_refcount - return post_incr_refcount - - def decr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - refcount = self._refcounts[block_id] - - assert refcount > 0 - refcount -= 1 - - self._refcounts[block_id] = refcount - - return refcount - - def get(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - return self._refcounts[block_id] - - def as_readonly(self) -> "ReadOnlyRefCounter": - return ReadOnlyRefCounter(self) - - -class ReadOnlyRefCounter(RefCounterProtocol): - """A read-only view of the RefCounter class. - - The ReadOnlyRefCounter class provides a read-only interface to access the - reference counts maintained by a RefCounter instance. 
It does not allow - modifications to the reference counts. - - Args: - refcounter (RefCounter): The RefCounter instance to create a read-only - view for. - """ - - def __init__(self, refcounter: RefCounter): - self._refcounter = refcounter - - def incr(self, block_id: BlockId) -> RefCount: - raise ValueError("Incr not allowed") - - def decr(self, block_id: BlockId) -> RefCount: - raise ValueError("Decr not allowed") - - def get(self, block_id: BlockId) -> RefCount: - return self._refcounter.get(block_id) - - -class CopyOnWriteTracker: - """A class for tracking and managing copy-on-write operations for blocks. - - The CopyOnWriteTracker class maintains a mapping of source block indices to - their corresponding copy-on-write destination block indices. It works in - conjunction with a RefCounter. - - Args: - refcounter (RefCounter): The reference counter used to track block - reference counts. - """ - - def __init__(self, refcounter: RefCounterProtocol): - self._copy_on_writes: List[Tuple[BlockId, BlockId]] = [] - self._refcounter = refcounter - - def is_appendable(self, block: Block) -> bool: - """Checks if the block is shared or not. If shared, then it cannot - be appended and needs to be duplicated via copy-on-write - """ - block_id = block.block_id - if block_id is None: - return True - - refcount = self._refcounter.get(block_id) - return refcount <= 1 - - def record_cow(self, src_block_id: Optional[BlockId], - trg_block_id: Optional[BlockId]) -> None: - """Records a copy-on-write operation from source to target block id - Args: - src_block_id (BlockId): The source block id from which to copy - the data - trg_block_id (BlockId): The target block id to which the data - is copied - """ - assert src_block_id is not None - assert trg_block_id is not None - self._copy_on_writes.append((src_block_id, trg_block_id)) - - def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: - """Clears the copy-on-write tracking information and returns the current - state. - - This method returns a list mapping source block indices to - destination block indices for the current copy-on-write operations. - It then clears the internal tracking information. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices for the - current copy-on-write operations. - """ - cows = self._copy_on_writes - self._copy_on_writes = [] - return cows - - -class BlockPool: - """Used to pre-allocate block objects, in order to avoid excessive python - object allocations/deallocations. - The pool starts from "pool_size" objects and will increase to more objects - if necessary - - Note that multiple block objects may point to the same physical block id, - which is why this pool is needed, so that it will be easier to support - prefix caching and more complicated sharing of physical blocks. 
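CopyOnWriteTracker's appendability rule above is just a refcount test; a minimal sketch with hypothetical block ids:

    refcounts = {0: 1, 1: 2}     # block 0 exclusive, block 1 shared

    def is_appendable(block_id):
        return refcounts[block_id] <= 1

    assert is_appendable(0)      # sole owner: append in place
    assert not is_appendable(1)  # shared: record a CoW, append to the copy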
- """ - - def __init__(self, block_size: int, create_block: Block.Factory, - allocator: BlockAllocator, pool_size: int): - self._block_size = block_size - self._create_block = create_block - self._allocator = allocator - self._pool_size = pool_size - assert self._pool_size >= 0 - - self._free_ids: Deque[int] = deque(range(self._pool_size)) - self._pool = [] - for i in range(self._pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def increase_pool(self): - """Doubles the internal pool size - """ - cur_pool_size = self._pool_size - new_pool_size = cur_pool_size * 2 - self._pool_size = new_pool_size - - self._free_ids += deque(range(cur_pool_size, new_pool_size)) - - for i in range(cur_pool_size, new_pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def init_block(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - physical_block_id: Optional[int], - extra_hash: Optional[int] = None) -> Block: - if len(self._free_ids) == 0: - self.increase_pool() - assert len(self._free_ids) > 0 - - pool_id = self._free_ids.popleft() - - block = self._pool[pool_id] - block.__init__( # type: ignore[misc] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - allocator=block._allocator, # type: ignore[attr-defined] - block_id=physical_block_id, - extra_hash=extra_hash) - block.pool_id = pool_id # type: ignore[attr-defined] - return block - - def free_block(self, block: Block) -> None: - self._free_ids.appendleft(block.pool_id) # type: ignore[attr-defined] - - -class BlockList: - """This class is an optimization to allow fast-access to physical - block ids. 
It maintains a block id list that is updated with the - block list and this avoids the need to reconstruct the block id - list on every iteration of the block manager - """ - - def __init__(self, blocks: List[Block]): - self._blocks: List[Block] = [] - self._block_ids: List[int] = [] - - self.update(blocks) - - def _add_block_id(self, block_id: Optional[BlockId]) -> None: - assert block_id is not None - self._block_ids.append(block_id) - - def _update_block_id(self, block_index: int, - new_block_id: Optional[BlockId]) -> None: - assert new_block_id is not None - self._block_ids[block_index] = new_block_id - - def update(self, blocks: List[Block]): - self._blocks = blocks - - # Cache block ids for fast query - self._block_ids = [] - for block in self._blocks: - self._add_block_id(block.block_id) - - def append_token_ids(self, block_index: int, token_ids: List[int]) -> None: - block = self._blocks[block_index] - prev_block_id = block.block_id - - block.append_token_ids(token_ids) - - # CoW or promotion may update the internal block_id - if prev_block_id != block.block_id: - self._update_block_id(block_index, block.block_id) - - def append(self, new_block: Block): - self._blocks.append(new_block) - self._add_block_id(new_block.block_id) - - def __len__(self) -> int: - return len(self._blocks) - - def __getitem__(self, block_index: int) -> Block: - return self._blocks[block_index] - - def __setitem__(self, block_index: int, new_block: Block) -> None: - self._blocks[block_index] = new_block - self._update_block_id(block_index, new_block.block_id) - - def reset(self): - self._blocks = [] - self._block_ids = [] - - def list(self) -> List[Block]: - return self._blocks - - def ids(self) -> List[int]: - return self._block_ids - - -@dataclass -class CacheMetricData: - """A utility dataclass to maintain cache metric. - To avoid overflow, we maintain the hit rate in block granularity, so that - we can maintain a single hit rate for n_completed_block x block_size, - and calculate the real time hit rate by the following: - BS = The number of queries per block. - nB = The number of completed blocks. - HR = hit rate of (nB x BS) queries. - Q = current number of queries (< BS). - H = current number of hits (< BS). - hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS) - """ - num_completed_blocks: int = 0 - completed_block_cache_hit_rate: float = 0.0 - num_incompleted_block_queries: int = 0 - num_incompleted_block_hit: int = 0 - block_size: int = 1000 - - def query(self, hit: bool): - self.num_incompleted_block_queries += 1 - self.num_incompleted_block_hit += 1 if hit else 0 - - # When a block is completed, update the cache hit rate - # and reset the incomplete numbers. 
- if self.num_incompleted_block_queries == self.block_size: - hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - self.completed_block_cache_hit_rate = ( - self.completed_block_cache_hit_rate * self.num_completed_blocks - + hit_rate) / (self.num_completed_blocks + 1) - self.num_incompleted_block_queries = 0 - self.num_incompleted_block_hit = 0 - self.num_completed_blocks += 1 - - def get_hit_rate(self): - incomplete_ratio = self.num_incompleted_block_queries / self.block_size - total_blocks = self.num_completed_blocks + incomplete_ratio - if total_blocks == 0: - return 0.0 - - completed_block_hit, incompleted_block_hit = 0.0, 0.0 - if self.num_completed_blocks > 0: - completed_block_hit = (self.completed_block_cache_hit_rate * - self.num_completed_blocks) - if self.num_incompleted_block_queries > 0: - incompleted_hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio) - return (completed_block_hit + incompleted_block_hit) / total_blocks - - -def get_all_blocks_recursively(last_block: Block) -> List[Block]: - """Retrieves all the blocks in a sequence starting from the last block. - - This function recursively traverses the sequence of blocks in reverse order, - starting from the given last block, and returns a list of all the blocks in - the sequence. - - Args: - last_block (Block): The last block in the sequence. - - Returns: - List[Block]: A list of all the blocks in the sequence, in the order they - appear. - """ - - def recurse(block: Block, lst: List[Block]) -> None: - if block.prev_block is not None: - recurse(block.prev_block, lst) - lst.append(block) - - all_blocks: List[Block] = [] - recurse(last_block, all_blocks) - return all_blocks diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py deleted file mode 100644 index 92bc5e157e148..0000000000000 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ /dev/null @@ -1,439 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Dict, FrozenSet, List, Optional, Tuple - -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator -from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device - - -class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): - """A block allocator that can allocate blocks on both CPU and GPU memory. - - This class implements the `DeviceAwareBlockAllocator` interface and provides - functionality for allocating and managing blocks of memory on both CPU and - GPU devices. - - The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU - blocks, and allows for allocation, deallocation, forking, and swapping of - blocks across these memory pools. - """ - - @staticmethod - def create( - allocator_type: str, - num_gpu_blocks: int, - num_cpu_blocks: int, - block_size: int, - ) -> DeviceAwareBlockAllocator: - """Creates a CpuGpuBlockAllocator instance with the specified - configuration. - - This static method creates and returns a CpuGpuBlockAllocator instance - based on the provided parameters. It initializes the CPU and GPU block - allocators with the specified number of blocks, block size, and - allocator type. 
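The blended hit-rate formula in the CacheMetricData docstring above is easiest to trust after working it through once; a standalone check with small hypothetical numbers (the real default bucket size is 1000):

    BS = 4                   # queries per completed bucket
    nB, HR = 1, 0.5          # one completed bucket at a 50% hit rate
    Q, H = 2, 2              # partial bucket: 2 queries so far, both hits
    hit_rate = ((HR * nB) + (H / Q) * (Q / BS)) / (nB + Q / BS)
    assert hit_rate == 4 / 6  # 2 + 2 hits over the 4 + 2 queries seen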
- - Args: - allocator_type (str): The type of block allocator to use for CPU - and GPU blocks. Currently supported values are "naive" and - "prefix_caching". - num_gpu_blocks (int): The number of blocks to allocate for GPU - memory. - num_cpu_blocks (int): The number of blocks to allocate for CPU - memory. - block_size (int): The size of each block in number of tokens. - - Returns: - DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the - specified configuration. - - Notes: - - The block IDs are assigned contiguously, with GPU block IDs coming - before CPU block IDs. - """ - reserved_blocks = 0 - block_ids = list( - range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) - num_gpu_blocks -= reserved_blocks - gpu_block_ids = block_ids[:num_gpu_blocks] - cpu_block_ids = block_ids[num_gpu_blocks:] - - if allocator_type == "naive": - gpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - elif allocator_type == "prefix_caching": - gpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - else: - raise ValueError(f"Unknown allocator type {allocator_type=}") - - return CpuGpuBlockAllocator( - cpu_block_allocator=cpu_allocator, - gpu_block_allocator=gpu_allocator, - ) - - def __init__(self, cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator): - assert not ( - cpu_block_allocator.all_block_ids - & gpu_block_allocator.all_block_ids - ), "cpu and gpu block allocators can't have intersection of block ids" - - self._allocators = { - Device.CPU: cpu_block_allocator, - Device.GPU: gpu_block_allocator, - } - - self._swap_mapping: Dict[int, int] = {} - self._null_block: Optional[Block] = None - - self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} - for _, allocator in self._allocators.items(): - for block_id in allocator.all_block_ids: - self._block_ids_to_allocator[block_id] = allocator - - def allocate_or_get_null_block(self) -> Block: - if self._null_block is None: - self._null_block = NullBlock( - self.allocate_mutable_block(None, Device.GPU)) - return self._null_block - - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new mutable block on the specified device. - - Args: - prev_block (Optional[Block]): The previous block to in the sequence. - Used for prefix hashing. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated mutable block. - """ - return self._allocators[device].allocate_mutable_block( - prev_block, extra_hash=extra_hash) - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - """Allocates a new group of immutable blocks with the provided block - token IDs on the specified device. 
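The contiguous id split in create() above means any absolute block id belongs to exactly one allocator; a standalone sketch with tiny pool sizes:

    num_gpu_blocks, num_cpu_blocks = 3, 2
    block_ids = list(range(num_gpu_blocks + num_cpu_blocks))
    gpu_block_ids = block_ids[:num_gpu_blocks]   # [0, 1, 2]
    cpu_block_ids = block_ids[num_gpu_blocks:]   # [3, 4]
    assert set(gpu_block_ids).isdisjoint(cpu_block_ids)
    # This disjointness is what lets _block_ids_to_allocator map every
    # id back to the allocator that owns it.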
- - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - block_token_ids (List[int]): The list of block token IDs to be - stored in the new blocks. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - List[Block]: The newly allocated list of immutable blocks - containing the provided block token IDs. - """ - return self._allocators[device].allocate_immutable_blocks( - prev_block, block_token_ids, extra_hash=extra_hash) - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new immutable block with the provided token IDs on the - specified device. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - token_ids (List[int]): The list of token IDs to be stored in the new - block. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated immutable block containing the provided - token IDs. - """ - return self._allocators[device].allocate_immutable_block( - prev_block, token_ids, extra_hash=extra_hash) - - def free(self, block: Block) -> None: - """Frees the memory occupied by the given block. - - Args: - block (Block): The block to be freed. - """ - # Null block should never be freed - if isinstance(block, NullBlock): - return - block_id = block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - allocator.free(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: A new list of blocks that shares the same memory as the - original sequence. - """ - # do not attempt to fork the null block - assert not isinstance(last_block, NullBlock) - block_id = last_block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - return allocator.fork(last_block) - - def get_num_free_blocks(self, device: Device) -> int: - """Returns the number of free blocks available on the specified device. - - Args: - device (Device): The device for which to query the number of free - blocks. AssertionError is raised if None is passed. - - Returns: - int: The number of free blocks available on the specified device. - """ - return self._allocators[device].get_num_free_blocks() - - def get_num_total_blocks(self, device: Device) -> int: - return self._allocators[device].get_num_total_blocks() - - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - """Returns the zero-offset block id on certain device given the - absolute block id. - - Args: - device (Device): The device for which to query relative block id. - absolute_id (int): The absolute block id for the block in - whole allocator. - - Returns: - int: The zero-offset block id on certain device. 
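# --- Editorial aside (illustrative sketch, not part of the original patch) ---
# The absolute -> zero-offset id mapping described above, in miniature.
# Assumes 4 GPU blocks and 4 CPU blocks with contiguously assigned ids
# (GPU ids first); the variable names here are illustrative only.
gpu_block_ids = [0, 1, 2, 3]  # absolute ids living on the GPU
cpu_block_ids = [4, 5, 6, 7]  # absolute ids living on the CPU
# The zero-offset ("physical") id is the index within the sorted
# per-device id list, e.g. absolute id 5 is physical id 1 on the CPU:
assert sorted(cpu_block_ids).index(5) == 1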
- """ - return self._allocators[device].get_physical_block_id(absolute_id) - - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - """Execute the swap for the given blocks from source_device - on to dest_device, save the current swap mapping and append - them to the accumulated `self._swap_mapping` for each - scheduling move. - - Args: - blocks: List of blocks to be swapped. - src_device (Device): Device to swap the 'blocks' from. - dst_device (Device): Device to swap the 'blocks' to. - - Returns: - Dict[int, int]: Swap mapping from source_device - on to dest_device. - """ - src_block_ids = [block.block_id for block in blocks] - self._allocators[src_device].swap_out(blocks) - self._allocators[dst_device].swap_in(blocks) - dst_block_ids = [block.block_id for block in blocks] - - current_swap_mapping: Dict[int, int] = {} - for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids): - if src_block_id is not None and dst_block_id is not None: - self._swap_mapping[src_block_id] = dst_block_id - current_swap_mapping[src_block_id] = dst_block_id - return current_swap_mapping - - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - - Args: - blocks: List of blocks to be swapped. - device (Device): Device to swap the 'blocks' on. - - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - Non full blocks are ignored when deciding the number - of blocks to touch. - """ - return self._allocators[device].get_num_full_blocks_touched(blocks) - - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - """Clears the copy-on-write (CoW) state and returns the mapping of - source to destination block IDs. - - Returns: - List[Tuple[int, int]]: A list mapping source block IDs to - destination block IDs. - """ - # CoW only supported on GPU - device = Device.GPU - return self._allocators[device].clear_copy_on_writes() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_accessed(block_ids, now) - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_computed(block_ids) - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].get_common_computed_block_ids( - computed_seq_block_ids) - - @property - def all_block_ids(self) -> FrozenSet[int]: - return frozenset(self._block_ids_to_allocator.keys()) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - assert device in self._allocators - return self._allocators[device].get_prefix_cache_hit_rate() - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - if device: - return self._allocators[device].reset_prefix_cache() - success = True - for allocator in self._allocators.values(): - success = success and allocator.reset_prefix_cache() - return success - - def get_and_reset_swaps(self) -> List[Tuple[int, int]]: - """Returns and clears the mapping of source to destination block IDs. - Will be called after every swapping operations for now, and after every - schedule when BlockManagerV2 become default. Currently not useful. - - Returns: - List[Tuple[int, int]]: A mapping of source to destination block IDs. - """ - mapping = self._swap_mapping.copy() - self._swap_mapping.clear() - return list(mapping.items()) - - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - return self._allocators[device].find_cached_blocks_prefix(block_hashes) - - -class NullBlock(Block): - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - This implementation just wraps an ordinary block and prevents it from - being modified. It also allows for testing if a block is NullBlock - via isinstance(). - """ - - def __init__(self, proxy: Block): - super().__init__() - self._proxy = proxy - - def append_token_ids(self, token_ids: List[BlockId]): - raise ValueError("null block should not be modified") - - @property - def block_id(self): - return self._proxy.block_id - - @block_id.setter - def block_id(self, value: Optional[BlockId]): - raise ValueError("null block should not be modified") - - @property - def token_ids(self) -> List[BlockId]: - return self._proxy.token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for null block") - - @property - def num_empty_slots(self) -> BlockId: - return self._proxy.num_empty_slots - - @property - def is_full(self): - return self._proxy.is_full - - @property - def prev_block(self): - return self._proxy.prev_block - - @property - def extra_hash(self): - return None - - @property - def computed(self): - return self._proxy.computed - - @computed.setter - def computed(self, value): - self._proxy.computed = value - - @property - def last_accessed(self) -> float: - return self._proxy.last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._proxy.last_accessed = last_accessed_ts - - @property - def content_hash(self): - return self._proxy.content_hash diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py deleted file mode 100644 index 1a05881f7c005..0000000000000 --- a/vllm/core/block/interfaces.py +++ /dev/null @@ -1,319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple - -from vllm.utils import Device - -BlockId = int - - -class Block(ABC): - - @abstractmethod - def append_token_ids(self, token_ids: List[int]) -> None: - pass - - @property - @abstractmethod - def block_id(self) -> Optional[int]: - pass - - @block_id.setter - @abstractmethod - def block_id(self, value: Optional[int]) -> None: - """NOTE: Do not use this API outside Block.""" - 
self._block_id = value - - @property - @abstractmethod - def token_ids(self) -> List[int]: - pass - - @property - @abstractmethod - def num_tokens_total(self) -> int: - """The number of tokens till the current block (inclusive) - """ - pass - - @property - @abstractmethod - def num_empty_slots(self) -> int: - pass - - @property - @abstractmethod - def is_full(self) -> bool: - pass - - @property - @abstractmethod - def prev_block(self) -> Optional["Block"]: - pass - - @property - @abstractmethod - def extra_hash(self) -> Optional[int]: - return None - - @property - @abstractmethod - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - @abstractmethod - def computed(self, value) -> bool: - """Should be only used by PrefixCacingAllocator""" - raise NotImplementedError - - @property - @abstractmethod - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - @abstractmethod - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - class Factory(Protocol): - - @abstractmethod - def __call__( - self, - prev_block: Optional["Block"], - token_ids: List[int], - block_size: int, - allocator: "BlockAllocator", - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ) -> "Block": - pass - - @property - @abstractmethod - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined or not supported. - - For the content-based hash to be defined, the current block must be - full. - """ - return None - - -class BlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, prev_block: Optional[Block], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int]) -> List[Block]: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @abstractmethod - def get_num_total_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_blocks(self) -> int: - pass - - @abstractmethod - def get_physical_block_id(self, absolute_id: int) -> int: - pass - - @abstractmethod - def swap_out(self, blocks: List[Block]) -> None: - pass - - @abstractmethod - def swap_in(self, blocks: List[Block]) -> None: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def promote_to_immutable_block(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - pass - - @abstractmethod - def 
get_prefix_cache_hit_rate(self) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self) -> bool: - """Reset prefix cache.""" - pass - - class NoFreeBlocksError(ValueError): - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - ) -> List[int]: - pass - - -class DeviceAwareBlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None, - ) -> List[Block]: - pass - - @abstractmethod - def get_num_free_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def get_num_total_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - pass - - @abstractmethod - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - pass - - @abstractmethod - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - pass - - @abstractmethod - def allocate_or_get_null_block(self) -> Block: - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - There is at most one null block per allocator. - """ - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache.""" - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py deleted file mode 100644 index ae876d131eb66..0000000000000 --- a/vllm/core/block/naive_block.py +++ /dev/null @@ -1,466 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union - -from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, - get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device - -Refcount = int - - -class NaiveBlockAllocator(BlockAllocator): - """A simple block allocator that manages blocks of memory without prefix - caching. - - Args: - create_block (Block.Factory): A factory function for creating new - blocks. This is used when a NaiveBlockAllocator is composed within - a prefix caching allocator -- the naive block allocator must - construct prefix caching blocks (but shouldn't know anything else - about them). - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. - block_ids (Optional[Iterable[int]], optional): An optional iterable of - block IDs. If not provided, block IDs will be assigned sequentially - from 0 to num_blocks - 1. - """ - - def __init__( - self, - create_block: Block.Factory, - num_blocks: int, - block_size: int, - block_ids: Optional[Iterable[int]] = None, - block_pool: Optional[BlockPool] = None, - ): - if block_ids is None: - block_ids = range(num_blocks) - - self._free_block_indices: Deque[BlockId] = deque(block_ids) - self._all_block_indices = frozenset(block_ids) - assert len(self._all_block_indices) == num_blocks - - self._refcounter = RefCounter( - all_block_indices=self._free_block_indices) - self._block_size = block_size - - self._cow_tracker = CopyOnWriteTracker( - refcounter=self._refcounter.as_readonly()) - - if block_pool is None: - extra_factor = 4 - # Pre-allocate "num_blocks * extra_factor" block objects. - # The "* extra_factor" is a buffer to allow more block objects - # than physical blocks - self._block_pool = BlockPool(self._block_size, create_block, self, - num_blocks * extra_factor) - else: - # In this case, the block pool is provided by the caller, - # which means that there is most likely a need to share - # a block pool between allocators - self._block_pool = block_pool - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new immutable block with the given token IDs, linked to - the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - token_ids (List[int]): The token IDs to be stored in the new block. - - Returns: - Block: The newly allocated immutable block. 
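# --- Editorial aside (illustrative sketch, not part of the original patch) ---
# The allocate-then-append pattern implemented below, reduced to a helper.
# An "immutable" block is a mutable block filled to capacity in one shot;
# `allocator` stands in for any object exposing this file's allocator API.
def make_immutable(allocator, prev_block, token_ids):
    block = allocator.allocate_mutable_block(prev_block=prev_block)
    block.append_token_ids(token_ids)  # fill the block; may trigger CoW
    return block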
- """ - assert device is None - block = self.allocate_mutable_block(prev_block=prev_block) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - assert device is None - num_blocks = len(block_token_ids) - - block_ids = [] - for i in range(num_blocks): - block_ids.append(self._allocate_block_id()) - - blocks = [] - for i in range(num_blocks): - prev_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block_token_ids[i], - block_size=self._block_size, - physical_block_id=block_ids[i]) - blocks.append(prev_block) - - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new mutable block, linked to the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - assert device is None - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id) - return block - - def _allocate_block_id(self) -> BlockId: - if not self._free_block_indices: - raise BlockAllocator.NoFreeBlocksError() - - block_id = self._free_block_indices.popleft() - self._refcounter.incr(block_id) - return block_id - - def _free_block_id(self, block: Union[Block, BlockId]) -> None: - if isinstance(block, Block): - block_id = block.block_id - block.block_id = None - else: - block_id = block - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount == 0: - self._free_block_indices.appendleft(block_id) - - def free(self, block: Block, keep_block_object: bool = False) -> None: - # Release the physical block id - self._free_block_id(block) - - # Release the block object - if not keep_block_object: - self._block_pool.free_block(block) - - def free_block_id(self, block_id: BlockId) -> None: - self._free_block_id(block_id) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: The new sequence of blocks that shares the same memory - as the original sequence. - """ - source_blocks = get_all_blocks_recursively(last_block) - - forked_blocks: List[Block] = [] - prev_block = None - for block in source_blocks: - - # Increment refcount for each block. - assert block.block_id is not None - refcount = self._refcounter.incr(block.block_id) - assert refcount != 1, "can't fork freed block" - - forked_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block.token_ids, - block_size=self._block_size, - physical_block_id=block.block_id) - - forked_blocks.append(forked_block) - prev_block = forked_blocks[-1] - - return forked_blocks - - def get_num_free_blocks(self) -> int: - return len(self._free_block_indices) - - def get_num_total_blocks(self) -> int: - return len(self._all_block_indices) - - def get_physical_block_id(self, absolute_id: int) -> int: - """Returns the zero-offset block id on certain block allocator - given the absolute block id. 
- - Args: - absolute_id (int): The absolute block id for the block - in whole allocator. - - Returns: - int: The zero-offset block id on certain device. - """ - return sorted(self._all_block_indices).index(absolute_id) - - @property - def refcounter(self): - return self._refcounter - - @property - def all_block_ids(self) -> FrozenSet[int]: - return self._all_block_indices - - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """Performs a copy-on-write operation on the given block if it is not - appendable. - - Args: - block (Block): The block to check for copy-on-write. - - Returns: - BlockId: The block index of the new block if a copy-on-write - operation was performed, or the original block index if - no copy-on-write was necessary. - """ - src_block_id = block.block_id - assert src_block_id is not None - - if self._cow_tracker.is_appendable(block): - return src_block_id - - self._free_block_id(block) - trg_block_id = self._allocate_block_id() - - self._cow_tracker.record_cow(src_block_id, trg_block_id) - - return trg_block_id - - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: - """Returns the copy-on-write source->destination mapping and clears it. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices. - """ - return self._cow_tracker.clear_cows() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as computed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - """Determine blocks that can be skipped in prefill. - - Since the naive allocator does not support prefix caching, always return - an empty list. - """ - return [] - - def promote_to_immutable_block(self, block: Block) -> BlockId: - raise NotImplementedError("There is no promotion for naive blocks") - - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out. - - Args: - blocks: List of blocks to be swapped. - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks. Non full blocks are ignored - when deciding the number of blocks to touch. - """ - # NOTE: for naive block, we use set to eliminate common blocks among - # seqs, also we compare the empty slots in the mutable blocks with - # lookahead slots to get the number of unique new block that are - # needed. - old_block_set = set() - for block in blocks: - if block.is_full: - old_block_set.add(block) - return len(old_block_set) - - def swap_out(self, blocks: List[Block]) -> None: - for block in blocks: - self._free_block_id(block) - - def swap_in(self, blocks: List[Block]) -> None: - for block in blocks: - # Here we allocate either immutable or mutable block and then - # extract its block_id. 
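# --- Editorial aside (illustrative sketch, not part of the original patch) ---
# The id-stealing idiom this comment describes, in isolation: allocate a
# temporary block, take its physical id, return the temporary object to
# the pool, and rebind the id to the caller's existing block object.
# `allocator` and `pool` are stand-ins for the internals in this file.
def rebind_block_id(allocator, pool, block):
    tmp = allocator.allocate_mutable_block(prev_block=block.prev_block)
    tmp.append_token_ids(block.token_ids)
    new_id = tmp.block_id
    tmp.block_id = None      # detach the id before recycling the object
    pool.free_block(tmp)     # only the object is recycled, not the id
    block.block_id = new_id  # the caller's block now owns the id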
Note that the block object is released - # and the block_id is assigned to "block" to allow reusing the - # existing "block" object - if block.is_full: - tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, token_ids=block.token_ids) - else: - tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block) - tmp_block.append_token_ids(block.token_ids) - - block_id = tmp_block.block_id - tmp_block.block_id = None - self._block_pool.free_block(tmp_block) - - block.block_id = block_id # Assign block_id - - def get_prefix_cache_hit_rate(self) -> float: - return -1 - - def reset_prefix_cache(self) -> bool: - """No prefix cache for naive block allocator.""" - return True - - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: - # Not applicable for naive block allocator. - return [] - - -class NaiveBlock(Block): - """An implementation of the Block class that does not support prefix - caching. - - The NaiveBlock class represents a block of token IDs with a fixed size. It - provides methods for appending token IDs to the block and manages copy-on - -write operations when necessary. - - Args: - prev_block (Block): The previous block in the sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The block allocator associated with this - block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None, which means no allocation has been - made. - _cow_target (Optional[Block], optional): The copy-on-write target block. - If not provided, it defaults to self. - """ - - def __init__(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - _cow_target: Optional[Block] = None, - extra_hash: Optional[int] = None): - self._token_ids: List[int] = [] - self._block_size = block_size - self._prev_block = prev_block - self._block_id = block_id - self._allocator = allocator - self._cow_target = _cow_target if _cow_target is not None else self - - self._append_token_ids_no_cow(token_ids) - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and performs a - copy-on-write if necessary. - - Args: - token_ids (Optional[List[int]]): The token IDs to be appended - to the block. - """ - self._append_token_ids_no_cow(token_ids) - - if self._block_id is not None: - self._block_id = (self._allocator.cow_block_if_not_appendable( - self._cow_target)) - - def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block - - Args: - token_ids (List[int]): The token IDs to be appended to the block. 
- """ - if len(token_ids) == 0: - return - - assert len(token_ids) <= self.num_empty_slots - - self._token_ids.extend(token_ids) - - @property - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - def computed(self, value) -> None: - raise NotImplementedError - - @property - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - @property - def block_id(self) -> Optional[int]: - return self._block_id - - @block_id.setter - def block_id(self, value: Optional[int]) -> None: - self._block_id = value - - @property - def is_full(self) -> bool: - return self.num_empty_slots == 0 - - @property - def num_empty_slots(self) -> int: - return self._block_size - len(self.token_ids) - - @property - def token_ids(self) -> List[int]: - return self._token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for naive block") - - @property - def block_size(self) -> int: - return self._block_size - - @property - def prev_block(self) -> Optional["Block"]: - return self._prev_block - - @property - def extra_hash(self): - return None - - @property - def content_hash(self) -> Optional[int]: - return None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py deleted file mode 100644 index a21d69323abbc..0000000000000 --- a/vllm/core/block/prefix_caching_block.py +++ /dev/null @@ -1,1135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Token blocks.""" -import sys -from bisect import bisect_left -from os.path import commonprefix -from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, - Tuple) - -from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, - get_all_blocks_recursively) -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import (BlockPool, NaiveBlock, - NaiveBlockAllocator) -from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor -from vllm.logger import init_logger -from vllm.sequence import Sequence - -PrefixHash = int - -# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME -# so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME, -# then we know this block hasn't been accessed yet. -_DEFAULT_LAST_ACCESSED_TIME = -1 - -logger = init_logger(__name__) - - -class BlockTracker: - """Used to track the status of a block inside the prefix caching allocator - """ - __slots__ = ("active", "last_accessed", "computed") - - def reset(self): - self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self.computed: bool = False - - def __init__(self): - self.active: bool = False - self.reset() - - def enable(self): - assert not self.active - self.active = True - self.reset() - - def disable(self): - assert self.active - self.active = False - self.reset() - - -class PrefixCachingBlockAllocator(BlockAllocator): - """A block allocator that implements prefix caching. - - The PrefixCachingBlockAllocator maintains a cache of blocks based on their - content hash. It reuses blocks with the same content hash to avoid redundant - memory allocation. The allocator also supports copy-on-write operations. - - Args: - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. 
-        block_ids (Optional[Iterable[int]], optional): An optional iterable of
-            block IDs. If not provided, block IDs will be assigned sequentially
-            from 0 to num_blocks - 1.
-    """
-
-    # Note that we use 'None' as a string here instead of None because
-    # as of Python 3.12, hash(None) returns a constant predictable value.
-    # This could possibly make it easier to find and exploit hash
-    # collisions. 'None' as a string will be hashed differently per process,
-    # but consistently within the same process. This is the same as the
-    # behavior of None prior to Python 3.12.
-    _none_hash: int = hash('None')
-
-    # Implements Block.Factory.
-    def __init__(
-        self,
-        num_blocks: int,
-        block_size: int,
-        block_ids: Optional[Iterable[int]] = None,
-        eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
-    ):
-        if block_ids is None:
-            block_ids = range(num_blocks)
-
-        self._block_size = block_size
-
-        # A mapping of prefix hash to block index. All blocks which have a
-        # prefix hash will be in this dict, even if they have refcount 0.
-        self._cached_blocks: Dict[PrefixHash, BlockId] = {}
-
-        # A list of immutable block IDs that have been touched by the scheduler
-        # and should be marked as computed after an entire batch of sequences
-        # are scheduled.
-        self._touched_blocks: Set[BlockId] = set()
-
-        # Used to track the status of each physical block id.
-        self._block_tracker: Dict[BlockId, BlockTracker] = {}
-        for block_id in block_ids:
-            self._block_tracker[block_id] = BlockTracker()
-
-        # Pre-allocate "num_blocks * extra_factor" block objects.
-        # The "* extra_factor" is a buffer to allow more block objects
-        # than physical blocks.
-        extra_factor = 4
-        self._block_pool = BlockPool(self._block_size, self._create_block,
-                                     self, num_blocks * extra_factor)
-
-        # An allocator for blocks that do not have prefix hashes.
-        self._hashless_allocator = NaiveBlockAllocator(
-            create_block=self._create_block,  # type: ignore
-            num_blocks=num_blocks,
-            block_size=block_size,
-            block_ids=block_ids,
-            block_pool=self._block_pool,  # Share block pool here
-        )
-
-        # Evictor used to decide how to handle computed blocks when
-        # memory pressure is high.
-        self.eviction_policy = eviction_policy
-        self.evictor: Evictor = make_evictor(self.eviction_policy)
-
-        # We share the refcounter between allocators. This allows us to promote
-        # blocks originally allocated in the hashless allocator to immutable
-        # blocks.
-        self._refcounter = self._hashless_allocator.refcounter
-
-        self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly())
-
-        self.metric_data = CacheMetricData()
-
-    def _create_block(
-        self,
-        prev_block: Optional[Block],
-        token_ids: List[int],
-        block_size: int,
-        allocator: BlockAllocator,
-        block_id: Optional[int] = None,
-        computed: bool = False,
-        extra_hash: Optional[int] = None,
-    ) -> Block:
-        # Bind block to self.
-        allocator = self
-
-        return PrefixCachingBlock(
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=block_size,
-            block_id=block_id,
-            allocator=allocator,
-            computed=computed,
-            extra_hash=extra_hash,
-        )
-
-    def allocate_immutable_block(self,
-                                 prev_block: Optional[Block],
-                                 token_ids: List[int],
-                                 extra_hash: Optional[int] = None,
-                                 device: Optional[Device] = None) -> Block:
-        """Allocates an immutable block with the given token IDs, reusing cached
-        blocks if possible.
-
-        Args:
-            prev_block (Optional[Block]): The previous block in the sequence.
-            token_ids (List[int]): The token IDs to be stored in the block.
- - Returns: - Block: The allocated immutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - # First, try to create a block that points to cached data - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=token_ids, - block_size=self._block_size, - physical_block_id=None, - extra_hash=extra_hash) - assert block.content_hash is not None - - cached_block_id = self._cached_blocks.get(block.content_hash, None) - if cached_block_id is not None: - self.metric_data.query(hit=True) - block.block_id = cached_block_id - self._incr_refcount_cached_block(block) - return block - self.metric_data.query(hit=False) - self._block_pool.free_block(block) - - # No cached block => Allocate a new block - block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - blocks = [] - for token_ids in block_token_ids: - prev_block = self.allocate_immutable_block(prev_block=prev_block, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - blocks.append(prev_block) - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a mutable block. If there are no free blocks, this will - evict unused cached blocks. - - Args: - prev_block (Block): The previous block in the sequence. - None is not allowed unlike it is super class. - - Returns: - Block: The allocated mutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id, - extra_hash=extra_hash) - assert not block.computed - assert block.content_hash is None - return block - - def _incr_refcount_cached_block(self, block: Block) -> None: - # Set this block to be "computed" since it is pointing to a - # cached block id (which was already computed) - block.computed = True - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.incr(block_id) - if refcount == 1: - # In case a cached block was evicted, restore its tracking - if block_id in self.evictor: - self.evictor.remove(block_id) - - self._track_block_id(block_id, computed=True) - - def _decr_refcount_cached_block(self, block: Block) -> None: - # Ensure this is immutable/cached block - assert block.content_hash is not None - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount > 0: - block.block_id = None - return - else: - assert refcount == 0 - - # No longer used - assert block.content_hash in self._cached_blocks - - # Add the cached block to the evictor - # (This keeps the cached block around so it can be reused) - self.evictor.add(block_id, block.content_hash, block.num_tokens_total, - self._block_tracker[block_id].last_accessed) - - # Stop tracking the block - self._untrack_block_id(block_id) - - block.block_id = None - - def _decr_refcount_hashless_block(self, block: Block) -> None: - block_id = block.block_id - assert block_id is not None - - # We may have a fork case where block is shared, - # in which case, we cannot remove it from tracking - refcount = 
self._refcounter.get(block_id) - if refcount == 1: - self._untrack_block_id(block_id) - - # Decrement refcount of the block_id, but do not free the block object - # itself (will be handled by the caller) - self._hashless_allocator.free(block, keep_block_object=True) - - def _allocate_block_id(self) -> BlockId: - """First tries to allocate a block id from the hashless allocator, - and if there are no blocks, then tries to evict an unused cached block. - """ - hashless_block_id = self._maybe_allocate_hashless_block_id() - if hashless_block_id is not None: - return hashless_block_id - - evicted_block_id = self._maybe_allocate_evicted_block_id() - if evicted_block_id is not None: - return evicted_block_id - - # No block available in hashless allocator, nor in unused cache blocks. - raise BlockAllocator.NoFreeBlocksError() - - def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]: - try: - # Allocate mutable block and extract its block_id - block = self._hashless_allocator.allocate_mutable_block( - prev_block=None) - block_id = block.block_id - self._block_pool.free_block(block) - - self._track_block_id(block_id, computed=False) - return block_id - except BlockAllocator.NoFreeBlocksError: - return None - - def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]: - if self.evictor.num_blocks == 0: - return None - - # Here we get an evicted block, which is only added - # into evictor if its ref counter is 0 - # and since its content would be changed, we need - # to remove it from _cached_blocks's tracking list - block_id, content_hash_to_evict = self.evictor.evict() - - # Sanity checks - assert content_hash_to_evict in self._cached_blocks - _block_id = self._cached_blocks[content_hash_to_evict] - assert self._refcounter.get(_block_id) == 0 - assert _block_id == block_id - - self._cached_blocks.pop(content_hash_to_evict) - - self._refcounter.incr(block_id) - self._track_block_id(block_id, computed=False) - - return block_id - - def _free_block_id(self, block: Block) -> None: - """Decrements the refcount of the block. The block may be in two - possible states: (1) immutable/cached or (2) mutable/hashless. - In the first case, the refcount is decremented directly and the block - may be possibly added to the evictor. In other case, hashless - allocator free(..) with keep_block_object=True is called to only free - the block id (since the block object may be reused by the caller) - """ - block_id = block.block_id - assert block_id is not None, "Freeing unallocated block is undefined" - - if block.content_hash is not None: - # Immutable: This type of block is always cached, and we want to - # keep it in the evictor for future reuse - self._decr_refcount_cached_block(block) - else: - # Mutable: This type of block is not cached, so we release it - # directly to the hashless allocator - self._decr_refcount_hashless_block(block) - - assert block.block_id is None - - def free(self, block: Block, keep_block_object: bool = False) -> None: - """Release the block (look at free_block_id(..) docs) - """ - # Release the physical block index - self._free_block_id(block) - - # Release the block object to the pool - if not keep_block_object: - self._block_pool.free_block(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. 
-
-        Returns:
-            List[Block]: The new sequence of blocks that shares the same memory
-                as the original sequence.
-        """
-        source_blocks = get_all_blocks_recursively(last_block)
-
-        forked_blocks: List[Block] = []
-        prev_block = None
-        for block in source_blocks:
-            block_id = block.block_id
-            assert block_id is not None
-
-            refcount = self._refcounter.incr(block_id)
-            assert refcount != 1, "can't fork free'd block_id = {}".format(
-                block_id)
-
-            forked_block = self._block_pool.init_block(
-                prev_block=prev_block,
-                token_ids=block.token_ids,
-                block_size=self._block_size,
-                physical_block_id=block_id,
-                extra_hash=block.extra_hash)
-
-            forked_blocks.append(forked_block)
-            prev_block = forked_blocks[-1]
-
-        return forked_blocks
-
-    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
-        assert device is None
-        # The number of free blocks is the number of hashless free blocks
-        # plus the number of blocks the evictor could free from its list.
-        return self._hashless_allocator.get_num_free_blocks(
-        ) + self.evictor.num_blocks
-
-    def get_num_total_blocks(self) -> int:
-        return self._hashless_allocator.get_num_total_blocks()
-
-    def get_physical_block_id(self, absolute_id: int) -> int:
-        """Returns the zero-offset block id on a certain block allocator
-        given the absolute block id.
-
-        Args:
-            absolute_id (int): The absolute block id for the block
-                in the whole allocator.
-
-        Returns:
-            int: The zero-offset block id on a certain device.
-        """
-        return sorted(self.all_block_ids).index(absolute_id)
-
-    @property
-    def all_block_ids(self) -> FrozenSet[int]:
-        return self._hashless_allocator.all_block_ids
-
-    def get_prefix_cache_hit_rate(self) -> float:
-        return self.metric_data.get_hit_rate()
-
-    def reset_prefix_cache(self) -> bool:
-        """Reset prefix cache. This function may be used in RLHF
-        flows to invalidate prefix caching after the weights are updated,
-        or used for resetting prefix caching status for benchmarking.
-
-        Returns:
-            bool: True if the prefix cache is successfully reset,
-                False otherwise.
-        """
-        num_used_blocks = (self.get_num_total_blocks() -
-                           self.get_num_free_blocks())
-        if num_used_blocks > 0:
-            logger.warning(
-                "Failed to reset prefix cache because some "
-                "blocks (%d) are not freed yet", num_used_blocks)
-            return False
-
-        # Free all blocks in the evictor.
-        while (block_id :=
-               self._maybe_allocate_evicted_block_id()) is not None:
-            self._hashless_allocator.free_block_id(block_id)
-
-        # Should not have any cached blocks because all blocks are evicted.
-        assert not self._cached_blocks
-
-        # Reset the evictor.
-        self.evictor = make_evictor(self.eviction_policy)
-
-        # Reset the block tracker.
-        for block_id in self._block_tracker:
-            self._block_tracker[block_id] = BlockTracker()
-
-        # Reset the metrics.
-        self.metric_data = CacheMetricData()
-
-        logger.info("Successfully reset prefix cache")
-        return True
-
-    def is_block_cached(self, block: Block) -> bool:
-        assert block.content_hash is not None
-        return block.content_hash in self._cached_blocks
-
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        """Once a mutable block is full, it can be promoted to an immutable
-        block. This means that its content can be referenced by future blocks
-        having the same prefix.
-
-        Note that if we already have a cached block with the same content, we
-        will replace the newly-promoted block's mapping with the existing cached
-        block id.
-
-        Args:
-            block: The mutable block to be promoted.
-
-        Returns:
-            BlockId: Either the original block index, or the block index of
-                the previously cached block matching the same content.
-        """
-        # Ensure block can be promoted
-        assert block.content_hash is not None
-        assert block.block_id is not None
-        assert self._refcounter.get(block.block_id) > 0
-
-        if block.content_hash not in self._cached_blocks:
-            # No cached content hash => Set this block as cached.
-            # Note that this block cannot be marked as computed yet
-            # because other sequences in the same batch cannot reuse
-            # this block.
-            self._cached_blocks[block.content_hash] = block.block_id
-            # Mark this block as touched so that it can be marked as
-            # computed after the entire batch of sequences are scheduled.
-            self._touched_blocks.add(block.block_id)
-            return block.block_id
-
-        # Reuse the cached content hash
-        self._decr_refcount_hashless_block(block)
-        block.block_id = self._cached_blocks[block.content_hash]
-
-        # Increment refcount of the cached block and (possibly) restore
-        # it from the evictor.
-        # Note that in this case, the block is marked as computed.
-        self._incr_refcount_cached_block(block)
-
-        return block.block_id
-
-    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
-        """Performs a copy-on-write operation on the given block if it is not
-        appendable.
-
-        Args:
-            block (Block): The block to check for copy-on-write.
-
-        Returns:
-            BlockId: The block index of the new block if a copy-on-write
-                operation was performed, or the original block index if
-                no copy-on-write was necessary.
-        """
-        src_block_id = block.block_id
-        assert src_block_id is not None
-
-        if self._cow_tracker.is_appendable(block):
-            return src_block_id
-
-        self._free_block_id(block)
-        trg_block_id = self._allocate_block_id()
-
-        self._cow_tracker.record_cow(src_block_id, trg_block_id)
-
-        return trg_block_id
-
-    def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
-        """Returns the copy-on-write source->destination mapping and clears it.
-
-        Returns:
-            List[Tuple[BlockId, BlockId]]: A list mapping source
-                block indices to destination block indices.
-        """
-        return self._cow_tracker.clear_cows()
-
-    def mark_blocks_as_accessed(self, block_ids: List[int],
-                                now: float) -> None:
-        """Mark blocks as accessed, used in prefix caching.
-
-        If a block has been added to the evictor, we also need to update
-        the corresponding info in the evictor's metadata.
-        """
-
-        for block_id in block_ids:
-            if self._block_tracker[block_id].active:
-                self._block_tracker[block_id].last_accessed = now
-            elif block_id in self.evictor:
-                self.evictor.update(block_id, now)
-            else:
-                raise ValueError(
-                    "Marked a block as accessed that does not belong to GPU")
-
-    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        # Mark all touched blocks as computed.
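# --- Editorial aside (not part of the original patch) ---
# Two-phase marking, sketched: promotion adds ids to the "touched" set
# during scheduling, and only this post-batch hook flips them to
# "computed". This keeps sequences scheduled in the same batch from
# treating each other's brand-new blocks as reusable prefix cache:
#     touched = {7, 9}            # ids promoted while scheduling
#     mark_blocks_as_computed([]) # called once the batch is scheduled
#     # -> tracker[7].computed and tracker[9].computed are now True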
-        for block_id in self._touched_blocks:
-            self._block_tracker[block_id].computed = True
-        self._touched_blocks.clear()
-
-    def _track_block_id(self, block_id: Optional[BlockId],
-                        computed: bool) -> None:
-        assert block_id is not None
-        self._block_tracker[block_id].enable()
-        self._block_tracker[block_id].computed = computed
-
-    def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
-        assert block_id is not None
-        self._block_tracker[block_id].disable()
-
-    def block_is_computed(self, block_id: int) -> bool:
-        if self._block_tracker[block_id].active:
-            return self._block_tracker[block_id].computed
-        else:
-            return block_id in self.evictor
-
-    def get_common_computed_block_ids(
-            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
-        """Return the block ids that are common for a given sequence group.
-
-        Only blocks that are immutable and have already been marked as
-        computed are taken into consideration.
-        """
-
-        # NOTE We exclude the last block to avoid the case where the entire
-        # prompt is cached. This would cause erroneous behavior in model
-        # runner.
-
-        # It returns a list of int, although the type annotation says
-        # list of string.
-        if len(computed_seq_block_ids) == 1:
-            return computed_seq_block_ids[0]
-
-        return commonprefix([
-            ids for ids in computed_seq_block_ids  # type: ignore
-            if ids
-        ])
-
-    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-        """Returns the number of full blocks that will be touched by
-        swapping in/out.
-
-        Args:
-            blocks: List of blocks to be swapped.
-
-        Returns:
-            int: the number of full blocks that will be touched by
-                swapping in/out the given blocks. Non-full blocks are ignored
-                when deciding the number of blocks to touch.
-        """
-        num_touched_blocks: int = 0
-        for block in blocks:
-            # If the block has a match in the cache and the cached
-            # block is not referenced, then we still count it as a
-            # touched block.
-            if block.is_full and (not self.is_block_cached(block) or \
-                (block.content_hash is not None and \
-                    self._cached_blocks[block.content_hash] in \
-                        self.evictor)):
-                num_touched_blocks += 1
-        return num_touched_blocks
-
-    def swap_out(self, blocks: List[Block]) -> None:
-        """Execute the swap out actions. Basically just free the
-        given blocks.
-
-        Args:
-            blocks: List of blocks to be swapped out.
-        """
-        for block in blocks:
-            self._free_block_id(block)
-
-    def swap_in(self, blocks: List[Block]) -> None:
-        """Execute the swap in actions. Change the block id from the
-        old allocator to the current allocator for each block to finish
-        the block table update.
-
-        Args:
-            blocks: List of blocks to be swapped in.
-        """
-        for block in blocks:
-            # Here we allocate either an immutable or a mutable block and then
-            # extract its block_id. Note that the block object is released
-            # and the block_id is assigned to "block" to allow reusing the
-            # existing "block" object.
-            if block.is_full:
-                tmp_block = self.allocate_immutable_block(
-                    prev_block=block.prev_block,
-                    token_ids=block.token_ids,
-                    extra_hash=block.extra_hash)
-            else:
-                tmp_block = self.allocate_mutable_block(
-                    prev_block=block.prev_block, extra_hash=block.extra_hash)
-                tmp_block.append_token_ids(block.token_ids)
-
-            block_id = tmp_block.block_id
-            self._block_pool.free_block(tmp_block)
-
-            block.block_id = block_id  # Assign block_id
-
-    def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
-        """
-        Given a list of block hashes, return the prefix of the block hashes that
-        are all cached.
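# --- Editorial aside (illustrative sketch, not part of the original patch) ---
# Why binary search is valid here: "is cached" is monotone over the block
# hash sequence (once one block is missing, all later blocks are missing),
# so bisecting on the first miss recovers the cached prefix. `cached` is a
# hypothetical set of cached hashes; key= requires Python 3.10+.
from bisect import bisect_left

cached = {"h0", "h1"}
block_hashes = ["h0", "h1", "h2", "h3"]
idx = bisect_left(block_hashes, True, key=lambda h: h not in cached)
assert block_hashes[:idx] == ["h0", "h1"]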
- - Since a block's block hash includes the hashes of all previous blocks, - and we only allocate/deallocate blocks in the entire sequence, so if a - block is cached, then all previous blocks are also cached. With this - property, we can use binary search to find the prefix of cached blocks. - - Args: - block_hashes (List[int]): The list of block hashes. - - Returns: - List[int]: The prefix of the `block_hashes` that are cached. - """ - - def _block_is_cached(block_hash: PrefixHash) -> bool: - if block_hash not in self._cached_blocks: - return False - - cached_block_id = self._cached_blocks[block_hash] - # We only consider the blocks that are marked as computed. - return self.block_is_computed(cached_block_id) - - def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int: - - # python <= 3.10 don't have the key argument - if sys.version_info < (3, 10): - a = [key(e) for e in a] - return bisect_left(a, x) - else: - return bisect_left(a, x, key=key) - - # Look for the first block that's not cached, and returns the prefix - # i.e. blocks that are cached. - idx = _bisect_left(block_hashes, - True, - key=lambda x: not _block_is_cached(x)) - return block_hashes[:idx] - - -class PrefixCachingBlock(Block): - """A block implementation that supports prefix caching. - - The PrefixCachingBlock class represents a block of token IDs with prefix - caching capabilities. It wraps a NaiveBlock internally and provides - additional functionality for content hashing and promoting immutable blocks - with the prefix caching allocator. - - Args: - prev_block (Optional[PrefixCachingBlock]): The previous block in the - sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The prefix - caching block allocator associated with this block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None. - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ): - assert isinstance(allocator, PrefixCachingBlockAllocator), ( - "Currently this class is only tested with " - "PrefixCachingBlockAllocator. 
Got instead allocator = {}".format( - allocator)) - assert_prefix_caching_block_or_none(prev_block) - - self._prev_block = prev_block - self._cached_content_hash: Optional[int] = None - self._cached_num_tokens_total: int = 0 - self._allocator = allocator - self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self._computed = computed - self._extra_hash = extra_hash - - # On the first time, we create the block object, and next we only - # reinitialize it - if hasattr(self, "_block"): - self._block.__init__( # type: ignore[has-type] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - else: - self._block = NaiveBlock(prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - - self._update_num_tokens_total() - - def _update_num_tokens_total(self): - """Incrementally computes the number of tokens that there is - till the current block (included) - """ - res = 0 - - # Add all previous blocks - if self._prev_block is not None: - res += self._prev_block.num_tokens_total - - # Add current block - res += len(self.token_ids) - - self._cached_num_tokens_total = res - - @property - def computed(self) -> bool: - return self._computed - - @computed.setter - def computed(self, value) -> None: - self._computed = value - - @property - def last_accessed(self) -> float: - return self._last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._last_accessed = last_accessed_ts - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and registers the block as - immutable if the block becomes full. - - Args: - token_ids (List[int]): The token IDs to be appended to the block. - """ - # Ensure this is mutable block (not promoted) - assert self.content_hash is None - assert not self.computed - - if len(token_ids) == 0: - return - - # Ensure there are input tokens - assert token_ids, "Got token_ids = {}".format(token_ids) - - # Naive block handles CoW. - self._block.append_token_ids(token_ids) - self._update_num_tokens_total() - - # If the content hash is present, then the block can be made immutable. - # Register ourselves with the allocator, potentially replacing the - # physical block index. - if self.content_hash is not None: - self.block_id = self._allocator.promote_to_immutable_block(self) - - @property - def block_id(self) -> Optional[int]: - return self._block.block_id - - @block_id.setter - def block_id(self, value) -> None: - self._block.block_id = value - - @property - def is_full(self) -> bool: - return self._block.is_full - - @property - def num_empty_slots(self) -> int: - return self._block.num_empty_slots - - @property - def num_tokens_total(self) -> int: - return self._cached_num_tokens_total - - @property - def block_size(self) -> int: - return self._block.block_size - - @property - def token_ids(self) -> List[int]: - return self._block.token_ids - - @property - def prev_block(self) -> Optional[Block]: - return self._prev_block - - @property - def extra_hash(self) -> Optional[int]: - return self._extra_hash - - @property - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined. - - For the content-based hash to be defined, the current block must be - full. - """ - # If the hash is already computed, return it. 
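# --- Editorial aside (illustrative sketch, not part of the original patch) ---
# Hash chaining in miniature: each full block's hash folds in the previous
# block's hash, so equal hashes imply equal token *prefixes*, not merely
# equal blocks. Plain hash() stands in for the scheme defined below.
h0 = hash((True, None, 1, 2, 3, 4))  # first block, tokens [1, 2, 3, 4]
h1 = hash((False, h0, 5, 6, 7, 8))   # second block chains on h0
# A sequence starting [1, 2, 3, 4, 9, ...] yields a different second-block
# hash, so its block can never alias the cached one.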
- if self._cached_content_hash is not None: - return self._cached_content_hash - - # We cannot compute a hash for the current block because it is not full. - if not self.is_full: - return None - - is_first_block = self._prev_block is None - prev_block_hash = ( - self._none_hash if is_first_block else - self._prev_block.content_hash # type: ignore - ) - - # Previous block exists but does not yet have a hash. - # Return no hash in this case. - if prev_block_hash == self._none_hash and not is_first_block: - return None - - self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block, - prev_block_hash, - cur_block_token_ids=self.token_ids, - extra_hash=self._extra_hash) - return self._cached_content_hash - - @classmethod - def hash_block_tokens(cls, - is_first_block: bool, - prev_block_hash: Optional[int], - cur_block_token_ids: List[int], - extra_hash: Optional[int] = None) -> int: - """Computes a hash value corresponding to the contents of a block and - the contents of the preceding block(s). The hash value is used for - prefix caching. - - Parameters: - - is_first_block (bool): A flag indicating if the block is the first in - the sequence. - - prev_block_hash (Optional[int]): The hash of the previous block. None - if this is the first block. - - cur_block_token_ids (List[int]): A list of token ids in the current - block. The current block is assumed to be full. - - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - - Returns: - - int: The computed hash value for the block. - """ - if is_first_block and prev_block_hash is None: - prev_block_hash = cls._none_hash - return hash((is_first_block, prev_block_hash, *cur_block_token_ids, - extra_hash)) - - -class ComputedBlocksTracker: - """ - Tracks the computed blocks for each sequence. - - Internally, it maintains a map from sequence id to the list of block hashes - for the sequence. We cache the hashes of the full blocks for each sequence, - and make sure the hash is calculated in the same way as the allocator. - When a sequence is being decoded, we also update the sequence's hash - accordingly and incrementally. - - From the sequence hash, with prefix caching enabled, we could also calculate - the number of cached tokens for the sequence by looking up the number of - cached block hashes in the allocator. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - allocator: DeviceAwareBlockAllocator, - block_size: int, - enable_caching: bool, - ): - self._allocator = allocator - self._block_size = block_size - self._enable_caching = enable_caching - - # A map from seq_id to the list of block hashes for the - # sequence. This is so that we don't have to recompute the block hashes - # for the sequence when we need to check if the sequence is cached. - # Note a block that's not full will not have its hash calculated and - # recorded. - self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} - - # A map from seq_id to the number of tokens that are cached for the - # sequence. 
- # We need this so that a sequence in continuous prefill doesn't - # accidentally see its cached token count change. See comments in - # `get_num_cached_tokens` for more details. - self._seq_id_to_num_tokens_computed: Dict[int, int] = {} - - def _update_seq_hashes(self, seq: Sequence) -> None: - """Incrementally update the sequence's block hashes and record them.""" - assert self._enable_caching - - block_hashes_recorded = self._seq_id_to_blocks_hashes.get( - seq.seq_id, []) - cur_num_blocks_recorded = len(block_hashes_recorded) - token_ids = seq.get_token_ids() - assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, ( - f"The sequence has {len(token_ids)} tokens, but" - f" already recorded {cur_num_blocks_recorded} blocks. " - "This should not happen since we assume blocks are " - "only appended other than recomputation. When the sequence is " - "recomputed, we should have removed the info of the old blocks.") - # Update the computed block hashes for the sequence. Since only full - # blocks are considered as "computed", we take floor here. - num_computed_blocks = len(token_ids) // self._block_size - - # We need to know the hash of the previous block to compute the hash of - # the current block so that blocks could be uniquely identified across - # sequences of prefixes. - prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else - block_hashes_recorded[-1]) - # Only update the computed block hashes for the new blocks - for i in range(cur_num_blocks_recorded, num_computed_blocks): - assert len(token_ids) >= (i + 1) * self._block_size - block_token_ids = token_ids[i * self._block_size:(i + 1) * - self._block_size] - - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # This has to be kept in sync with the allocator's hash - # calculation. - block_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block=prev_block_hash == self._none_hash, - prev_block_hash=prev_block_hash, - cur_block_token_ids=block_token_ids, - extra_hash=extra_hash, - ) - block_hashes_recorded.append(block_hash) - prev_block_hash = block_hash - - self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded - - def get_num_cached_tokens(self, seq: Sequence) -> int: - if not self._enable_caching: - return 0 - - # We always try to update the sequence hashes on the fly. - # This is to ensure that we don't miss any cached tokens for the - # sequence during decode. - # This routine should only update hash for any new blocks too. - self._update_seq_hashes(seq) - - num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get( - seq.seq_id, None) - - # TODO(rickyx): This hack could be removed once we mark blocks as - # computed correctly with chunked prefills. - if num_computed_tokens_prev is not None and seq.is_prefill(): - # For a sequence that is still in prefill, we don't - # recompute the number of cached tokens. - # This also handles correctly chunked prefill since currently - # we mark blocks as computed even if the sequence is still partially - # prefilled. So a continuously prefilled sequence should not - # see its cached token count change while running. - return num_computed_tokens_prev - - block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id] - - # This is O(logN), where N is the number of blocks. 
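# The find_cached_blocks_prefix call below is the O(log N) step referenced
# above: cache hits form a monotone prefix (a miss is never followed by a
# hit), so binary search over a boolean key finds the boundary. A minimal,
# self-contained sketch of that idea -- the hashes and cached set here are
# invented for illustration, not the allocator's real state:

from bisect import bisect_left  # the `key=` argument requires Python >= 3.10


def cached_prefix(block_hashes: list[int], cached: set[int]) -> list[int]:
    # key maps each hash to False (cached) or True (miss); the mapped list
    # is monotone [False..., True...], so bisect_left finds the first miss.
    idx = bisect_left(block_hashes, True, key=lambda h: h not in cached)
    return block_hashes[:idx]


assert cached_prefix([10, 11, 12, 13], {10, 11}) == [10, 11]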
- num_cached_blocks = len( - self._allocator.find_cached_blocks_prefix(block_hashes)) - num_cached_tokens = num_cached_blocks * self._block_size - self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens - return num_cached_tokens - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking the sequence.""" - if not self._enable_caching: - return - assert seq_id in self._seq_id_to_blocks_hashes - del self._seq_id_to_blocks_hashes[seq_id] - - assert seq_id in self._seq_id_to_num_tokens_computed - del self._seq_id_to_num_tokens_computed[seq_id] - - -class LastAccessBlocksTracker: - """Manages the last access time of the tracked sequences, in order to allow - an efficient update of allocator's block last access times - """ - - def __init__(self, allocator): - self._allocator = allocator - self._seq_last_access: Dict[int, Optional[float]] = {} - - def add_seq(self, seq_id: int) -> None: - """Start tracking seq_id - """ - assert seq_id not in self._seq_last_access - self._seq_last_access[seq_id] = None - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking seq_id - """ - assert seq_id in self._seq_last_access - del self._seq_last_access[seq_id] - - def update_last_access(self, seq_id: int, time: float) -> None: - assert seq_id in self._seq_last_access - self._seq_last_access[seq_id] = time - - def update_seq_blocks_last_access(self, seq_id: int, - block_ids: List[int]) -> None: - assert seq_id in self._seq_last_access - - ts = self._seq_last_access[seq_id] - - if ts is None: - # No last access was recorded, no need to update. - return - - self._allocator.mark_blocks_as_accessed(block_ids, ts) - - -def assert_prefix_caching_block_or_none(block: Optional[Block]): - if block is None: - return - assert isinstance(block, - PrefixCachingBlock), "Got block = {}".format(block) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py deleted file mode 100644 index e933c6ee7c8bd..0000000000000 --- a/vllm/core/block/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Block manager utils.""" -from vllm.sequence import SequenceGroup -from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) - - -def check_no_caching_or_swa_for_blockmgr_encdec( - block_mgr, seq_group: SequenceGroup) -> None: - ''' - Enforce that prefix caching & sliding-window attention (SWA) - are currently unsupported *specifically* for encoder/decoder models. - - Raises NotImplementedError if unsupported scenario is detected. 
- - Arguments: - - * block_mgr: BlockSpaceManager instance - * seq_group: SequenceGroup passed to block_mgr - ''' - - if seq_group.is_encoder_decoder(): - if block_mgr.max_block_sliding_window is not None: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if block_mgr.enable_caching: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py deleted file mode 100644 index cbfa4d7ff3c4c..0000000000000 --- a/vllm/core/block_manager.py +++ /dev/null @@ -1,523 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A block manager that manages token blocks.""" -from typing import Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.core.block.block_table import BlockTable -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.interfaces import Block -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - LastAccessBlocksTracker) -from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device - -SeqId = int -EncoderSeqId = str - - -class SelfAttnBlockSpaceManager(BlockSpaceManager): - """BlockSpaceManager which manages the allocation of KV cache. - - It owns responsibility for allocation, swapping, allocating memory for - autoregressively-generated tokens, and other advanced features such as - prefix caching, forking/copy-on-write, and sliding-window memory allocation. - - This class implements the design described in - https://github.com/vllm-project/vllm/pull/3492. - - Lookahead slots - The block manager has the notion of a "lookahead slot". These are slots - in the KV cache that are allocated for a sequence. Unlike the other - allocated slots, the content of these slots is undefined -- the worker - may use the memory allocations in any way. - - In practice, a worker could use these lookahead slots to run multiple - forward passes for a single scheduler invocation. Each successive - forward pass would write KV activations to the corresponding lookahead - slot. This allows low inter-token latency use-cases, where the overhead - of continuous batching scheduling is amortized over >1 generated tokens. - - Speculative decoding uses lookahead slots to store KV activations of - proposal tokens. - - See https://github.com/vllm-project/vllm/pull/3250 for more information - on lookahead scheduling. - - Args: - block_size (int): The size of each memory block. - num_gpu_blocks (int): The number of memory blocks allocated on GPU. - num_cpu_blocks (int): The number of memory blocks allocated on CPU. - watermark (float, optional): The threshold used for memory swapping. - Defaults to 0.01. - sliding_window (Optional[int], optional): The size of the sliding - window. Defaults to None. - enable_caching (bool, optional): Flag indicating whether caching is - enabled. Defaults to False. 
- """ - - def __init__( - self, - block_size: int, - num_gpu_blocks: int, - num_cpu_blocks: int, - watermark: float = 0.01, - sliding_window: Optional[int] = None, - enable_caching: bool = False, - ) -> None: - self.block_size = block_size - self.num_total_gpu_blocks = num_gpu_blocks - self.num_total_cpu_blocks = num_cpu_blocks - - self.sliding_window = sliding_window - # max_block_sliding_window is the max number of blocks that need to be - # allocated - self.max_block_sliding_window = None - if sliding_window is not None: - # +1 here because // rounds down - num_blocks = sliding_window // block_size + 1 - # +1 here because the last block may not be full, - # and so the sequence stretches one more block at the beginning - # For example, if sliding_window is 3 and block_size is 4, - # we may need 2 blocks when the second block only holds 1 token. - self.max_block_sliding_window = num_blocks + 1 - - self.watermark = watermark - assert watermark >= 0.0 - - self.enable_caching = enable_caching - - self.watermark_blocks = int(watermark * num_gpu_blocks) - - self.block_allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching" if enable_caching else "naive", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - self.block_tables: Dict[SeqId, BlockTable] = {} - self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} - - self._computed_blocks_tracker = ComputedBlocksTracker( - self.block_allocator, self.block_size, self.enable_caching) - self._last_access_blocks_tracker = LastAccessBlocksTracker( - self.block_allocator) - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # FIXME(woosuk): Here we assume that all sequences in the group share - # the same prompt. This may not be true for preempted sequences. - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = BlockTable.get_num_required_blocks( - seq.get_token_ids(), - block_size=self.block_size, - num_lookahead_slots=num_lookahead_slots, - ) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - num_required_blocks += BlockTable.get_num_required_blocks( - encoder_seq.get_token_ids(), - block_size=self.block_size, - ) - - if self.max_block_sliding_window is not None: - num_required_blocks = min(num_required_blocks, - self.max_block_sliding_window) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - device=Device.GPU) - - # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks - < self.watermark_blocks): - return AllocStatus.NEVER - if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def _allocate_sequence(self, seq: Sequence) -> BlockTable: - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - max_block_sliding_window=self.max_block_sliding_window, - ) - if seq.get_token_ids(): - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # Add blocks to the block table only if the sequence is non empty. 
- block_table.allocate(token_ids=seq.get_token_ids(), - extra_hash=extra_hash) - - return block_table - - def allocate(self, seq_group: SequenceGroup) -> None: - - # Allocate self-attention block tables for decoder sequences - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert not (set(seq.seq_id for seq in waiting_seqs) - & self.block_tables.keys()), "block table already exists" - - # NOTE: Here we assume that all sequences in the group have the same - # prompt. - seq = waiting_seqs[0] - block_table: BlockTable = self._allocate_sequence(seq) - self.block_tables[seq.seq_id] = block_table - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Assign the block table for each sequence. - for seq in waiting_seqs[1:]: - self.block_tables[seq.seq_id] = block_table.fork() - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Allocate cross-attention block table for encoder sequence - # - # NOTE: Here we assume that all sequences in the group have the same - # encoder prompt. - request_id = seq_group.request_id - - assert (request_id - not in self.cross_block_tables), \ - "block table already exists" - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - block_table = self._allocate_sequence(encoder_seq) - self.cross_block_tables[request_id] = block_table - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """Determine if there is enough space in the GPU KV cache to continue - generation of the specified sequence group. - - We use a worst-case heuristic: assume each touched block will require a - new allocation (either via CoW or new block). We can append slots if the - number of touched blocks is less than the number of free blocks. - - "Lookahead slots" are slots that are allocated in addition to the slots - for known tokens. The contents of the lookahead slots are not defined. - This is used by speculative decoding when speculating future tokens. - """ - - num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - - num_touched_blocks += ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=block_table.get_unseen_token_ids( - seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - )) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - Device.GPU) - return num_touched_blocks <= num_free_gpu_blocks - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - - block_table = self.block_tables[seq.seq_id] - - block_table.append_token_ids( - token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - num_computed_slots=seq.data.get_num_computed_tokens(), - extra_hash=seq.extra_hash(), - ) - # Return any new copy-on-writes. - new_cows = self.block_allocator.clear_copy_on_writes() - return new_cows - - def free(self, seq: Sequence) -> None: - seq_id = seq.seq_id - - if seq_id not in self.block_tables: - # Already freed or haven't been scheduled yet. 
- return - - # Update seq block ids with the latest access time - self._last_access_blocks_tracker.update_seq_blocks_last_access( - seq_id, self.block_tables[seq.seq_id].physical_block_ids) - - # Untrack seq - self._last_access_blocks_tracker.remove_seq(seq_id) - self._computed_blocks_tracker.remove_seq(seq_id) - - # Free table/blocks - self.block_tables[seq_id].free() - del self.block_tables[seq_id] - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - seq_id = seq.seq_id - self._computed_blocks_tracker.remove_seq(seq_id) - - def free_cross(self, seq_group: SequenceGroup) -> None: - request_id = seq_group.request_id - if request_id not in self.cross_block_tables: - # Already freed or hasn't been scheduled yet. - return - self.cross_block_tables[request_id].free() - del self.cross_block_tables[request_id] - - def get_block_table(self, seq: Sequence) -> List[int]: - block_ids = self.block_tables[seq.seq_id].physical_block_ids - return block_ids # type: ignore - - def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: - request_id = seq_group.request_id - assert request_id in self.cross_block_tables - block_ids = self.cross_block_tables[request_id].physical_block_ids - assert all(b is not None for b in block_ids) - return block_ids # type: ignore - - def access_all_blocks_in_seq(self, seq: Sequence, now: float): - if self.enable_caching: - # Record the latest access time for the sequence. The actual update - # of the block ids is deferred to the sequence free(..) call, since - # only during freeing of block ids, the blocks are actually added to - # the evictor (which is when the most updated time is required) - # (This avoids expensive calls to mark_blocks_as_accessed(..)) - self._last_access_blocks_tracker.update_last_access( - seq.seq_id, now) - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - # If prefix caching is enabled, mark immutable blocks as computed - # right after they have been scheduled (for prefill). This assumes - # the scheduler is synchronous so blocks are actually computed when - # scheduling the next batch. - self.block_allocator.mark_blocks_as_computed([]) - - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - """Determine which blocks for which we skip prefill. - - With prefix caching we can skip prefill for previously-generated blocks. - Currently, the attention implementation only supports skipping cached - blocks if they are a contiguous prefix of cached blocks. - - This method determines which blocks can be safely skipped for all - sequences in the sequence group. - """ - computed_seq_block_ids = [] - for seq in seqs: - all_blocks = self.block_tables[seq.seq_id].physical_block_ids - num_cached_tokens = ( - self._computed_blocks_tracker.get_num_cached_tokens(seq)) - assert num_cached_tokens % self.block_size == 0 - num_cached_blocks = num_cached_tokens // self.block_size - computed_block_ids = all_blocks[:num_cached_blocks] - computed_seq_block_ids.append(computed_block_ids) - - # NOTE(sang): This assumes seq_block_ids doesn't contain any None. - return self.block_allocator.get_common_computed_block_ids( - computed_seq_block_ids) # type: ignore - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - if parent_seq.seq_id not in self.block_tables: - # Parent sequence has either been freed or never existed. 
- return - src_block_table = self.block_tables[parent_seq.seq_id] - self.block_tables[child_seq.seq_id] = src_block_table.fork() - - # Track child seq - self._last_access_blocks_tracker.add_seq(child_seq.seq_id) - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - """Returns the AllocStatus for the given sequence_group - with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap in. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for the given sequence group. - """ - return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, - num_lookahead_slots) - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from CPU to GPU) generated by - swapping in the given seq_group with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap in. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from CPU - to GPU. - """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.CPU, - dst_device=Device.GPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id): - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id) - for cpu_block_id, gpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - """Returns whether we can swap out the given sequence_group - with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap out. - - Returns: - bool: Whether it's possible to swap out current sequence group. - """ - alloc_status = self._can_swap(seq_group, Device.CPU, - SequenceStatus.RUNNING) - return alloc_status == AllocStatus.OK - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from GPU to CPU) generated by - swapping out the given sequence_group with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap out. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from - GPU to CPU. 
- """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.GPU, - dst_device=Device.CPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id): - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id) - for gpu_block_id, cpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def get_num_free_gpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.GPU) - - def get_num_free_cpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.CPU) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_allocator.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_allocator.reset_prefix_cache(device) - - def _can_swap(self, - seq_group: SequenceGroup, - device: Device, - status: SequenceStatus, - num_lookahead_slots: int = 0) -> AllocStatus: - """Returns the AllocStatus for swapping in/out the given sequence_group - on to the 'device'. - - Args: - seq_group (SequenceGroup): The sequence group to swap in/out. - device (Device): device to swap the 'seq_group' on. - status (SequenceStatus): The status of sequence which is needed - for action. RUNNING for swap out and SWAPPED for swap in - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for swapping in/out the given - sequence_group on to the 'device'. - """ - # First determine the number of blocks that will be touched by this - # swap. Then verify if there are available blocks in the device - # to perform the swap. - num_blocks_touched = 0 - blocks: List[Block] = [] - for seq in seq_group.get_seqs(status=status): - block_table = self.block_tables[seq.seq_id] - if block_table.blocks is not None: - # Compute the number blocks to touch for the tokens to be - # appended. This does NOT include the full blocks that need - # to be touched for the swap. - num_blocks_touched += \ - block_table.get_num_blocks_touched_by_append_slots( - block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots) - blocks.extend(block_table.blocks) - # Compute the number of full blocks to touch and add it to the - # existing count of blocks to touch. - num_blocks_touched += self.block_allocator.get_num_full_blocks_touched( - blocks, device=device) - - watermark_blocks = 0 - if device == Device.GPU: - watermark_blocks = self.watermark_blocks - - if self.block_allocator.get_num_total_blocks( - device) < num_blocks_touched: - return AllocStatus.NEVER - elif self.block_allocator.get_num_free_blocks( - device) - num_blocks_touched >= watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def get_num_cached_tokens(self, seq: Sequence) -> int: - """Get the number of tokens in blocks that are already computed and - cached in the block manager for the sequence. 
- """ - return self._computed_blocks_tracker.get_num_cached_tokens(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py deleted file mode 100644 index 85ff6bc9ca610..0000000000000 --- a/vllm/core/evictor.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import heapq -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple - - -class EvictionPolicy(enum.Enum): - """Enum for eviction policy used by make_evictor to instantiate the correct - Evictor subclass. - """ - LRU = enum.auto() - - -class Evictor(ABC): - """The Evictor subclasses should be used by the BlockAllocator class to - handle eviction of freed Blocks. - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __contains__(self, block_id: int) -> bool: - pass - - @abstractmethod - def evict(self) -> Tuple[int, int]: - """Runs the eviction algorithm and returns the evicted block's - content hash along with physical block id along with physical block id - """ - pass - - @abstractmethod - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - """Adds block to the evictor, making it a candidate for eviction""" - pass - - @abstractmethod - def update(self, block_id: int, last_accessed: float): - """Update corresponding block's access time in metadata""" - pass - - @abstractmethod - def remove(self, block_id: int): - """Remove a given block id from the cache.""" - pass - - @property - @abstractmethod - def num_blocks(self) -> int: - pass - - -class BlockMetaData: - """Data structure for storing key data describe cached block, so that - evictor could use to make its decision which one to choose for eviction - - Here we use physical block id as the dict key, as there maybe several - blocks with the same content hash, but their physical id is unique. - """ - - def __init__(self, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.content_hash = content_hash - self.num_hashed_tokens = num_hashed_tokens - self.last_accessed = last_accessed - - -class LRUEvictor(Evictor): - """Evicts in a least-recently-used order using the last_accessed timestamp - that's recorded in the Block. If there are multiple blocks with - the same last_accessed time, then the one with the largest num_hashed_tokens - will be evicted. If two blocks each have the lowest last_accessed time and - highest num_hashed_tokens value, then one will be chosen arbitrarily - """ - - # CLEANUP_THRESHOLD determines the maximum allowable size of the priority - # queue relative to the free table size. When this threshold is exceeded, - # a cleanup operation is triggered to reduce memory usage. - CLEANUP_THRESHOLD = 50 - - def __init__(self): - self.free_table: Dict[int, BlockMetaData] = {} - self.priority_queue = [] - - def __contains__(self, block_id: int) -> bool: - return block_id in self.free_table - - def evict(self) -> Tuple[int, int]: - if len(self.free_table) == 0: - raise ValueError("No usable cache memory left") - - while self.priority_queue: - # We do not remove outdated entries from the priority queue at the - # time of updating the last_accessed timestamp. Instead, outdated - # entries are filtered out here during eviction. Outdated entries - # would either not in the free table, or have older last accessed - # time. 
- last_accessed, _, block_id, content_hash = heapq.heappop( - self.priority_queue) - if (block_id in self.free_table and - self.free_table[block_id].last_accessed == last_accessed): - self.free_table.pop(block_id) - return block_id, content_hash - - raise ValueError("No usable cache memory left") - - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.free_table[block_id] = BlockMetaData(content_hash, - num_hashed_tokens, - last_accessed) - heapq.heappush( - self.priority_queue, - (last_accessed, -num_hashed_tokens, block_id, content_hash)) - self._cleanup_if_necessary() - - def update(self, block_id: int, last_accessed: float): - self.free_table[block_id].last_accessed = last_accessed - - def _cleanup_if_necessary(self): - if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len( - self.free_table): - self._cleanup() - - def _cleanup(self): - new_priority_queue: List[Tuple[float, int, int, int]] = [] - - for block_id, block in self.free_table.items(): - new_priority_queue.append( - (block.last_accessed, -block.num_hashed_tokens, block_id, - block.content_hash)) - heapq.heapify(new_priority_queue) - - self.priority_queue = new_priority_queue - - def remove(self, block_id: int): - if block_id not in self.free_table: - raise ValueError( - "Attempting to remove block that's not in the evictor") - self.free_table.pop(block_id) - - @property - def num_blocks(self) -> int: - return len(self.free_table) - - -def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: - if eviction_policy == EvictionPolicy.LRU: - return LRUEvictor() - else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py deleted file mode 100644 index 69b9169ddd8a9..0000000000000 --- a/vllm/core/interfaces.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -from abc import ABC, abstractmethod -from typing import List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class AllocStatus(enum.Enum): - """Result for BlockSpaceManager.can_allocate - - 1. Ok: seq_group can be allocated now. - 2. Later: seq_group cannot be allocated. - The capacity of allocator is larger than seq_group required. - 3. Never: seq_group can never be allocated. - The seq_group is too large to allocated in GPU. 
- """ - OK = enum.auto() - LATER = enum.auto() - NEVER = enum.auto() - - -class BlockSpaceManager(ABC): - - @staticmethod - def get_block_space_manager_class(version: str): - version = version.lower() - - if version == "selfattn": - from vllm.core.block_manager import SelfAttnBlockSpaceManager - return SelfAttnBlockSpaceManager - - if version == "placeholder": - from vllm.core.placeholder_block_space_manager import ( - PlaceholderBlockSpaceManager) - return PlaceholderBlockSpaceManager - - raise ValueError(f"Unknown version {version=}") - - @abstractmethod - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - pass - - @abstractmethod - def allocate(self, seq_group: SequenceGroup) -> None: - pass - - @abstractmethod - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - pass - - @abstractmethod - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - @abstractmethod - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - pass - - @abstractmethod - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - pass - - @abstractmethod - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def free(self, seq: Sequence) -> None: - pass - - @abstractmethod - def get_block_table(self, seq: Sequence) -> List[int]: - pass - - @abstractmethod - def get_num_free_gpu_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_cpu_blocks(self) -> int: - pass - - @abstractmethod - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - pass - - @abstractmethod - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - pass - - @abstractmethod - def get_num_cached_tokens(self, seq: Sequence) -> int: - pass - - @abstractmethod - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - pass \ No newline at end of file diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py deleted file mode 100644 index 679515924e85d..0000000000000 --- a/vllm/core/placeholder_block_space_manager.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class PlaceholderBlockSpaceManager(BlockSpaceManager): - """A version of BlockSpaceManager for use in environments - where block management is not required. - For example: pooling models or attention-free models like Mamba. 
- - This class provides the same interface as BlockSpaceManager, but its - methods perform no actions or return simple values like True in specific - actions. It's designed to be used in scenarios where the overhead of - block management is unnecessary, such as in an embedding environment. - """ - - def __init__( - self, - **kwargs, - ) -> None: - pass - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # Always return OK for dummy purposes - return AllocStatus.OK - - def allocate(self, seq_group: SequenceGroup) -> None: - # No actual allocation logic needed - pass - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - return True - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - return [] - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - return AllocStatus.OK - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return True - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def free(self, seq: Sequence) -> None: - # No operation on free - return - - def get_block_table(self, seq: Sequence) -> List[int]: - return None # type: ignore - - def get_num_free_gpu_blocks(self) -> int: - return 1 - - def get_num_free_cpu_blocks(self) -> int: - return 1 - - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - def get_common_computed_block_ids(self, - seq_group: List[Sequence]) -> List[int]: - return [] - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return -1 - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return True - - def get_num_cached_tokens(self, seq: Sequence) -> int: - return 0 - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - return diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py deleted file mode 100644 index 92ebad778ea4b..0000000000000 --- a/vllm/core/scheduler.py +++ /dev/null @@ -1,2028 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import os -import random -import time -from collections import deque -from dataclasses import dataclass, field -from typing import Callable, Deque, Dict, Iterable, List, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.config.lora import LoRAConfig -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupBase, SequenceGroupMetadata, - SequenceGroupMetadataDelta, SequenceStage, - SequenceStatus) -from vllm.utils import Device, PyObjectCache - -logger = init_logger(__name__) - -# Test-only. If configured, decode is preempted with -# ARTIFICIAL_PREEMPTION_PROB% probability. 
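# A caveat on the flag below: bool(os.getenv(...)) is truthy for ANY
# non-empty string, so VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=0 still enables
# artificial preemption (hence the noqa). A stricter parse, sketched with
# an invented helper name:

import os


def env_flag(name: str, default: bool = False) -> bool:
    value = os.getenv(name)
    if value is None:
        return default
    # Only a small allowlist of strings counts as "enabled".
    return value.strip().lower() in ("1", "true", "yes", "on")


assert env_flag("AN_UNSET_VAR_FOR_ILLUSTRATION") is False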
-ENABLE_ARTIFICIAL_PREEMPT = bool( - os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa -ARTIFICIAL_PREEMPTION_PROB = 0.5 -ARTIFICIAL_PREEMPTION_MAX_CNT = 500 - - -class PreemptionMode(enum.Enum): - """Preemption modes. - - 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory - and swap them back in when the sequences are resumed. - 2. Recomputation: Discard the blocks of the preempted sequences and - recompute them when the sequences are resumed, treating the sequences as - new prompts. - """ - - SWAP = enum.auto() - RECOMPUTE = enum.auto() - - -@dataclass -class SchedulingBudget: - """The available slots for scheduling. - - TODO(sang): Right now, the budget is request_id-aware meaning it can ignore - budget update from the same request_id. It is because in normal scheduling - path, we update RUNNING num_seqs ahead of time, meaning it could be - updated more than once when scheduling RUNNING requests. Since this won't - happen if we only have chunked prefill scheduling, we can remove this - feature from the API when chunked prefill is enabled by default. - """ - - token_budget: int - max_num_seqs: int - _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) - _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) - # Number of cached tokens in the batch. - _num_cached_tokens: int = 0 - # Number of actual non-cached tokens in the batch. - _num_batched_tokens: int = 0 - _num_curr_seqs: int = 0 - - def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): - # We allow num_new_tokens to be 0 when the entire sequence has - # been cached. - assert num_new_tokens >= 0 - assert num_new_seqs != 0 - return (self.num_batched_tokens + num_new_tokens <= self.token_budget - and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs) - - def remaining_token_budget(self): - return self.token_budget - self.num_batched_tokens - - def add_num_batched_tokens(self, - req_id: str, - num_batched_tokens: int, - num_cached_tokens: int = 0): - if req_id in self._request_ids_num_batched_tokens: - return - assert num_cached_tokens >= 0 - assert num_batched_tokens >= 0 - - self._request_ids_num_batched_tokens.add(req_id) - self._num_batched_tokens += num_batched_tokens - self._num_cached_tokens += num_cached_tokens - - def subtract_num_batched_tokens(self, req_id: str, - num_batched_tokens: int): - if req_id in self._request_ids_num_batched_tokens: - self._request_ids_num_batched_tokens.remove(req_id) - self._num_batched_tokens -= num_batched_tokens - - def add_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - return - - self._request_ids_num_curr_seqs.add(req_id) - self._num_curr_seqs += num_curr_seqs - - def subtract_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - self._request_ids_num_curr_seqs.remove(req_id) - self._num_curr_seqs -= num_curr_seqs - - @property - def num_batched_tokens(self): - return self._num_batched_tokens - - @property - def num_curr_seqs(self): - return self._num_curr_seqs - - @property - def num_cached_tokens(self): - return self._num_cached_tokens - - -@dataclass -class ScheduledSequenceGroup: - # A sequence group that's scheduled. - seq_group: SequenceGroup - # The total chunk size (number of tokens) to process for next iteration. - # 1 for decoding. Same as prompt tokens for prefill, but if prefill is - # chunked, it can be smaller than that. 
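# token_chunk_size below is what the scheduler's budget arithmetic hands
# out per step. With chunked prefill, a prompt is consumed in
# budget-bounded chunks, and the final chunk can be smaller than the
# budget. An illustrative sketch only; `chunk_schedule` is invented:


def chunk_schedule(prompt_len: int, token_budget: int) -> list[int]:
    chunks, remaining = [], prompt_len
    while remaining > 0:
        step = min(remaining, token_budget)  # never exceed the step budget
        chunks.append(step)
        remaining -= step
    return chunks


# A 1000-token prompt under a 512-token budget prefills in two chunks.
assert chunk_schedule(1000, 512) == [512, 488]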
- token_chunk_size: int - - -@dataclass -class SchedulerOutputs: - """The scheduling decision made from a scheduler.""" - - # Scheduled sequence groups. - scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup] - # Number of prefill groups scheduled. - num_prefill_groups: int - # Total number of batched tokens. - num_batched_tokens: int - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] - # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] - # Sequence groups that are going to be ignored. - ignored_seq_groups: List[SequenceGroup] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # The number of requests in the running queue - running_queue_size: int - preempted: int - - def __post_init__(self): - # Swap in and swap out should never happen at the same time. - assert not (self.blocks_to_swap_in and self.blocks_to_swap_out) - - self.num_loras: int = len(self.lora_requests) - if self.num_loras > 0: - self._sort_by_lora_ids() - - def is_empty(self) -> bool: - # NOTE: We do not consider the ignored sequence groups. - return (not self.scheduled_seq_groups and not self.blocks_to_swap_in - and not self.blocks_to_swap_out and not self.blocks_to_copy) - - def _sort_by_lora_ids(self): - assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups) - - def key_fn(group: ScheduledSequenceGroup): - key = (group.seq_group.lora_int_id, group.seq_group.request_id) - if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups): - # Sort sequence groups so that all prefills come before all - # decodes as required by chunked prefill. - return (not group.seq_group.is_prefill(), *key) - return key - - self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, - key=key_fn) - - @property - def lora_requests(self) -> Set[LoRARequest]: - return { - g.seq_group.lora_request - for g in self.scheduled_seq_groups - if g.seq_group.lora_request is not None - } - - -@dataclass -class SchedulerRunningOutputs: - """The requests that are scheduled from a running queue. - - Could contain prefill (prefill that's chunked) or decodes. If there's not - enough memory, it can be preempted (for recompute) or swapped out. - """ - - # Selected sequences that are running and in a decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are running and in a prefill phase. - # I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The preempted sequences. - preempted: List[SequenceGroup] - # Sequences that are swapped out. - swapped_out: List[SequenceGroup] - # The blocks to swap out. - blocks_to_swap_out: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - - # Optimization for fast-access to seq_group lists - decode_seq_groups_list: List[SequenceGroup] - prefill_seq_groups_list: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerRunningOutputs": - return SchedulerRunningOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - decode_seq_groups_list=[], - prefill_seq_groups_list=[], - ) - - -@dataclass -class SchedulerSwappedInOutputs: - """The requests that are scheduled from a swap queue. 
- - Could contain prefill (prefill that's chunked) or decodes. - """ - - # Selected sequences that are going to be swapped in and is in a - # decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are going to be swapped in and in a prefill - # phase. I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The blocks to swap in. - blocks_to_swap_in: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # Infeasible sequence groups. - infeasible_seq_groups: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerSwappedInOutputs": - return SchedulerSwappedInOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - blocks_to_swap_in=[], - blocks_to_copy=[], - num_lookahead_slots=0, - infeasible_seq_groups=[], - ) - - -@dataclass -class SchedulerPrefillOutputs: - """The requests that are scheduled from a waiting queue. - - Could contain a fresh prefill requests or preempted requests that need - to be recomputed from scratch. - """ - - # Selected sequences for prefill. - seq_groups: List[ScheduledSequenceGroup] - # Ignored sequence groups. - ignored_seq_groups: List[SequenceGroup] - num_lookahead_slots: int - - @classmethod - def create_empty(cls) -> "SchedulerPrefillOutputs": - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=0, - ) - - -def seq_group_metadata_builder(): - return SequenceGroupMetadata(request_id="", - is_prompt=False, - seq_data={}, - sampling_params=None, - block_tables={}) - - -def scheduler_running_outputs_builder(): - return SchedulerRunningOutputs(decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - prefill_seq_groups_list=[], - decode_seq_groups_list=[]) - - -def scheduled_seq_group_builder(): - return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup), - token_chunk_size=0) - # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0) - - -@dataclass -class PartialPrefillMetadata: - """Holds information about the partial prefills that are currently running - during a single iteration of the Scheduler. - When chunked prefill is enabled, we allow a certain number of seqs to be - partially prefilled during each iteration. Having multiple partial prefills - in flight allows us to minimize TTFT and avoid decode starvation in cases - where a single sequence group with a very large prompt blocks the queue for - too many iterations. - The number of long prefill requests is limited so that smaller - requests may jump the queue in front of them and get to the decode - phase faster. 
- """ - - # A minimum bound on the total number of prefills to be scheduled during - # this iteration - schedulable_prefills: int - - # The number of long prefill requests currently running - long_prefills: int - - scheduler_config: SchedulerConfig - - def can_schedule(self, seq_group: SequenceGroup) -> bool: - """When concurrent partial prefills are enabled, - we limit the number of long requests and only accept - shorter requests from the queue while running them - concurrently""" - return not (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold - and self.long_prefills - >= self.scheduler_config.max_long_partial_prefills - and self.scheduler_config.max_num_partial_prefills > 1) - - def maybe_increment_partial_prefills(self, - seq_group: SequenceGroup) -> None: - # When a new prefill is scheduled, we need to know if it is a - # long request - if (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold): - self.long_prefills += 1 - - @classmethod - def from_queues( - cls, - running: Deque[SequenceGroup], - waiting: Deque[SequenceGroup], - scheduler_config: SchedulerConfig, - ) -> "PartialPrefillMetadata": - """Create a PartialPrefillMetadata object from the current state of - the scheduler's queues. - This accounts for the currently running prefill requests, and peeks into - the waiting queue to see if there are more prefills to potentially be - scheduled during this iteration.""" - prefills = 0 - long_prefills = 0 - - waiting_long_prefills = 0 - - for sg in running: - if sg.first_seq.data.stage == SequenceStage.PREFILL: - prefills += 1 - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - long_prefills += 1 - - for sg in waiting: - # Don't bother looping through the rest of the queue if we know - # there are already at - # least max_partial_prefills requests to fill - if prefills >= scheduler_config.max_num_partial_prefills: - break - - # Don't count long requests from the waiting queue if we aren't - # going to schedule them anyway - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - if (long_prefills + waiting_long_prefills - >= scheduler_config.max_long_partial_prefills): - continue - waiting_long_prefills += 1 - prefills += 1 - - # NB: long_prefills and waiting_long_prefills are tracked separately. - # We don't account for the waiting requests here because we need to use - # this metadata to track how many have actually been scheduled. - return PartialPrefillMetadata( - schedulable_prefills=min( - prefills, scheduler_config.max_num_partial_prefills), - long_prefills=long_prefills, - scheduler_config=scheduler_config, - ) - - -class Scheduler: - - def __init__( - self, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - lora_config: Optional[LoRAConfig], - pipeline_parallel_size: int = 1, - output_proc_callback: Optional[Callable] = None, - ) -> None: - self.scheduler_config = scheduler_config - self.cache_config = cache_config - # Note for LoRA scheduling: the current policy is extremely - # simple and NOT fair. It can lead to starvation of some - # LoRAs. This should be improved in the future. 
- self.lora_config = lora_config - - version = "selfattn" - if (self.scheduler_config.runner_type == "pooling" - or self.cache_config.is_attention_free): - version = "placeholder" - - BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( - version) - - num_gpu_blocks = cache_config.num_gpu_blocks - if num_gpu_blocks: - num_gpu_blocks //= pipeline_parallel_size - - num_cpu_blocks = cache_config.num_cpu_blocks - if num_cpu_blocks: - num_cpu_blocks //= pipeline_parallel_size - - # Create the block space manager. - self.block_manager = BlockSpaceManagerImpl( - block_size=self.cache_config.block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - sliding_window=self.cache_config.sliding_window, - enable_caching=self.cache_config.enable_prefix_caching, - ) - - # Sequence groups in the WAITING state. - # Contain new prefill or preempted requests. - self.waiting: Deque[SequenceGroup] = deque() - # Sequence groups in the RUNNING state. - # Contain decode requests. - self.running: Deque[SequenceGroup] = deque() - # Sequence groups in the SWAPPED state. - # Contain decode requests that are swapped out. - self.swapped: Deque[SequenceGroup] = deque() - # Sequence groups finished requests ids since last step iteration. - # It lets the model know that any state associated with these requests - # can and must be released after the current step. - # This is used to evict the finished requests from the Mamba cache. - self._finished_requests_ids: List[str] = list() - # Time at previous scheduling step - self.prev_time = 0.0 - # Did we schedule a prompt at previous step? - self.prev_prompt = False - # Latency of the last prompt step - self.last_prompt_latency = 0.0 - # preemption mode, RECOMPUTE or SWAP - self.user_specified_preemption_mode = scheduler_config.preemption_mode - - # The following field is test-only. It is used to inject artificial - # preemption. - self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT - self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT - if self.enable_artificial_preemption - else 0) - self.num_cumulative_preemption: int = 0 - - # Used to cache python objects - self._seq_group_metadata_cache: List[PyObjectCache] = [] - self._scheduler_running_outputs_cache: List[PyObjectCache] = [] - self._scheduled_seq_group_cache: List[PyObjectCache] = [] - - # For async output processing, we need to swap cache buffers between - # iterations. I.e. since the output processing is lagged one step, - # we cannot reuse the cached objects immediately when the schedule() - # is called again, but only when schedule() is called the second time. - self.output_proc_callback = output_proc_callback - self.use_async_output_proc = self.output_proc_callback is not None - self.num_cache_iters = 2 if self.use_async_output_proc else 1 - - self.cache_id = 0 - for i in range(self.num_cache_iters): - self._seq_group_metadata_cache.append( - PyObjectCache(seq_group_metadata_builder)) - self._scheduler_running_outputs_cache.append( - PyObjectCache(scheduler_running_outputs_builder)) - self._scheduled_seq_group_cache.append( - PyObjectCache(scheduled_seq_group_builder)) - - # For async postprocessor, the extra decode run cannot be done - # when the request reaches max_model_len. 
In this case, the request - # will be stopped during the schedule() call and added to this stop list - # for processing and deallocation by free_finished_seq_groups() - self._async_stopped: List[SequenceGroup] = [] - - # List with the chunk sizes to hand out to each sequence depending - # on how many partial prefills are running. This is slightly faster than - # running an integer division every time a prefill is scheduled. - # This splits the budget evenly among all prefills. - self.partial_prefill_budget_lookup_list = [0] * ( - self.scheduler_config.max_num_partial_prefills + 1) - self.partial_prefill_budget_lookup_list[0] = ( - scheduler_config.max_num_batched_tokens) - for i in range(1, self.scheduler_config.max_num_partial_prefills + 1): - self.partial_prefill_budget_lookup_list[i] = ( - scheduler_config.max_num_batched_tokens // i) - - @property - def next_cache_id(self): - return (self.cache_id + 1) % self.num_cache_iters - - @property - def lora_enabled(self) -> bool: - return bool(self.lora_config) - - @property - def num_decoding_tokens_per_seq(self) -> int: - """The number of new tokens generated per decode step.""" - return 1 - - def add_seq_group(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the waiting queue. - self.waiting.append(seq_group) - - def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the running queue. - # Only for testing purposes. - self.running.append(seq_group) - - def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the swapped queue. - # Only for testing purposes. - self.swapped.append(seq_group) - - def abort_seq_group( - self, - request_id: Union[str, Iterable[str]], - seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None, - ) -> None: - """Aborts a sequence group with the given ID. - - Check if the sequence group with the given ID - is present in any of the state queues. - If present, remove the sequence group from that state queue. - Also, if any of the sequences in the sequence group is not finished, - free the sequence with status `FINISHED_ABORTED`. - Otherwise, do nothing. - - Args: - request_id: The ID(s) of the sequence group to abort. - seq_id_to_seq_group: helper for groups with n>1 - """ - if isinstance(request_id, str): - request_id = (request_id, ) - request_ids = set(request_id) - seq_id_to_seq_group = seq_id_to_seq_group or {} - for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: List[SequenceGroup] = [] - for seq_group in state_queue: - # When n>1, seq_group.request_id looks like - # foo_parallel_sample_0, while request_ids is just foo, and we - # should resolve it as real_request_id to match. - if seq_group.request_id in seq_id_to_seq_group: - real_request_id = seq_id_to_seq_group[ - seq_group.request_id].group_id - else: - real_request_id = seq_group.request_id - if real_request_id in request_ids: - # Append the aborted group to the pending list. - aborted_groups.append(seq_group) - # We can't remove real_request_id from request_ids here, - # because there may be other seq groups sharing the same - # real_request_id - for aborted_group in aborted_groups: - # Remove the sequence group from the state queue. - state_queue.remove(aborted_group) - # Remove the aborted request from the Mamba cache.
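The partial_prefill_budget_lookup_list built in __init__ above trades an integer division per scheduling decision for one precomputed table. The same construction as a standalone sketch:

def build_prefill_budget_table(max_num_batched_tokens: int,
                               max_num_partial_prefills: int) -> list[int]:
    # table[i] = per-prefill token budget when i partial prefills run
    table = [0] * (max_num_partial_prefills + 1)
    table[0] = max_num_batched_tokens  # no partial prefills: full budget
    for i in range(1, max_num_partial_prefills + 1):
        table[i] = max_num_batched_tokens // i  # split evenly
    return table


# e.g. build_prefill_budget_table(2048, 4) -> [2048, 2048, 1024, 682, 512]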
- self._finished_requests_ids.append(aborted_group.request_id) - for seq in aborted_group.get_seqs(): - if seq.is_finished(): - continue - seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) - if aborted_group.request_id in seq_id_to_seq_group: - del seq_id_to_seq_group[aborted_group.request_id] - - self._free_seq_group_cross_attn_blocks(aborted_group) - - def _free_seq_group_cross_attn_blocks( - self, - seq_group: SequenceGroup, - ) -> None: - """ - Free a sequence group from a cross-attention block table. - Has no effect on decoder-only models. - """ - if seq_group.is_encoder_decoder(): - self.block_manager.free_cross(seq_group) - - def has_unfinished_seqs(self) -> bool: - return (len(self.waiting) != 0 or len(self.running) != 0 - or len(self.swapped) != 0) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_manager.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_manager.reset_prefix_cache(device) - - def get_num_unfinished_seq_groups(self) -> int: - return len(self.waiting) + len(self.running) + len(self.swapped) - - def get_and_reset_finished_requests_ids(self) -> List[str]: - """Flushes the list of request ids of previously finished seq_groups.""" - finished_requests_ids = self._finished_requests_ids - self._finished_requests_ids = list() - return finished_requests_ids - - def _schedule_running( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerRunningOutputs: - """Schedule sequence groups that are running. - - Running queue should include decode and chunked prefill requests. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any decodes are preempted. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any decodes are preempted. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerRunningOutputs. - """ - ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[ - self.cache_id].get_object() - ret.blocks_to_swap_out.clear() - ret.blocks_to_copy.clear() - ret.decode_seq_groups.clear() - ret.prefill_seq_groups.clear() - ret.preempted.clear() - ret.swapped_out.clear() - - ret.num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking) - - ret.decode_seq_groups_list.clear() - ret.prefill_seq_groups_list.clear() - - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out - blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy - - decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups - prefill_seq_groups: List[ - ScheduledSequenceGroup] = ret.prefill_seq_groups - preempted: List[SequenceGroup] = ret.preempted - swapped_out: List[SequenceGroup] = ret.swapped_out - - running_queue = self.running - assert len(self._async_stopped) == 0 - while running_queue: - seq_group = running_queue[0] - # We discard the cached tokens info here because we don't need it - # for running sequence: - # 1. 
If a sequence is running with chunked prefill, the cached - # tokens info was already used for the first prefill. - # 2. If a sequence is running with non-chunked prefill, then - # there it's a decoding sequence, and the cached tokens info is - # irrelevant. - num_uncached_new_tokens, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.RUNNING, - enable_chunking, - budget, - partial_prefill_metadata, - ) - - num_running_tokens = num_uncached_new_tokens - if num_running_tokens == 0: - # No budget => Stop - break - - running_queue.popleft() - - # With async postprocessor, an extra decode run is done - # to process the final tokens. The check below avoids this extra - # decode run when the model max len is reached, in order to avoid - # a memory overflow. - if (self.use_async_output_proc and seq_group.seqs[0].get_len() - > self.scheduler_config.max_model_len): - self._async_stopped.append(seq_group) - continue - - # NOTE(woosuk): Preemption happens only when there is no available - # slot to keep all the sequence groups in the RUNNING state. - while not self._can_append_slots(seq_group, enable_chunking): - budget.subtract_num_batched_tokens(seq_group.request_id, - num_running_tokens) - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(seq_group.request_id, - num_running_seqs) - - if (curr_loras is not None and seq_group.lora_int_id > 0 - and seq_group.lora_int_id in curr_loras): - curr_loras.remove(seq_group.lora_int_id) - - # Determine victim sequence - cont_loop = True - if running_queue: - # Preempt the lowest-priority sequence group. - victim_seq_group = running_queue.pop() - else: - # No other sequence group can be preempted. - # Preempt the current sequence group. - # Note: This is also where we stop this loop - # (since there is nothing else to preempt) - victim_seq_group = seq_group - cont_loop = False - - # With async postprocessor, before preempting a sequence - # we need to ensure it has no pending async postprocessor - do_preempt = True - if self.use_async_output_proc: - assert self.output_proc_callback is not None - self.output_proc_callback( - request_id=victim_seq_group.request_id) - - # It may be that the async pending "victim_seq_group" - # becomes finished, in which case we simply free it. - if victim_seq_group.is_finished(): - self._free_finished_seq_group(victim_seq_group) - do_preempt = False - - # Do preemption - if do_preempt: - preempted_mode = self._preempt(victim_seq_group, - blocks_to_swap_out) - if preempted_mode == PreemptionMode.RECOMPUTE: - preempted.append(victim_seq_group) - else: - swapped_out.append(victim_seq_group) - - if not cont_loop: - break - else: - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - is_prefill = seq_group.is_prefill() - - scheduled_seq_group: ScheduledSequenceGroup = ( - self._scheduled_seq_group_cache[ - self.cache_id].get_object()) - scheduled_seq_group.seq_group = seq_group - if is_prefill: - scheduled_seq_group.token_chunk_size = num_running_tokens - prefill_seq_groups.append(scheduled_seq_group) - ret.prefill_seq_groups_list.append(seq_group) - else: - scheduled_seq_group.token_chunk_size = 1 - decode_seq_groups.append(scheduled_seq_group) - ret.decode_seq_groups_list.append(seq_group) - - budget.add_num_batched_tokens(seq_group.request_id, - num_running_tokens) - # OPTIMIZATION: Note that get_max_num_running_seqs is - # expensive. 
For the default scheduling case where - # enable_chunking is False, num_seqs is updated before running - # this method, so we don't have to update it again here. - if enable_chunking: - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.add_num_seqs(seq_group.request_id, num_running_seqs) - if curr_loras is not None and seq_group.lora_int_id > 0: - curr_loras.add(seq_group.lora_int_id) - - self._scheduler_running_outputs_cache[self.next_cache_id].reset() - self._scheduled_seq_group_cache[self.next_cache_id].reset() - - return ret - - def _schedule_swapped( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - ) -> SchedulerSwappedInOutputs: - """Schedule sequence groups that are swapped out. - - It schedules swapped requests as long as they fit `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are swapped in. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are swapped in. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` does not have enough capacity to - schedule all tokens. - - Returns: - SchedulerSwappedInOutputs. - """ - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_in: List[Tuple[int, int]] = [] - blocks_to_copy: List[Tuple[int, int]] = [] - decode_seq_groups: List[ScheduledSequenceGroup] = [] - prefill_seq_groups: List[ScheduledSequenceGroup] = [] - infeasible_seq_groups: List[SequenceGroup] = [] - - swapped_queue = self.swapped - - leftover_swapped: Deque[SequenceGroup] = deque() - while swapped_queue: - seq_group = swapped_queue[0] - - # If the sequence group cannot be swapped in, stop. - is_prefill = seq_group.is_prefill() - alloc_status = self.block_manager.can_swap_in( - seq_group, - self._get_num_lookahead_slots(is_prefill, enable_chunking)) - if alloc_status == AllocStatus.LATER: - break - elif alloc_status == AllocStatus.NEVER: - logger.warning( - "Failing the request %s because there's not enough kv " - "cache blocks to run the entire sequence.", - seq_group.request_id, - ) - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_IGNORED - infeasible_seq_groups.append(seq_group) - swapped_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (lora_int_id > 0 and (lora_int_id not in curr_loras) - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have space for another LoRA, so - # we ignore this request for now. - leftover_swapped.appendleft(seq_group) - swapped_queue.popleft() - continue - - # The total number of sequences in the RUNNING state should not - # exceed the maximum number of sequences.
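The budget checks that follow enforce two caps at once: batched tokens and concurrent sequences. A minimal hypothetical sketch of that accounting (the real SchedulingBudget also de-duplicates updates by request_id and tracks cached tokens separately):

class MiniBudget:  # illustrative, not the real SchedulingBudget API
    def __init__(self, token_budget: int, max_num_seqs: int) -> None:
        self.token_budget = token_budget
        self.max_num_seqs = max_num_seqs
        self.num_batched_tokens = 0
        self.num_curr_seqs = 0

    def can_schedule(self, num_new_tokens: int, num_new_seqs: int) -> bool:
        return (self.num_batched_tokens + num_new_tokens <= self.token_budget
                and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)

    def add(self, num_new_tokens: int, num_new_seqs: int) -> None:
        self.num_batched_tokens += num_new_tokens
        self.num_curr_seqs += num_new_seqs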
- num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.SWAPPED, enable_chunking, - budget)) - - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.SWAPPED) - break - - if lora_int_id > 0 and curr_loras is not None: - curr_loras.add(lora_int_id) - swapped_queue.popleft() - self._swap_in(seq_group, blocks_to_swap_in) - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - if is_prefill: - prefill_seq_groups.append( - ScheduledSequenceGroup( - seq_group, - token_chunk_size=num_new_tokens_uncached + - num_new_tokens_cached, - )) - else: - decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=1)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - swapped_queue.extendleft(leftover_swapped) - - return SchedulerSwappedInOutputs( - decode_seq_groups=decode_seq_groups, - prefill_seq_groups=prefill_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_copy=blocks_to_copy, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking), - infeasible_seq_groups=infeasible_seq_groups, - ) - - def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if self.scheduler_config.chunked_prefill_enabled: - prompt_limit = self.scheduler_config.max_model_len - else: - prompt_limit = min( - self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens, - ) - - # Model is fine tuned with long context. Return the fine tuned max_len. - if seq_group.lora_request and seq_group.lora_request.long_lora_max_len: - assert prompt_limit <= seq_group.lora_request.long_lora_max_len - return seq_group.lora_request.long_lora_max_len - else: - return prompt_limit - - def _get_priority(self, - seq_group: SequenceGroup) -> Tuple[Optional[int], float]: - """Get the priority of the sequence group. - Highest preference to user-defined priority, followed by arrival time. - Args: - seq_group: The sequence group input. - Returns: - The priority of the sequence group. - """ - return seq_group.priority, seq_group.arrival_time - - def _schedule_priority_preemption( - self, - budget: SchedulingBudget, - ) -> int: - """Sorts waiting and running queue. Also, force preempt requests - from the running queue if their priority is lower. - Priority-based preemption is used with the priority policy. - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - Returns: - A count of priority-based preemptions. 
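The key returned by _get_priority() above orders requests by user priority first and arrival time second, so smaller keys are served first and ties stay FCFS. A small illustration with hypothetical (request_id, priority, arrival_time) tuples:

from collections import deque

requests = [("a", 2, 10.0), ("b", 1, 12.0), ("c", 1, 11.0)]
ordered = deque(sorted(requests, key=lambda r: (r[1], r[2])))
# -> deque([("c", 1, 11.0), ("b", 1, 12.0), ("a", 2, 10.0)])
# "c" and "b" share priority 1, so the earlier arrival wins;
# "a" with priority 2 goes last.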
- """ - - waiting_queue = self.waiting - - running_queue = deque(sorted(self.running, key=self._get_priority)) - - blocks_to_swap_out: List[Tuple[int, int]] = [] - force_preemption_count = 0 - - if waiting_queue: - seq_group = waiting_queue.popleft() - num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.WAITING, False, budget) - - # Only preempt if priority inversion exists - while running_queue and self._get_priority( - running_queue[-1]) > self._get_priority(seq_group): - # Only preempt if waiting sequence cannot be allocated - can_allocate = self.block_manager.can_allocate(seq_group) - if (num_new_tokens_uncached > 0 - and can_allocate == AllocStatus.OK - and budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - )): - break - - # Adjust budget to remove the victim sequence group - vseq_group = running_queue.pop() - num_running_tokens_uncached, _ = ( - self._get_num_new_uncached_and_cached_tokens( - vseq_group, SequenceStatus.RUNNING, False, budget)) - budget.subtract_num_batched_tokens( - vseq_group.request_id, num_running_tokens_uncached) - num_running_seqs = vseq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(vseq_group.request_id, - num_running_seqs) - - # Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out) - waiting_queue.appendleft(vseq_group) - force_preemption_count += 1 - # Put the sequence back into the waiting queue - waiting_queue.appendleft(seq_group) - - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - - waiting_queue = deque(sorted(waiting_queue, key=self._get_priority)) - - self.waiting = waiting_queue - self.running = running_queue - return force_preemption_count - - def _schedule_prefills( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerPrefillOutputs: - """Schedule sequence groups that are in prefill stage. - - Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE - as a new prefill (that starts from beginning -> most recently generated - tokens). - - It schedules waiting requests as long as it fits `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are scheduled. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerPrefillOutputs. 
- """ - if budget.remaining_token_budget() == 0: - # Do nothing: Can't add any more prefill anyway - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - ignored_seq_groups: List[SequenceGroup] = [] - seq_groups: List[ScheduledSequenceGroup] = [] - using_prompt_embeds: bool = False - - waiting_queue = self.waiting - - leftover_waiting_sequences: Deque[SequenceGroup] = deque() - while self._passed_delay(time.time()) and waiting_queue: - seq_group = waiting_queue[0] - - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert len(waiting_seqs) == 1, ( - "Waiting sequence group should have only one prompt " - "sequence.") - if (partial_prefill_metadata is not None - and not partial_prefill_metadata.can_schedule(seq_group)): - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.WAITING, - enable_chunking, - budget, - partial_prefill_metadata=partial_prefill_metadata, - )) - num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached - - if not enable_chunking: - num_prompt_tokens = waiting_seqs[0].get_len() - assert num_new_tokens == num_prompt_tokens - - prompt_limit = self._get_prompt_limit(seq_group) - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - num_lookahead_slots: int = 0 - - # If the sequence group cannot be allocated, stop. - can_allocate = self.block_manager.can_allocate( - seq_group, num_lookahead_slots=num_lookahead_slots) - if can_allocate == AllocStatus.LATER: - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - elif can_allocate == AllocStatus.NEVER: - logger.warning( - "Input prompt (%d tokens) + lookahead slots (%d) is " - "too long and exceeds the capacity of block_manager", - num_new_tokens, - num_lookahead_slots, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - # We cannot mix sequence groups that use prompt embeds and - # those that do not. - if len(seq_groups) == 0: - using_prompt_embeds = seq_group.uses_prompt_embeds() - if using_prompt_embeds != seq_group.uses_prompt_embeds(): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (self.lora_enabled and lora_int_id > 0 - and lora_int_id not in curr_loras - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have a space for another LoRA, so - # we ignore this request for now. 
- self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - if (budget.num_batched_tokens - >= self.scheduler_config.max_num_batched_tokens): - # We've reached the budget limit - since there might be - # continuous prefills in the running queue, we should break - # to avoid scheduling any new prefills. - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - num_new_seqs = seq_group.get_max_num_running_seqs() - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - # Can schedule this request. - if curr_loras is not None and lora_int_id > 0: - curr_loras.add(lora_int_id) - waiting_queue.popleft() - self._allocate_and_set_running(seq_group) - - if partial_prefill_metadata is not None: - partial_prefill_metadata.maybe_increment_partial_prefills( - seq_group) - - seq_groups.append( - ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=num_new_tokens)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - # Queue requests that couldn't be scheduled. - waiting_queue.extendleft(leftover_waiting_sequences) - if len(seq_groups) > 0: - self.prev_prompt = True - - return SchedulerPrefillOutputs( - seq_groups=seq_groups, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - - def _schedule_default(self) -> SchedulerOutputs: - """Schedule queued requests. - - The current policy is designed to optimize throughput. First, - it batches as many prefill requests as possible. Then it schedules - decodes. If there is pressure on GPU memory, decode requests can - be swapped or preempted. - """ - # Include running requests in the budget. - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - # Make sure we include num running seqs before scheduling prefill, - # so that we don't schedule beyond max_num_seqs for prefill. - for seq_group in self.running: - budget.add_num_seqs(seq_group.request_id, - seq_group.get_max_num_running_seqs()) - curr_loras = (set( - seq_group.lora_int_id for seq_group in self.running - if seq_group.lora_int_id > 0) if self.lora_enabled else None) - - prefills = SchedulerPrefillOutputs.create_empty() - running_scheduled = SchedulerRunningOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # If any requests are swapped, prioritize swapped requests. - if not self.swapped: - prefills = self._schedule_prefills(budget, - curr_loras, - enable_chunking=False) - - if len(prefills.seq_groups - ) == 0 and self.scheduler_config.policy == "priority": - self._schedule_priority_preemption(budget) - - # Don't schedule decodes if prefills are scheduled. - # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running - # only contains decode requests, not chunked prefills. - if len(prefills.seq_groups) == 0: - running_scheduled = self._schedule_running(budget, - curr_loras, - enable_chunking=False) - - # If any sequence group is preempted, do not swap in any sequence - # group.
because it means there's no slot for new running requests. - if (len(running_scheduled.preempted) + - len(running_scheduled.swapped_out) == 0): - swapped_in = \ - self._schedule_swapped(budget, curr_loras) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - # Update new running requests. - if len(prefills.seq_groups) > 0: - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - self.running.extend(running_scheduled.decode_seq_groups_list) - - if len(swapped_in.decode_seq_groups) > 0: - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - preempted = len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) - - # There should be no prefill from the running queue because this - # policy doesn't allow chunked prefills. - assert len(running_scheduled.prefill_seq_groups) == 0 - assert len(swapped_in.prefill_seq_groups) == 0 - - # Merge lists - num_prefill_groups = len(prefills.seq_groups) - ignored_seq_groups_for_embeds = list[SequenceGroup]() - if num_prefill_groups > 0: - scheduled_seq_groups = prefills.seq_groups - scheduled_seq_groups.extend(running_scheduled.decode_seq_groups) - ignored_seq_groups_for_embeds.clear() - else: - scheduled_seq_groups = running_scheduled.decode_seq_groups - if len(scheduled_seq_groups) > 0: - using_prompt_embeds = scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() - ignored_seq_groups_for_embeds.clear() - indices_ignored = list[int]() - for i, schedule_seq_group in enumerate(scheduled_seq_groups): - if using_prompt_embeds !=\ - schedule_seq_group.seq_group.uses_prompt_embeds(): - ignored_seq_groups_for_embeds.append( - schedule_seq_group.seq_group) - indices_ignored.append(i) - if len(ignored_seq_groups_for_embeds) > 0: - scheduled_seq_groups = [ - group for i, group in enumerate(scheduled_seq_groups) - if i not in indices_ignored - ] - else: - ignored_seq_groups_for_embeds.clear() - - scheduled_seq_groups.extend(swapped_in.decode_seq_groups) - - blocks_to_copy = running_scheduled.blocks_to_copy - blocks_to_copy.extend(swapped_in.blocks_to_copy) - - ignored_seq_groups = prefills.ignored_seq_groups - ignored_seq_groups.extend(ignored_seq_groups_for_embeds) - ignored_seq_groups.extend(swapped_in.infeasible_seq_groups) - - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, - running_queue_size=len(self.running), - preempted=preempted, - ) - - def _schedule_chunked_prefill(self) -> SchedulerOutputs: - """Schedule queued requests. - - Chunked prefill allows prefill requests to be chunked and batched - together with decode requests. This policy: 1. schedules as many - decode requests as possible; 2. schedules chunked prefill requests - that are not finished; 3. schedules swapped requests; 4. schedules - new prefill requests.
- - The policy can sustain high GPU utilization because it can put - prefill and decode requests in the same batch, while it improves - inter-token latency because decode requests don't need to be blocked - by prefill requests. - """ - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - curr_loras: Set[int] = set() - - prefills = SchedulerPrefillOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # Create partial prefill metadata - partial_prefill_metadata = PartialPrefillMetadata.from_queues( - running=self.running, - waiting=self.waiting, - scheduler_config=self.scheduler_config, - ) - - # Decodes should always be scheduled first, FCFS. - running_scheduled = self._schedule_running( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - # Schedule swapped out requests. - # If preemption happens, it means we don't have space for swap-in. - if len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) == 0: - swapped_in = self._schedule_swapped(budget, curr_loras) - - prefills = self._schedule_prefills( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - - # Update new running requests. - # By default, the vLLM scheduler prioritizes prefills. - # Once chunked prefill is enabled, - # the policy is changed to prioritize decode requests. - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - self.running.extend( - [s.seq_group for s in swapped_in.prefill_seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.decode_seq_groups]) - # Because multiple prefills may be running concurrently, we need to - # make sure that prefills which are scheduled to finish are listed - # before those that won't. This is so that on the next scheduling - # iteration when they have transitioned to the decode stage, they are - # properly prioritized over sequences that are still in the prefill - # stage. - self.running.extend( - self._order_finishing_prefills_first( - running_scheduled.prefill_seq_groups)) - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - # Put prefills first due to the attention backend ordering assumption.
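A worked example of the decode-first budgeting above, with hypothetical numbers and the prefix-caching block alignment ignored: a 512-token budget, 100 running decodes, and two waiting prompts of 600 and 300 tokens.

TOKEN_BUDGET = 512                   # hypothetical max_num_batched_tokens
remaining = TOKEN_BUDGET - 100       # decodes go first, one token each
slot_budget = TOKEN_BUDGET // 2      # lookup-table entry for 2 prefills
first_chunk = min(600, remaining, slot_budget)   # -> 256
remaining -= first_chunk                         # -> 156
second_chunk = min(300, remaining, slot_budget)  # -> 156

Both prompts are only partially prefilled this step (344 and 144 tokens remain) and continue as chunked prefills on later iterations, while the decodes were never blocked.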
- scheduled_seq_groups = (prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups) - num_prefill_groups = (len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)) - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=running_scheduled.blocks_to_copy + - swapped_in.blocks_to_copy, - ignored_seq_groups=prefills.ignored_seq_groups + - swapped_in.infeasible_seq_groups, - num_lookahead_slots=0, - running_queue_size=len(self.running), - preempted=(len(running_scheduled.preempted) + - len(running_scheduled.swapped_out)), - ) - - def _order_finishing_prefills_first( - self, scheduled_prefill_seqs: List[ScheduledSequenceGroup] - ) -> List[SequenceGroup]: - """Returns a list of prefilling SequenceGroups where sequences that are - scheduled to finish prefilling are listed first""" - finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size - ] - not_finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size - ] - return finishing + not_finishing - - def _schedule(self) -> SchedulerOutputs: - """Schedule queued requests.""" - if self.scheduler_config.chunked_prefill_enabled: - return self._schedule_chunked_prefill() - else: - return self._schedule_default() - - def _can_append_slots(self, seq_group: SequenceGroup, - enable_chunking: bool) -> bool: - """Determine whether or not we have enough space in the KV cache to - continue generation of the sequence group. - """ - # It is True only for testing case to trigger artificial preemption. - if (self.enable_artificial_preemption - and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB - and self.artificial_preempt_cnt > 0): - self.artificial_preempt_cnt -= 1 - return False - - is_prefill = seq_group.is_prefill() - num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - return self.block_manager.can_append_slots( - seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) - - def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: - # async_output_proc is allowed only when we have a single sequence - # in the sequence group - no_single_seq = seq_group.sampling_params is None or ( - seq_group.sampling_params.n == 1) - return no_single_seq - - def schedule( - self - ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]: - # Schedule sequence groups. - # This function call changes the internal states of the scheduler - # such as self.running, self.swapped, and self.waiting. - scheduler_start_time = time.perf_counter() - - scheduler_outputs: SchedulerOutputs = self._schedule() - now = time.time() - - if not self.cache_config.enable_prefix_caching: - common_computed_block_nums = [] - - allow_async_output_proc: bool = self.use_async_output_proc - - # Create input data structures. 
- seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for i, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - token_chunk_size = scheduled_seq_group.token_chunk_size - seq_group.maybe_set_first_scheduled_time(now) - - seq_group_metadata = self._seq_group_metadata_cache[ - self.cache_id].get_object() - seq_group_metadata.seq_data.clear() - seq_group_metadata.block_tables.clear() - - # seq_id -> SequenceData - seq_data: Dict[int, SequenceData] = {} - # seq_id -> physical block numbers - block_tables: Dict[int, List[int]] = {} - - if seq_group.is_encoder_decoder(): - # Encoder associated with SequenceGroup - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - encoder_seq_data = encoder_seq.data - # Block table for cross-attention - # Also managed at SequenceGroup level - cross_block_table = self.block_manager.get_cross_block_table( - seq_group) - else: - encoder_seq_data = None - cross_block_table = None - - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq_id = seq.seq_id - seq_data[seq_id] = seq.data - block_tables[seq_id] = self.block_manager.get_block_table(seq) - self.block_manager.access_all_blocks_in_seq(seq, now) - - if self.cache_config.enable_prefix_caching: - common_computed_block_nums = ( - self.block_manager.get_common_computed_block_ids( - seq_group.get_seqs(status=SequenceStatus.RUNNING))) - - do_sample = True - is_prompt = seq_group.is_prefill() - # We should send the metadata to workers when the first prefill - # is sent. Subsequent requests could be chunked prefill or decode. - is_first_prefill = False - if is_prompt: - seqs = seq_group.get_seqs() - # Prefill has only 1 sequence. - assert len(seqs) == 1 - num_computed_tokens = seqs[0].data.get_num_computed_tokens() - is_first_prefill = num_computed_tokens == 0 - # In the next iteration, all prompt tokens are not computed. - # It means the prefill is chunked, and we don't need sampling. - # NOTE: We use get_len instead of get_prompt_len because when - # a sequence is preempted, prefill includes previous generated - # output tokens. - if (token_chunk_size + num_computed_tokens - < seqs[0].data.get_len()): - do_sample = False - - # It assumes the scheduled_seq_groups is ordered by - # prefill < decoding. - if is_first_prefill or not self.scheduler_config.send_delta_data: - seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group.request_id, - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=seq_group.sampling_params, - block_tables=block_tables, - do_sample=do_sample, - pooling_params=seq_group.pooling_params, - token_chunk_size=token_chunk_size, - lora_request=seq_group.lora_request, - computed_block_nums=common_computed_block_nums, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - state=seq_group.state, - # `multi_modal_data` will only be present for the 1st comm - # between engine and worker. - # the subsequent comms can still use delta, but - # `multi_modal_data` will be None. - multi_modal_data=(seq_group.multi_modal_data - if scheduler_outputs.num_prefill_groups - > 0 else None), - multi_modal_placeholders=( - seq_group.multi_modal_placeholders - if scheduler_outputs.num_prefill_groups > 0 else None), - ) - else: - # When SPMD mode is enabled, we only send delta data except for - # the first request to reduce serialization cost. 
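The do_sample rule above, in isolation: a chunked prefill step samples only when the scheduled chunk reaches the end of the sequence; intermediate chunks produce no output token.

def should_sample(num_computed: int, chunk_size: int, seq_len: int) -> bool:
    # sample only on the chunk that completes the prefill
    return num_computed + chunk_size >= seq_len


# e.g. a 1000-token prompt prefilled in two chunks:
# should_sample(0, 512, 1000)   -> False (intermediate chunk)
# should_sample(512, 488, 1000) -> True  (final chunk, sample one token)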
- seq_data_delta = {} - for id, data in seq_data.items(): - seq_data_delta[id] = data.get_delta_and_reset() - seq_group_metadata = SequenceGroupMetadataDelta( - seq_data_delta, - seq_group.request_id, - block_tables, - is_prompt, - do_sample=do_sample, - token_chunk_size=token_chunk_size, - computed_block_nums=common_computed_block_nums, - ) - seq_group_metadata_list.append(seq_group_metadata) - - if allow_async_output_proc: - allow_async_output_proc = self._allow_async_output_proc( - seq_group) - - # Now that the batch has been created, we can assume all blocks in the - # batch will have been computed before the next scheduling invocation. - # This is because the engine assumes that a failure in model execution - # will crash the vLLM instance / will not retry. - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: - self.block_manager.mark_blocks_as_computed( - scheduled_seq_group.seq_group, - scheduled_seq_group.token_chunk_size) - - self._seq_group_metadata_cache[self.next_cache_id].reset() - - scheduler_time = time.perf_counter() - scheduler_start_time - # Add this to scheduler time to all the sequences that are currently - # running. This will help estimate if the scheduler is a significant - # component in the e2e latency. - for seq_group in self.running: - if seq_group is not None and seq_group.metrics is not None: - if seq_group.metrics.scheduler_time is not None: - seq_group.metrics.scheduler_time += scheduler_time - else: - seq_group.metrics.scheduler_time = scheduler_time - - # Move to next cache (if exists) - self.cache_id = self.next_cache_id - - # Return results - return (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) - - def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: - self.block_manager.fork(parent_seq, child_seq) - - def free_seq(self, seq: Sequence) -> None: - """Free a sequence from a block table.""" - self.block_manager.free(seq) - - def remove_seq_from_computed_blocks_tracker( - self, seq_group: SequenceGroup, - status: Optional[SequenceStatus]) -> None: - seqs = seq_group.get_seqs(status=status) - for seq in seqs: - self._remove_seq_from_computed_blocks_tracker(seq) - - def _remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - """ - Free a sequence computed blocks tracker _seq_id_to_blocks_hashes - and _seq_id_to_num_tokens_computed. - """ - self.block_manager.remove_seq_from_computed_blocks_tracker(seq) - - def _free_finished_seqs(self, seq_group: SequenceGroup) -> None: - """Free finished seqs in a sequence group.""" - for seq in seq_group.get_seqs(): - if seq.is_finished(): - self.free_seq(seq) - - def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: - if seq_group.is_finished(): - # Free cross-attention block table, if it exists - self._free_seq_group_cross_attn_blocks(seq_group) - - # Add the finished requests to the finished requests list. - # This list will be used to update the Mamba cache in the - # next step. 
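The flush-on-read pattern used above for Mamba cache eviction, reduced to its core: finished request ids accumulate between steps and are handed to the model exactly once.

class FinishedIds:  # minimal sketch
    def __init__(self) -> None:
        self._ids: list[str] = []

    def mark_finished(self, request_id: str) -> None:
        self._ids.append(request_id)

    def get_and_reset(self) -> list[str]:
        ids, self._ids = self._ids, []
        return ids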
- self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - def free_finished_seq_groups(self) -> None: - remaining: Deque[SequenceGroup] = deque() - for seq_group in self.running: - self._free_finished_seq_group(seq_group) - if not seq_group.is_finished(): - remaining.append(seq_group) - - self.running = remaining - - # Handle async stopped sequence groups - # (ones that reached max model len) - if self._async_stopped: - for seq_group in self._async_stopped: - self._free_seq_group_cross_attn_blocks(seq_group) - self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - self._async_stopped.clear() - - def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: - self.block_manager.allocate(seq_group) - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - seq.status = SequenceStatus.RUNNING - - def _append_slots( - self, - seq_group: SequenceGroup, - blocks_to_copy: List[Tuple[int, int]], - enable_chunking: bool = False, - ) -> None: - """Appends new slots to the sequences in the given sequence group. - - Args: - seq_group (SequenceGroup): The sequence group containing the - sequences to append slots to. - blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two - ints, the first int is the source block index, and the second - int is the destination block index. This list is updated with - the new source and destination block indices for the appended - slots. - enable_chunking (bool): True if chunked prefill is enabled. - """ - is_prefill: bool = seq_group.is_prefill() - num_lookahead_slots: int = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - for seq in seq_group.get_seqs(status=seq_status): - cows = self.block_manager.append_slots(seq, num_lookahead_slots) - if len(cows) > 0: - blocks_to_copy.extend(cows) - - def _preempt(self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: - # If preemption mode is not specified, we determine the mode as follows: - # We use recomputation by default since it incurs lower overhead than - # swapping. However, when the sequence group has multiple sequences - # (e.g., beam search), recomputation is not currently supported. In - # such a case, we use swapping instead. - # FIXME(woosuk): This makes our scheduling policy a bit bizarre. - # As swapped sequences are prioritized over waiting sequences, - # sequence groups with multiple sequences are implicitly prioritized - # over sequence groups with a single sequence. - # TODO(woosuk): Support recomputation for sequence groups with multiple - # sequences. This may require a more sophisticated CUDA kernel. - if self.user_specified_preemption_mode is None: - if seq_group.get_max_num_running_seqs() == 1: - preemption_mode = PreemptionMode.RECOMPUTE - else: - preemption_mode = PreemptionMode.SWAP - - elif self.user_specified_preemption_mode == "swap": - preemption_mode = PreemptionMode.SWAP - else: - preemption_mode = PreemptionMode.RECOMPUTE - - if self.num_cumulative_preemption % 50 == 0: - logger.warning( - "Sequence group %s is preempted by %s mode because there is " - "not enough KV cache space. This can affect the end-to-end " - "performance. Increase gpu_memory_utilization or " - "tensor_parallel_size to provide more KV cache memory. 
" - "total_num_cumulative_preemption=%d", - seq_group.request_id, - preemption_mode, - self.num_cumulative_preemption + 1, - ) - self.num_cumulative_preemption += 1 - - if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) - elif preemption_mode == PreemptionMode.SWAP: - self._preempt_by_swap(seq_group, blocks_to_swap_out) - else: - raise AssertionError("Invalid preemption mode.") - return preemption_mode - - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - ) -> None: - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - assert len(seqs) == 1 - for seq in seqs: - seq.status = SequenceStatus.WAITING - self.free_seq(seq) - seq.reset_state_for_recompute() - self._free_seq_group_cross_attn_blocks(seq_group) - - def _preempt_by_swap( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - self._swap_out(seq_group, blocks_to_swap_out) - - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: List[Tuple[int, int]], - ) -> None: - mapping = self.block_manager.swap_in(seq_group) - blocks_to_swap_in.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - seq.status = SequenceStatus.RUNNING - - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - if not self.block_manager.can_swap_out(seq_group): - # FIXME(woosuk): Abort the sequence group instead of aborting the - # entire engine. - raise RuntimeError( - "Aborted due to the lack of CPU swap space. Please increase " - "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) - blocks_to_swap_out.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq.status = SequenceStatus.SWAPPED - - def _passed_delay(self, now: float) -> bool: - if self.prev_prompt: - self.last_prompt_latency = now - self.prev_time - self.prev_time, self.prev_prompt = now, False - # Delay scheduling prompts to let waiting queue fill up - if self.scheduler_config.delay_factor > 0 and self.waiting: - earliest_arrival_time = min( - [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ((now - earliest_arrival_time) - > (self.scheduler_config.delay_factor * - self.last_prompt_latency) or not self.running) - else: - passed_delay = True - return passed_delay - - def _get_num_lookahead_slots(self, is_prefill: bool, - enable_chunking: bool) -> int: - """The number of slots to allocate per sequence per step, beyond known - token ids. Speculative decoding uses these slots to store KV activations - of tokens which may or may not be accepted. - """ - return 0 - - def _get_num_new_uncached_and_cached_tokens( - self, - seq_group: SequenceGroup, - status: SequenceStatus, - enable_chunking: bool, - budget: SchedulingBudget, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> Tuple[int, int]: - """ - Returns the number of new uncached and cached tokens to schedule for a - given sequence group that's in a given `status`. - - The API could chunk the number of tokens to compute based on `budget` - if `enable_chunking` is True. If a sequence group has multiple - sequences (e.g., running beam search), it means it is in decoding - phase, so chunking doesn't happen. - - Returns (0, 0) if the new token cannot be computed due to token budget. - - The cached tokens's blocks are already computed, and the attention - backend will reuse the cached blocks rather than recomputing them. 
So - the scheduler could schedule these cached tokens "for free". - - Args: - seq_group: The sequence group to get the number of new tokens to - schedule. - status: The status of the sequences to get the number of new tokens - to schedule. - enable_chunking: Whether to chunk the number of tokens to compute. - budget: The budget to chunk the number of tokens to compute. - partial_prefill_metadata: information about the partial prefills - that are currently running - - - Returns: - A tuple of two ints. The first int is the number of new uncached - tokens to schedule. The second int is the number of cached tokens. - If no more new tokens can be scheduled, returns (0, 0). - """ - num_cached_new_tokens = 0 - num_uncached_new_tokens = 0 - - seqs = seq_group.get_seqs(status=status) - # Compute the number of new uncached and cached tokens for - # each sequence. - for seq in seqs: - if not seq.is_prefill(): - # Decode sequences should always just have 1 uncached token - # TODO(rickyx): Actually is this still correct for multi-step? - num_uncached_new_tokens += 1 - continue - - num_computed_tokens_seq = seq.get_num_computed_tokens() - all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq - if not self.cache_config.enable_prefix_caching: - # If prefix caching is not enabled, all new tokens are uncached. - num_uncached_new_tokens += all_num_new_tokens_seq - continue - - # NOTE: the cache token might be currently in a block that's in an - # evictor meaning that it's not yet allocated. However, we don't - # exclude such tokens in the cache count because it will be - # guaranteed to be allocated later if the sequence can be allocated. - num_cached_tokens_seq = self.block_manager.get_num_cached_tokens( - seq) - - # Sanity check. - if num_cached_tokens_seq < num_computed_tokens_seq: - # This should only happen with chunked prefill, and - # the seq is still in prefill. The `num_cached_tokens_seq` - # is the value we calculated on scheduling the first prefill. - # For subsequent continuous prefill steps, we cached the - # number of cache tokens for the sequence so the cached token - # count could be less than the number of computed tokens. - # See comments on `ComputedBlocksTracker` for more details. - assert ( - seq.is_prefill() and seq.status == SequenceStatus.RUNNING - and self.scheduler_config.chunked_prefill_enabled - ), ("Number of cached tokens should not be less than the " - "number of computed tokens for a sequence that's still " - f"in prefill. But there are {num_cached_tokens_seq} cached " - f"tokens and {num_computed_tokens_seq} computed tokens " - f"for sequence {seq.seq_id}.") - - num_cached_new_tokens_seq = max( - 0, num_cached_tokens_seq - num_computed_tokens_seq) - num_uncached_new_tokens_seq = (all_num_new_tokens_seq - - num_cached_new_tokens_seq) - - num_uncached_new_tokens += num_uncached_new_tokens_seq - num_cached_new_tokens += num_cached_new_tokens_seq - - if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0: - # For a fully cached hit sequence, we actually need to recompute the - # last token. So we need at least 1 uncached token to schedule. - # See ModelRunner._compute_for_prefix_cache_hit for more details. - num_uncached_new_tokens = 1 - num_cached_new_tokens -= 1 - - if enable_chunking and len(seqs) == 1: - # Chunk if a running request cannot fit in the given budget. - # If number of seq > 1, it means it is doing beam search - # in a decode phase. Do not chunk. 
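The per-sequence split computed above, as a standalone helper (assuming prefix caching is enabled and ignoring the chunking step that follows):

def split_new_tokens(seq_len: int, num_computed: int,
                     num_cached: int) -> tuple[int, int]:
    all_new = seq_len - num_computed
    cached_new = max(0, num_cached - num_computed)
    uncached_new = all_new - cached_new
    if uncached_new == 0 and cached_new > 0:
        # a fully cached prompt still recomputes its last token
        uncached_new, cached_new = 1, cached_new - 1
    return uncached_new, cached_new


# e.g. split_new_tokens(100, 0, 100) -> (1, 99): a full prefix hit still
# schedules one uncached token for the final forward pass.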
- num_uncached_new_tokens = self._chunk_new_tokens_to_schedule( - self.scheduler_config, - self.cache_config, - budget, - self._get_prompt_limit(seq_group), - num_uncached_new_tokens, - self.partial_prefill_budget_lookup_list, - partial_prefill_metadata, - ) - - return num_uncached_new_tokens, num_cached_new_tokens - - @staticmethod - def _chunk_new_tokens_to_schedule( - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - budget: SchedulingBudget, - prompt_limit: int, - num_new_tokens: int, - partial_prefill_budget_lookup_list: List[int], - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> int: - """ - Chunks the number of new tokens to schedule based on the budget when - chunked prefill is enabled. - - Args: - scheduler_config: The scheduler config. - cache_config: The cache config. - budget: The budget to chunk the number of tokens to compute. - prompt_limit: The maximum number of tokens allowed in a prompt. - num_new_tokens: The number of new tokens to schedule. - - Returns: - The number of new tokens to schedule after chunking. - """ - remaining_token_budget = budget.remaining_token_budget() - - # Get the number of tokens to allocate to this prefill slot - prefill_slot_budget = ( - remaining_token_budget if partial_prefill_metadata is None else - partial_prefill_budget_lookup_list[ - partial_prefill_metadata.schedulable_prefills]) - - if cache_config.enable_prefix_caching: - # When prefix caching is enabled and we're partially prefilling - # a sequence, we always allocate a number of new tokens that is - # divisible by the block size to avoid partial block matching. - block_size = cache_config.block_size - # Don't exceed either the total budget or slot budget. - # Take min of those and get the next lowest multiple of the - # block size: - remaining_token_budget = ( - min(remaining_token_budget, prefill_slot_budget) // - block_size) * block_size - # NB: In the case where num_new_tokens < budget, we are - # finishing prefill for this sequence, so we do not need to - # allocate a full block. - - num_new_tokens = min(num_new_tokens, remaining_token_budget, - prefill_slot_budget) - - return num_new_tokens diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index c345f17e6614f..e828ac04364ff 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -7,13 +7,11 @@ from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import ModelConfig, VllmConfig -from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors.interface import IOProcessor from vllm.pooling_params import PoolingParams @@ -266,11 +264,7 @@ class EngineClient(ABC): ... @abstractmethod - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[list[SamplerOutput]] = None, - ) -> None: + async def do_log_stats(self) -> None: ... 
@abstractmethod diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 73165c7e4c0ad..757baecea9ce0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -601,11 +601,7 @@ class AsyncLLM(EngineClient): async def is_tracing_enabled(self) -> bool: return self.observability_config.otlp_traces_endpoint is not None - async def do_log_stats( - self, - scheduler_outputs=None, - model_output=None, - ) -> None: + async def do_log_stats(self) -> None: if self.logger_manager: self.logger_manager.log() diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py deleted file mode 100644 index 530907012f704..0000000000000 --- a/vllm/worker/cache_engine.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""CacheEngine class for managing the KV cache.""" -from typing import List - -import torch - -from vllm.attention import get_attn_backend -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig -from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - get_dtype_size, is_pin_memory_available) - -logger = init_logger(__name__) - - -class CacheEngine: - """Manages the KV cache. - - This class is responsible for initializing and managing the GPU and CPU KV - caches. It also provides methods for performing KV cache operations, such - as swapping and copying. - """ - - def __init__( - self, - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - device_config: DeviceConfig, - ) -> None: - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - self.device_config = device_config - - self.head_size = model_config.get_head_size() - # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - self.num_gpu_blocks = cache_config.num_gpu_blocks - if self.num_gpu_blocks: - self.num_gpu_blocks //= parallel_config.pipeline_parallel_size - self.num_cpu_blocks = cache_config.num_cpu_blocks - if self.num_cpu_blocks: - self.num_cpu_blocks //= parallel_config.pipeline_parallel_size - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # Get attention backend. - self.attn_backend = get_attn_backend(self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=model_config.use_mla) - - # Initialize the cache. 
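A sketch of the cache-dtype rule above: "auto" follows the model dtype, otherwise an explicit string is mapped to a torch dtype. The table here is an illustrative stand-in, not the full mapping:

import torch

_KV_CACHE_DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16}


def resolve_kv_cache_dtype(cache_dtype: str,
                           model_dtype: torch.dtype) -> torch.dtype:
    if cache_dtype == "auto":
        return model_dtype
    return _KV_CACHE_DTYPES[cache_dtype]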
- self.gpu_cache = self._allocate_kv_cache( - self.num_gpu_blocks, self.device_config.device_type) - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") - - def _allocate_kv_cache( - self, - num_blocks: int, - device: str, - ) -> List[torch.Tensor]: - """Allocates KV cache on the specified device.""" - kv_cache_generic_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[torch.Tensor] = [] - try: - kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order( - ) - except (AttributeError, NotImplementedError): - kv_cache_stride_order = tuple(range(len(kv_cache_generic_shape))) - - # The allocation respects the backend-defined stride order to ensure - # the semantic remains consistent for each backend. We first obtain the - # generic kv cache shape and then permute it according to the stride - # order which could result in a non-contiguous tensor. - kv_cache_allocation_shape = tuple(kv_cache_generic_shape[i] - for i in kv_cache_stride_order) - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros( - kv_cache_allocation_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device).permute(*kv_cache_stride_order) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - def swap_in(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i], - src_to_dst) - - def swap_out(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i], - src_to_dst) - - def copy(self, src_to_dsts: torch.Tensor) -> None: - self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - - @staticmethod - def get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - key_cache_entry = num_heads * head_size - - # For MLA there is no value cache, since the latent vector - # is joint keys and values. 
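To make the size formula that closes out get_cache_block_size below concrete, here is the arithmetic with hypothetical Llama-style numbers; none of these values come from the patch itself:

    # 32 attention layers, 8 KV heads, head_size 128, block_size 16,
    # and a 2-byte (fp16) cache dtype.
    num_attention_layers, block_size, dtype_size = 32, 16, 2
    key_cache_entry = 8 * 128                 # num_heads * head_size
    value_cache_entry = key_cache_entry       # would be 0 under MLA
    total = num_attention_layers * block_size * (key_cache_entry
                                                 + value_cache_entry)
    assert dtype_size * total == 2 * 1024 * 1024   # 2 MiB per KV block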
- value_cache_entry = key_cache_entry if not model_config.use_mla else 0 - total = num_attention_layers * cache_config.block_size * \ - (key_cache_entry + value_cache_entry) - - dtype_size = get_dtype_size(dtype) - return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py deleted file mode 100644 index bab89586b0f2c..0000000000000 --- a/vllm/worker/model_runner.py +++ /dev/null @@ -1,2031 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import gc -import inspect -import itertools -import time -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Tuple, Type, TypeVar, Union) - -import numpy as np -import torch -import torch.distributed -import torch.nn as nn -from tqdm.auto import tqdm - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.attention.backends.abstract import AttentionState -from vllm.attention.backends.utils import CommonAttentionState -from vllm.compilation.counter import compilation_counter -from vllm.config import CompilationLevel, VllmConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, - graph_capture) -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata, SamplingMetadataCache -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - get_sampler) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import (supports_lora, supports_mrope, - supports_multimodal) -from vllm.model_executor.models.utils import set_cpu_offload_max_bytes -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, - async_tensor_h2d, flatten_2d_lists, - is_pin_memory_available, supports_dynamo, - weak_ref_tensor) -from vllm.worker.model_runner_base import ( - InputProcessingError, ModelRunnerBase, ModelRunnerInputBase, - ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -LORA_WARMUP_RANK = 8 - -_NUM_WARMUP_ITERS = 2 - -TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") - -# For now, bump up cache limits for recompilations during CUDA graph warmups. 
-torch._dynamo.config.cache_size_limit = 128 -torch._dynamo.config.accumulated_cache_size_limit = 128 - - -@dataclass(frozen=True) -class ModelInputForGPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. - """ - input_tokens: Optional[torch.Tensor] = None - inputs_embeds: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - previous_hidden_states: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForGPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForGPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - # Exclude `async_callback` to be able to pickle this object - def __getstate__(self): - state = self.__dict__.copy() - del state["async_callback"] - return state - - # TODO: What happens when we depickle this object? - # How can we update this callback to properly pass it to the engine? - def __setstate__(self, state): - self.__dict__.update(state) - self.__dict__.update({'async_callback': None}) - - -@dataclass(frozen=True) -class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. 
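Stepping back to the __getstate__/__setstate__ pair above: it is the standard trick for carrying one non-picklable field through tensor-dict broadcast. A self-contained sketch of the same pattern, with an invented class and payload:

    import pickle

    class Carrier:
        """Drop a non-picklable field on serialize, restore None on load."""
        def __init__(self, payload, async_callback=None):
            self.payload = payload
            self.async_callback = async_callback

        def __getstate__(self):
            state = self.__dict__.copy()
            del state["async_callback"]
            return state

        def __setstate__(self, state):
            self.__dict__.update(state)
            self.async_callback = None

    c = pickle.loads(pickle.dumps(Carrier([1, 2], async_callback=lambda: None)))
    assert c.payload == [1, 2] and c.async_callback is None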
- is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForGPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): - """Build ModelInputForGPU from SequenceGroupMetadata.""" - - # Note: ideally we would be using a dataclass(kw_only=True) - # here, so that this can be subclassed easily, - # but kw_only is not supported in python<3.10. - class InterDataForSeqGroup: - """Intermediate data for the current sequence group.""" - - def simple_reinit(self): - self.input_tokens[0].clear() # type: ignore - self.inputs_embeds = None # type: ignore - self.input_positions[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore - self.seq_lens[0] = 0 # type: ignore - self.orig_seq_lens[0] = 0 # type: ignore - self.prompt_lens[0] = 0 # type: ignore - self.query_lens[0] = 0 # type: ignore - self.context_lens[0] = 0 # type: ignore - self.curr_sliding_window_blocks[0] = 0 # type: ignore - self.lora_index_mapping.clear() # type: ignore - self.lora_prompt_mapping.clear() # type: ignore - self.lora_requests.clear() # type: ignore - - def __init__( - self, - *, - # From sequence group metadata. - request_id: str, - seq_ids: List[int], - is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], - n_seqs: int = 0, - - # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - input_positions: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, - - # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, - # This is used in the dual-chunk flash attention backend. - prompt_lens: Optional[List[int]] = None, - # The query length. - query_lens: Optional[List[int]] = None, - # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, - # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, - - # LoRA inputs. - lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, - - # Multi-modal inputs. 
- multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ - str, MultiModalPlaceholderMap]] = None, - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False, - reinit: bool = False, - reinit_use_defaults: bool = False, - encoder_seq_len: int = 0, - ): - if reinit: - assert len(self.seq_ids) == len(seq_ids) # type: ignore - for i, seq_id in enumerate(seq_ids): - self.seq_ids[i] = seq_id # type: ignore - else: - self.seq_ids = seq_ids - - self.request_id = request_id - self.is_prompt = is_prompt - self.block_tables = block_tables - self.computed_block_nums = computed_block_nums - self.n_seqs = n_seqs - self.encoder_seq_len = encoder_seq_len - - if reinit: - if len(self.seq_ids) == 1 and reinit_use_defaults: - self.simple_reinit() - else: - if input_tokens: - self.input_tokens = input_tokens - else: - for seq_id in range(len(self.seq_ids)): - self.input_tokens[seq_id].clear() - - self.inputs_embeds = inputs_embeds - - if input_positions: - self.input_positions = input_positions - else: - for seq_id in range(len(self.seq_ids)): - self.input_positions[seq_id].clear() - - self.mrope_input_positions = None - - if seq_lens: - self.seq_lens = seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.seq_lens[seq_id] = 0 - - if orig_seq_lens: - self.orig_seq_lens = orig_seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.orig_seq_lens[seq_id] = 0 - - if prompt_lens: - self.prompt_lens = prompt_lens - else: - for seq_id in range(len(self.seq_ids)): - self.prompt_lens[seq_id] = 0 - - if query_lens: - self.query_lens = query_lens - else: - for seq_id in range(len(self.seq_ids)): - self.query_lens[seq_id] = 0 - - if context_lens: - self.context_lens = context_lens - else: - for seq_id in range(len(self.seq_ids)): - self.context_lens[seq_id] = 0 - - if curr_sliding_window_blocks: - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks - else: - for seq_id in range(len(self.seq_ids)): - self.curr_sliding_window_blocks[seq_id] = 0 - - if lora_index_mapping: - self.lora_index_mapping = lora_index_mapping - else: - self.lora_index_mapping.clear() - - if lora_prompt_mapping: - self.lora_prompt_mapping = lora_prompt_mapping - else: - self.lora_prompt_mapping.clear() - - if lora_requests: - self.lora_requests = lora_requests - else: - self.lora_requests.clear() - - else: - self.input_tokens = input_tokens or [] - self.inputs_embeds = inputs_embeds - self.input_positions = input_positions or [] - self.mrope_input_positions = mrope_input_positions or None - self.seq_lens = seq_lens or [] - self.orig_seq_lens = orig_seq_lens or [] - self.prompt_lens = prompt_lens or [] - self.query_lens = query_lens or [] - self.context_lens = context_lens or [] - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks or [] - - self.lora_index_mapping = lora_index_mapping or [] - self.lora_prompt_mapping = lora_prompt_mapping or [] - self.lora_requests = lora_requests or set() - - self.multi_modal_kwargs = multi_modal_kwargs - self.multi_modal_placeholder_maps = multi_modal_placeholder_maps - self.prefix_cache_hit = prefix_cache_hit - - self.n_seqs = len(self.seq_ids) - - if not reinit: - self.__post_init__() - - def __post_init__(self): - self.n_seqs = len(self.seq_ids) - - self.input_tokens = [[] for _ in range(self.n_seqs)] - self.input_positions = [[] for _ in range(self.n_seqs)] - self.mrope_input_positions = None - self.seq_lens = [0] * self.n_seqs - self.orig_seq_lens = [0] * self.n_seqs - self.prompt_lens 
= [0] * self.n_seqs - self.query_lens = [0] * self.n_seqs - self.context_lens = [0] * self.n_seqs - self.curr_sliding_window_blocks = [0] * self.n_seqs - - self.lora_index_mapping = [] - self.lora_prompt_mapping = [] - - def __repr__(self) -> str: - return (f"InterDataForSeqGroup(" - f"request_id={self.request_id}, " - f"seq_ids={self.seq_ids}, " - f"is_prompt={self.is_prompt}, " - f"block_tables={self.block_tables}, " - f"computed_block_nums={self.computed_block_nums}, " - f"n_seqs={self.n_seqs}, " - f"input_tokens={self.input_tokens}, " - f"inputs_embeds.shape=" - f"{getattr(self.inputs_embeds, 'shape', None)}, " - f"input_positions={self.input_positions}, " - f"mrope_input_positions={self.mrope_input_positions}, " - f"seq_lens={self.seq_lens}, " - f"orig_seq_lens={self.orig_seq_lens}, " - f"query_lens={self.query_lens}, " - f"context_lens={self.context_lens}, " - f"multi_modal_kwargs={self.multi_modal_kwargs}") - - def gen_inter_data_builder(self, num_seqs: int): - return lambda: ModelInputForGPUBuilder.InterDataForSeqGroup( - request_id="", - seq_ids=[0] * num_seqs, - is_prompt=True, - block_tables=None, - computed_block_nums=[]) - - def init_cached_inter_data(self, *args, **kwargs): - assert len(args) == 0 - assert "seq_ids" in kwargs - seq_ids = kwargs["seq_ids"] - num_seqs = len(seq_ids) - - # The inter-data cache is per model_runner - inter_data_cache = self.runner.inter_data_cache - if num_seqs not in inter_data_cache: - inter_data_cache[num_seqs] = PyObjectCache( - self.gen_inter_data_builder(num_seqs)) - - obj = inter_data_cache[num_seqs].get_object() - obj.__init__(*args, **kwargs) - return obj - - def reset_cached_inter_data(self): - for cache in self.runner.inter_data_cache.values(): - cache.reset() - - def __init__(self, - runner: "GPUModelRunnerBase", - finished_requests_ids: Optional[List[str]] = None): - super().__init__() - # Compute functions for each sequence in a sequence group. - # WARNING: The order of the functions matters! - self.per_seq_compute_fns = [ - self._compute_lens, - self._compute_for_prefix_cache_hit, - self._compute_for_sliding_window, - self._compute_lora_input, - ] - # Compute functions for each sequence group. - # WARNING: The order of the functions matters! - self.per_seq_group_compute_fns = [ - self._compute_multi_modal_input, - ] - - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.scheduler_config = self.runner.scheduler_config - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.enable_lora = self.runner.lora_config is not None - - # Attention metadata inputs. - if self.attn_backend is not None: - # spec decode (e.g. Medusa) does not have atten backend - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self)) - - # Engine/Model configurations. - self.chunked_prefill_enabled = ( - self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled) - if self.sliding_window is not None: - self.sliding_window_blocks = ( - self.sliding_window + self.block_size - 1) // self.block_size - self.block_aligned_sliding_window = \ - self.sliding_window_blocks * self.block_size - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.finished_requests_ids = finished_requests_ids - - # if the current batch is decode-only. - # will be set to False if there is any non-decode request. 
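The init_cached_inter_data path above leans on a per-num_seqs object cache so InterDataForSeqGroup instances are re-initialized rather than reallocated on every scheduler step. A minimal pool with the same get/reset contract, assuming PyObjectCache behaves roughly this way:

    class TinyObjectCache:
        """Hand out pooled objects; reuse them after reset()."""
        def __init__(self, factory):
            self._factory = factory
            self._pool, self._next = [], 0

        def get_object(self):
            if self._next == len(self._pool):
                self._pool.append(self._factory())
            obj = self._pool[self._next]
            self._next += 1
            return obj

        def reset(self):  # objects stay allocated for the next step
            self._next = 0

    cache = TinyObjectCache(list)
    a = cache.get_object()
    cache.reset()
    assert cache.get_object() is a   # same object reused, not reallocated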
- self.decode_only = True - - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. - self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] - - self.attn_metadata_builder.prepare() - - def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Compute context length, sequence length and tokens - for the given sequence data. - """ - seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] - token_chunk_size = seq_group_metadata.token_chunk_size - - # Compute context length (the number of tokens that are - # already computed) and sequence length (total number of tokens). - - seq_len = seq_data.get_len() - if inter_data.is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.model_config.is_encoder_decoder: - context_len = seq_len - 1 - else: - context_len = seq_data.get_num_computed_tokens() - - # Compute tokens. - if seq_data.prompt_embeds is None: - tokens = seq_data.get_token_ids()[context_len:seq_len] - prompt_embeds = None - else: - tokens = [0] * (seq_len - context_len) - prompt_embeds = seq_data.get_token_embeddings( - )[context_len:seq_len] - - inter_data.seq_lens[seq_idx] = seq_len - inter_data.orig_seq_lens[seq_idx] = seq_len - inter_data.prompt_lens[seq_idx] = seq_data.get_prompt_len() - inter_data.context_lens[seq_idx] = context_len - inter_data.input_tokens[seq_idx].extend(tokens) - inter_data.inputs_embeds = prompt_embeds - inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) - inter_data.query_lens[seq_idx] = seq_len - context_len - - if seq_data.mrope_position_delta is not None: - if inter_data.mrope_input_positions is None: - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - - inter_data.mrope_input_positions[ - seq_idx] = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - - def _compute_for_prefix_cache_hit( - self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Check if hit prefix cache (i.e., some blocks are already computed). - If hit, update input tokens and positions to only compute the - remaining blocks. - """ - computed_block_nums = inter_data.computed_block_nums - - # Note that prefix caching does not support sliding window. - prefix_cache_hit = (computed_block_nums is not None - and len(computed_block_nums) > 0 - and self.sliding_window is None - and inter_data.is_prompt) - inter_data.prefix_cache_hit = prefix_cache_hit - - if not prefix_cache_hit: - return - - assert computed_block_nums is not None - # The cache hit prompt tokens in this sequence. Note that - # this may be larger than the sequence length if chunked - # prefill is enabled. - prefix_cache_len = len(computed_block_nums) * self.block_size - seq_group_metadata.seq_data[inter_data.seq_ids[ - seq_idx]].update_num_cached_tokens(prefix_cache_len) - - # The number of so far computed prompt tokens in this sequence. - context_len = inter_data.context_lens[seq_idx] - # The total number of prompt tokens in this sequence. - # When chunked prefill is enabled, this is the token number of - # computed chunks + current chunk. - seq_len = inter_data.seq_lens[seq_idx] - if prefix_cache_len <= context_len: - # We already passed the cache hit region, - # so do normal computation. - pass - elif context_len < prefix_cache_len < seq_len: - # Partial hit. 
Compute the missing part. - uncomputed_start = prefix_cache_len - context_len - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][uncomputed_start:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][uncomputed_start:] - context_len = prefix_cache_len - - inter_data.context_lens[seq_idx] = context_len - inter_data.query_lens[ - seq_idx] = inter_data.seq_lens[seq_idx] - context_len - elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last token to avoid - # erroneous behavior. FIXME: Ideally we should directly - # mark all tokens as computed in the scheduler and do not - # schedule this sequence, so this case should not happen. - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][-1:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][-1:] - inter_data.query_lens[seq_idx] = 1 - inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 - - def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Update seq_len and curr_sliding_window_block for the given - sequence data (only required by decoding) if sliding window is enabled. - """ - curr_sliding_window_block = 0 - sliding_seq_len = inter_data.seq_lens[seq_idx] - if not inter_data.is_prompt and self.sliding_window is not None: - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle sliding window attn. - curr_sliding_window_block = self.sliding_window_blocks - # number of elements in last block - suff_len = inter_data.seq_lens[seq_idx] % self.block_size - sliding_seq_len = min(inter_data.seq_lens[seq_idx], - self.block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_block += 1 - - inter_data.curr_sliding_window_blocks[ - seq_idx] = curr_sliding_window_block - inter_data.seq_lens[seq_idx] = sliding_seq_len - - def _compute_lora_input(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """If LoRA is enabled, compute LoRA index and prompt mapping.""" - if not self.enable_lora: - return - - lora_id = seq_group_metadata.lora_int_id - if lora_id > 0: - inter_data.lora_requests.add(seq_group_metadata.lora_request) - query_len = inter_data.query_lens[seq_idx] - inter_data.lora_index_mapping.append([lora_id] * query_len) - sampling_params = seq_group_metadata.sampling_params - if sampling_params and sampling_params.prompt_logprobs is not None: - inter_data.lora_prompt_mapping.append([lora_id] * query_len) - elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: - inter_data.lora_prompt_mapping.append([lora_id]) - else: - inter_data.lora_prompt_mapping.append([]) - - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If multi-modal data is given, add it to the input.""" - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - positions = inter_data.input_positions[0] - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(positions[0], positions[0] + len(positions))) - - # M-RoPE requires mrope_positions even for plain text; return early - # when mm_kwargs is empty only if inter_data.is_prompt is False.
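Rewinding to _compute_for_prefix_cache_hit above, its three branches reduce to: already past the cached region (no-op), partial hit (trim the cached prefix), and full hit (recompute only the last token). A standalone sketch under those assumptions:

    def apply_prefix_cache_hit(tokens, context_len, seq_len, prefix_cache_len):
        """Return (tokens_to_compute, new_context_len)."""
        if prefix_cache_len <= context_len:
            return tokens, context_len             # hit region already passed
        if context_len < prefix_cache_len < seq_len:
            skip = prefix_cache_len - context_len  # partial hit: trim prefix
            return tokens[skip:], prefix_cache_len
        return tokens[-1:], seq_len - 1            # full hit: last token only

    # 2 cached blocks of 16 tokens cover 32 of this 40-token chunk:
    toks, ctx = apply_prefix_cache_hit(list(range(40)), 0, 40, 32)
    assert len(toks) == 8 and ctx == 32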
- if not mm_kwargs and not inter_data.is_prompt: - return - - inter_data.multi_modal_kwargs = mm_kwargs - inter_data.multi_modal_placeholder_maps = placeholder_maps - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", - None) - - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) - hf_config = self.runner.model_config.hf_config - - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[ - inter_data.seq_ids[seq_idx]] - token_ids = seq_data.get_token_ids() - - if supports_mrope(self.runner.model): - mrope_input_positions, mrope_position_delta = \ - self.runner.model.get_mrope_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - mrope_input_positions = mrope_input_positions.tolist() - else: - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - - seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[ - seq_idx] = mrope_input_positions - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - """Add a sequence group to the builder.""" - seq_ids = seq_group_metadata.seq_data.keys() - n_seqs = len(seq_ids) - is_prompt = seq_group_metadata.is_prompt - - if is_prompt: - assert n_seqs == 1 - self.decode_only = False - - encoder_seq_len = 0 - - if self.runner.model_config.is_encoder_decoder: - encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() - - inter_data = self.init_cached_inter_data( - request_id=seq_group_metadata.request_id, - seq_ids=seq_ids, - is_prompt=is_prompt, - block_tables=seq_group_metadata.block_tables, - computed_block_nums=seq_group_metadata.computed_block_nums, - reinit=True, - reinit_use_defaults=True, - encoder_seq_len=encoder_seq_len) - - self.inter_data_list.append(inter_data) - - for seq_idx in range(n_seqs): - for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) - for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) - - def _use_captured_graph(self, - batch_size: int, - decode_only: bool, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> bool: - return (decode_only and not self.runner.model_config.enforce_eager - and max_decode_seq_len <= self.runner.max_seq_len_to_capture - and max_encoder_seq_len <= self.runner.max_seq_len_to_capture - and batch_size <= self.runner.max_batchsize_to_capture) - - def _get_cuda_graph_pad_size(self, - num_seqs: int, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> int: - """ - Determine the number of padding sequences required for running 
in - CUDA graph mode. Returns -1 if CUDA graphs cannot be used. - - In the multi-step + chunked-prefill case, only the first step - has Prefills (if any). The rest of the steps are guaranteed to be all - decodes. In this case, we set up the padding as if all the sequences - are decodes so we may run all steps except the first step in CUDA graph - mode. - - Args: - num_seqs (int): Number of sequences scheduled to run. - max_decode_seq_len (int): Greatest of all the decode sequence - lengths. Used only in checking the viablility of using - CUDA graphs. - max_encoder_seq_len (int, optional): Greatest of all the encode - sequence lengths. Defaults to 0. Used only in checking the - viability of using CUDA graphs. - Returns: - int: Returns the determined number of padding sequences. If - CUDA graphs is not viable, returns -1. - """ - decode_only = self.decode_only - if not decode_only: - # Early exit so we can treat num_seqs as the batch_size below. - return -1 - - # batch_size out of this function refers to the number of input - # tokens being scheduled. This conflation of num_seqs as batch_size - # is valid as this is a decode-only case. - batch_size = num_seqs - if not self._use_captured_graph(batch_size, decode_only, - max_decode_seq_len, - max_encoder_seq_len): - return -1 - - graph_batch_size = self.runner.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - return graph_batch_size - batch_size - - def build(self) -> ModelInputForGPU: - """Finalize the builder intermediate data and - create on-device tensors. - """ - # Combine and flatten intermediate data. - input_tokens = list[int]() - inputs_embeds_list = list[torch.Tensor]() - for inter_data in self.inter_data_list: - for cur_input_tokens in inter_data.input_tokens: - input_tokens.extend(cur_input_tokens) - if inter_data.inputs_embeds is not None: - inputs_embeds_list.append( - inter_data.inputs_embeds.to( - dtype=self.runner.model_config.dtype, - device=self.runner.device)) - inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_list) == 0: - inputs_embeds = None - else: - inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( - dtype=self.runner.model_config.dtype, - device=self.runner.device) - assert len(inputs_embeds) == len(input_tokens) - - if not input_tokens and inputs_embeds is None: - # This may happen when all prefill requests hit - # prefix caching and there is no decode request. 
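The padding rule in _get_cuda_graph_pad_size above rounds a decode-only batch up to the nearest captured graph size and bails out with -1 otherwise. Sketched with a hypothetical capture-size list:

    import bisect

    CAPTURE_SIZES = [1, 2, 4, 8, 16, 24, 32]   # hypothetical captured sizes

    def cuda_graph_pad_size(num_seqs: int, decode_only: bool) -> int:
        """-1 means 'run eager'; otherwise the number of pad sequences."""
        if not decode_only or num_seqs > CAPTURE_SIZES[-1]:
            return -1
        graph_batch_size = CAPTURE_SIZES[
            bisect.bisect_left(CAPTURE_SIZES, num_seqs)]
        return graph_batch_size - num_seqs

    assert cuda_graph_pad_size(5, True) == 3     # 5 seqs replay the 8-graph
    assert cuda_graph_pad_size(5, False) == -1   # any prefill disables replay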
- return self.model_input_cls() - - mrope_input_positions: Optional[List[List[int]]] = None - if any(inter_data.mrope_input_positions is not None - for inter_data in self.inter_data_list): - mrope_input_positions = [[] for _ in range(3)] - for idx in range(3): - for inter_data in self.inter_data_list: - msections = inter_data.mrope_input_positions - if msections is None: - for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend( - _seq_input_positions) - else: - for _seq_mrope_input_positions in msections: - mrope_input_positions[idx].extend( - _seq_mrope_input_positions[idx]) - input_positions = None - else: - input_positions = [] - for inter_data in self.inter_data_list: - for cur_input_positions in inter_data.input_positions: - input_positions.extend(cur_input_positions) - - seq_lens = [] - query_lens = [] - max_decode_seq_len = 0 - max_encoder_seq_len = 0 - for inter_data in self.inter_data_list: - seq_lens.extend(inter_data.seq_lens) - query_lens.extend(inter_data.query_lens) - if not inter_data.is_prompt: - max_decode_seq_len = max(max_decode_seq_len, - max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder: - max_encoder_seq_len = max(max_encoder_seq_len, - inter_data.encoder_seq_len) - - # Mapping from request IDs to sequence IDs. Used for Jamba models - # that manages the cache by itself. - request_ids_to_seq_ids = { - data.request_id: data.seq_ids - for data in self.inter_data_list - } - - cuda_graph_pad_size = self._get_cuda_graph_pad_size( - num_seqs=len(seq_lens), - max_decode_seq_len=max_decode_seq_len, - max_encoder_seq_len=max_encoder_seq_len) - - batch_size = len(input_tokens) - if cuda_graph_pad_size != -1: - # If cuda graph can be used, pad tensors accordingly. - # See `capture_model` API for more details. - # vLLM uses cuda graph only for decoding requests. - batch_size += cuda_graph_pad_size - - # Tokens and positions. - if cuda_graph_pad_size: - input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) - assert self.runner.device is not None - input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, - self.runner.device, - self.runner.pin_memory) - - if mrope_input_positions is not None: - for idx in range(3): - mrope_input_positions[idx].extend( - itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(mrope_input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - else: - input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - # Sequence and query lengths. - if cuda_graph_pad_size: - seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) - - # Attention metadata. - attn_metadata = self.attn_metadata_builder.build( - seq_lens, query_lens, cuda_graph_pad_size, batch_size) - - # LoRA data. 
- lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(r for data in self.inter_data_list - for r in data.lora_requests) - lora_index_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_index_mapping) - for inter_data in self.inter_data_list - ]) - if cuda_graph_pad_size: - lora_index_mapping.extend( - itertools.repeat(0, cuda_graph_pad_size)) - lora_prompt_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_prompt_mapping) - for inter_data in self.inter_data_list - ]) - - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=not self.decode_only)) - - # Multi-modal data. - multi_modal_kwargs_list = [ - data.multi_modal_kwargs for data in self.inter_data_list - if data.multi_modal_kwargs is not None - ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return self.model_input_cls( - input_tokens=input_tokens_tensor, - inputs_embeds=inputs_embeds, - input_positions=input_positions_tensor, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_mapping=lora_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - request_ids_to_seq_ids=request_ids_to_seq_ids, - finished_requests_ids=self.finished_requests_ids) - - -class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - """ - Helper class for shared methods between GPU model runners. - """ - _model_input_cls: Type[TModelInputForGPU] - _builder_cls: Type[ModelInputForGPUBuilder] - builder: ModelInputForGPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = \ - self.vllm_config.compilation_config.max_capture_size - - # - self.graph_runners: List[Dict[Tuple[int, bool], CUDAGraphRunner]] = [ - {} for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.graph_memory_pool: Optional[Tuple[ - int, int]] = None # Set during graph capture. - - self.has_inner_state = model_config.has_inner_state - - self.in_profile_run = False - - # When using CUDA graph, the input block tables must be padded to - # max_seq_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max seq len to capture / block size). 
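The comment above motivates keeping the block table in a preallocated numpy buffer; the real buffers follow below. A toy version of the copy-per-step idea, with invented shapes and block ids:

    import numpy as np

    max_batch, max_blocks = 8, 4
    graph_block_tables = np.zeros((max_batch, max_blocks), dtype=np.int32)

    # Per step, only live rows are overwritten; the buffer is never
    # reallocated, keeping Python-side overhead off the hot path.
    step_tables = [[7, 9], [3]]               # hypothetical per-seq block ids
    for row, blocks in enumerate(step_tables):
        graph_block_tables[row, :len(blocks)] = blocks
        graph_block_tables[row, len(blocks):] = 0

    assert graph_block_tables[0, :2].tolist() == [7, 9]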
- self.graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - self.cross_layer_shared_graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - # Attention-free but stateful models like Mamba need a placeholder attn - # backend, as the attention metadata is needed to manage internal state. - # However we must bypass attention selection altogether for some models - # used for speculative decoding to avoid a divide-by-zero in - # model_config.get_head_size() - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) if needs_attn_backend else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization - self.model: nn.Module # Set after load_model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.sampler = get_sampler() - - set_cpu_offload_max_bytes( - int(self.cache_config.cpu_offload_gb * 1024**3)) - - # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceGroupToSample object. In Pipeline-Parallel, we have - # more than 1 Scheduler, resulting in a potential back-to-back - # prepare_model_inputs() call. This clobbers the cached - # SequenceGroupToSample objects, as we reset the cache during - # every prepare_model_inputs() call. - self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - if hasattr(self, "_builder_cls"): - # multi-step model runner does not have `_builder_cls` - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - logger.info("Starting to load model %s...", self.model_config.model) - with DeviceMemoryProfiler(self.device) as m: - time_before_load = time.perf_counter() - self.model = get_model(vllm_config=self.vllm_config) - if self.lora_config: - assert supports_lora( - self.model - ), f"{self.model.__class__.__name__} does not support LoRA yet." 
- - if supports_multimodal(self.model): - logger.warning( - "Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model.") - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.vllm_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - ) - - self.model = self.lora_manager.create_lora_manager(self.model) - time_after_load = time.perf_counter() - - self.model_memory_usage = m.consumed_memory - logger.info("Model loading took %.4f GiB and %.6f seconds", - self.model_memory_usage / GiB_bytes, - time_after_load - time_before_load) - - - if self.vllm_config.compilation_config.level ==\ - CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend( - self.vllm_config) - compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile(self.model, - fullgraph=True, - backend=backend) - - def get_model(self) -> nn.Module: - return self.model - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - from vllm.model_executor.model_loader import ShardedStateLoader - ShardedStateLoader.save_model( - self.model, - path, - pattern=pattern, - max_size=max_size, - ) - - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - from vllm.model_executor.model_loader import TensorizerLoader - TensorizerLoader.save_model( - self.model, - tensorizer_config=tensorizer_config, - model_config=self.model_config, - ) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_seq_len_to_capture + block_size - 1) // block_size - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> TModelInputForGPU: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. 
- """ - self.builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - try: - self.builder.add_seq_group(seq_group_metadata) - except Exception as e: - # Raise an exception that tracks the ID of the bad request - raise InputProcessingError(seq_group_metadata.request_id, - str(e)) from e - - self.builder.reset_cached_inter_data() - - return self.builder.build() # type: ignore - - @contextmanager - def set_in_profile_run(self): - self.in_profile_run = True - try: - yield - finally: - self.in_profile_run = False - - @torch.inference_mode() - def profile_run(self) -> None: - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - self._dummy_run(max_num_batched_tokens, max_num_seqs) - - def _add_dummy_loras(self, num_loras: int) -> list[LoRARequest]: - assert num_loras > 0 - assert self.lora_manager is not None - - dummy_lora_requests: list[LoRARequest] = [] - with self.lora_manager.dummy_lora_cache(): - for idx in range(num_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - return dummy_lora_requests - - def _remove_dummy_loras(self): - # Remove dummy loras. - assert self.lora_manager is not None - self.remove_all_loras() - - def _dummy_run(self, - max_num_batched_tokens: int, - max_num_seqs: int = 1) -> None: - with self.set_in_profile_run(): - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = \ - SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the - # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, - # which needs to be accounted for when calculating the GPU blocks - # for vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. 
" - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data. - multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. - kv_caches = [ - torch.tensor([], dtype=torch.float32, device=self.device) - for _ in range(num_layers) - ] - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = \ - self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - # Disable KV Scale Calculation for dummy data during profile run - if model_input.attn_metadata is not None: - model_input.attn_metadata.enable_kv_scales_calculation = False - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.cuda.synchronize() - if self.lora_config: - self._remove_dummy_loras() - - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - @torch.inference_mode() - def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> int: - """Cuda graph capture a model and return cudagraph memory - consumption in bytes. - - Note that CUDA graph's performance gain is negligible if number - of batched tokens are larger than 200. 
And since CUDA graph - requires fixed-size tensors, supporting large/variable batch - size requires high GPU memory overhead. Thus, vLLM only captures - decoding requests. Mixed batch (chunked prefill + decoding) or - prefill requests are not captured. - - Since it is used for decoding-only, it assumes there's only 1 token - per sequence in the batch. - """ - assert not self.model_config.enforce_eager - logger.info("Capturing cudagraphs for decoding. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI. " - "If out-of-memory error occurs during cudagraph capture," - " consider decreasing `gpu_memory_utilization` or " - "switching to eager mode. You can also reduce the " - "`max_num_seqs` as needed to decrease memory usage.") - start_time = time.perf_counter() - start_free_gpu_memory = torch.cuda.mem_get_info()[0] - - # Prepare dummy inputs. These will be reused for all batch sizes. - max_batch_size = self.max_batchsize_to_capture - input_tokens = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - input_positions = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - inputs_embeds = torch.zeros( - (max_batch_size, self.model_config.get_hidden_size()), - dtype=self.model_config.dtype, - device=self.device) - if self.model_config.uses_mrope: - input_positions = torch.tile(input_positions, - (3, 1)).cuda(device=self.device) - # Prepare dummy previous_hidden_states only if needed by the model. - # This is used by draft models such as EAGLE. - previous_hidden_states = None - if "previous_hidden_states" in inspect.signature( - self.model.forward).parameters: - previous_hidden_states = torch.empty( - [max_batch_size, - self.model_config.get_hidden_size()], - dtype=self.model_config.dtype, - device=self.device) - - intermediate_inputs = None - if not get_pp_group().is_first_rank: - intermediate_inputs = self.model.make_empty_intermediate_tensors( - batch_size=max_batch_size, - dtype=self.model_config.dtype, - device=self.device) - - dummy_lora_id: Optional[int] = None - dummy_lora_request: Optional[LoRARequest] = None - if self.lora_config: - # The goal is to capture the LoRA kernels in cuda graphs. - # For this purpose, a single dummy lora is sufficient. - dummy_lora_requests = self._add_dummy_loras(num_loras=1) - assert len(dummy_lora_requests) == 1 - dummy_lora_request = dummy_lora_requests[0] - dummy_lora_id = dummy_lora_request.lora_int_id - - with self.attn_state.graph_capture(max_batch_size), graph_capture( - self.device) as graph_capture_context: - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - for virtual_engine in range( - self.parallel_config.pipeline_parallel_size): - # We need to not only iterate over batch sizes, but also whether - # to use inputs_embeds or not, hence we use the cartesian - # product.
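The comment above describes the enumeration the capture loop below consumes: every captured batch size crossed with the inputs_embeds flag, largest batch first. In isolation, with hypothetical sizes:

    import itertools

    capture_sizes = [32, 16, 8, 4, 2, 1]      # assumed listed largest-first
    embeds_options = (True, False)            # only if enable_prompt_embeds

    compilation_cases = list(itertools.product(capture_sizes, embeds_options))
    assert compilation_cases[0] == (32, True)   # biggest graph captured first
    assert len(compilation_cases) == 12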
- cudagraph_capture_sizes = self.vllm_config.compilation_config\ - .cudagraph_capture_sizes - cudagraph_inputs_embeds = (( - True, False) if self.model_config.enable_prompt_embeds else - (False, )) - compilation_cases = itertools.product( - cudagraph_capture_sizes, - cudagraph_inputs_embeds, - ) - # Only rank 0 should print progress bar during capture - if get_tensor_model_parallel_rank() == 0: - compilation_cases = tqdm( - list(compilation_cases), - disable=not self.load_config.use_tqdm_on_load, - desc="Capturing CUDA graph shapes") - for batch_size, use_inputs_embeds in compilation_cases: - attn_metadata = ( - self.attn_state.graph_capture_get_metadata_for_batch( - batch_size, - is_encoder_decoder_model=self.model_config. - is_encoder_decoder)) - # Disable KV Scale Calculation for graph capture - attn_metadata.enable_kv_scales_calculation = False - if self.lora_config: - lora_mapping = LoRAMapping( - **dict(index_mapping=[dummy_lora_id] * batch_size, - prompt_mapping=[dummy_lora_id] * batch_size, - is_prefill=False)) - self.set_active_loras(set([dummy_lora_request]), - lora_mapping) - - graph_runner = CUDAGraphRunner( - self.model, self.attn_backend.get_name(), - self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder) - - capture_inputs = { - "input_ids": - input_tokens[:batch_size], - "inputs_embeds": - inputs_embeds[:batch_size] - if use_inputs_embeds else None, - "positions": - input_positions[..., :batch_size], - "intermediate_inputs": - intermediate_inputs[:batch_size] - if intermediate_inputs is not None else None, - "kv_caches": - kv_caches[virtual_engine], - "attn_metadata": - attn_metadata, - "memory_pool": - self.graph_memory_pool, - "stream": - graph_capture_context.stream - } - if previous_hidden_states is not None: - capture_inputs[ - "previous_hidden_states"] = previous_hidden_states[: - batch_size] - - if self.has_inner_state: - # Only used by Mamba-based models CUDA graph atm (Jamba) - capture_inputs.update({ - "seqlen_agnostic_capture_inputs": - self.model.get_seqlen_agnostic_capture_inputs( - batch_size) - }) - if self.model_config.is_encoder_decoder: - # add the additional inputs to capture for - # encoder-decoder models. - self._update_inputs_to_capture_for_enc_dec_model( - capture_inputs) - - with set_forward_context(attn_metadata, self.vllm_config, - virtual_engine): - graph_runner.capture(**capture_inputs) - self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[virtual_engine][( - batch_size, use_inputs_embeds)] = graph_runner - - if self.lora_config: - self._remove_dummy_loras() - - end_time = time.perf_counter() - end_free_gpu_memory = torch.cuda.mem_get_info()[0] - elapsed_time = end_time - start_time - cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory - # This usually takes < 10 seconds. - logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", - elapsed_time, cuda_graph_size / GiB_bytes) - return cuda_graph_size - - def _update_inputs_to_capture_for_enc_dec_model(self, - capture_inputs: Dict[str, - Any]): - """ - Updates the set of input tensors needed for CUDA graph capture in an - encoder-decoder model. - - This method modifies the provided `capture_inputs` dictionary by - adding tensors specific to encoder-decoder specific models that - need to be captured for CUDA Graph replay. - """ - # During the decode phase encoder_input_ids and encoder_positions are - # unset. Do the same thing for graph capture. 
- capture_inputs["encoder_input_ids"] = torch.tensor([], - dtype=torch.long, - device=self.device) - capture_inputs["encoder_positions"] = torch.tensor([], - dtype=torch.long, - device=self.device) - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - -class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): - """ - GPU model runner with sampling step. - """ - _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( - ModelInputForGPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForGPUWithSamplingMetadata: - model_input = \ - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForGPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - if get_pp_group().is_last_rank: - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, model_input.seq_lens, - model_input.query_lens, self.device, self.pin_memory, - generators, self.sampling_metadata_cache) - else: - sampling_metadata = None - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in ModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - self.attn_state.begin_forward(model_input) - - # Currently cuda graph is only supported by the decode phase. - assert model_input.attn_metadata is not None - prefill_meta = model_input.attn_metadata.prefill_metadata - decode_meta = model_input.attn_metadata.decode_metadata - # TODO(andoorve): We can remove this once all - # virtual engines share the same kv cache. 
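-        # NOTE: decode inputs were already padded to a captured graph size
-        # during prepare_model_input, so when use_cuda_graph is set,
-        # input_tokens.shape[0] is guaranteed to match the key of one of the
-        # pre-captured graph runners below.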
- virtual_engine = model_input.virtual_engine - previous_hidden_states = kwargs.get("previous_hidden_states") - if prefill_meta is None and decode_meta.use_cuda_graph: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - use_inputs_embeds = model_input.inputs_embeds is not None - model_executable = self.graph_runners[virtual_engine][( - graph_batch_size, use_inputs_embeds)] - if previous_hidden_states is not None: - previous_hidden_states = torch.cat([ - previous_hidden_states, - torch.empty([ - graph_batch_size - previous_hidden_states.shape[0], - *previous_hidden_states.shape[1:] - ], - dtype=previous_hidden_states.dtype, - device=previous_hidden_states.device) - ]) - else: - model_executable = self.model - - # Receive KV cache in distributed KV cache transfer setting - # In disagg prefill setting, it will also recv hidden states and bypass - # model forwarding - # In KV cache database setting, it will change the model input so that - # we can skip prefilling on tokens that successfully received KV caches - # NOTE: The receive operation is blocking - bypass_model_exec = False - if self.need_recv_kv(model_input, kv_caches): - hidden_or_intermediate_states, bypass_model_exec, model_input = \ - get_kv_transfer_group().recv_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working on, so that we can receive KV for only those - # layers. - model_executable, - model_input, - kv_caches=kv_caches - ) - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - model_kwargs = {} - if previous_hidden_states is not None: - model_kwargs["previous_hidden_states"] = previous_hidden_states - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.cuda.Event(enable_timing=True) - model_forward_end = torch.cuda.Event(enable_timing=True) - model_forward_start.record() - - if not bypass_model_exec: - with set_forward_context(model_input.attn_metadata, - self.vllm_config, virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - **model_kwargs, - ) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Sending KV cache in distributed KV cache transfer setting - # NOTE: the send operation is non-blocking - if self.need_send_kv(model_input, kv_caches): - get_kv_transfer_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current - # worker is working on, so that we can send KV for only those - # layers. - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - # Compute the logits in the last pipeline stage. 
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if self.is_driver_worker: - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the - # latency from the start time of the driver worker to the end - # time of the driver worker. The model forward time will then - # end up covering the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) - - if model_input.inputs_embeds is not None: - if self.is_driver_worker: - sampled_token_ids = [] - valid_outputs = [] - for sequence_group_output in output.outputs: - if len(sequence_group_output.samples) == 0: - continue - assert len(sequence_group_output.samples) == 1 - valid_outputs.append(sequence_group_output) - sampled_token_ids.append( - sequence_group_output.samples[0].output_token) - sampled_token_ids = torch.tensor(sampled_token_ids).to( - self.device) - sampled_token_ids = broadcast_tensor_dict( - {"sampled_token_ids": - sampled_token_ids})["sampled_token_ids"] - else: - sampled_token_ids = broadcast_tensor_dict( - )["sampled_token_ids"] - if len(sampled_token_ids) > 0: - sampled_token_embeds = \ - self.model.get_input_embeddings(sampled_token_ids) - if self.is_driver_worker: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs - for i, sequence_group_output in enumerate(valid_outputs): - sequence_group_output.samples[0].output_embed = \ - sampled_token_embeds[i] - - if not self.is_driver_worker: - return [] - - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - indices = model_input.sampling_metadata.selected_token_indices - if model_input.is_prompt: - hidden_states = hidden_or_intermediate_states.index_select( - 0, indices) - output.prefill_hidden_states = hidden_or_intermediate_states - elif decode_meta.use_cuda_graph: - hidden_states = hidden_or_intermediate_states[:len(indices)] - else: - hidden_states = hidden_or_intermediate_states - - output.hidden_states = hidden_states - - return [output] - - def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor]) -> bool: - """Check if we need to receive kv-cache from the other worker. - We need to receive KV when - 1. current vLLM instance is KV cache consumer/decode vLLM instance - 2. this batch is not a profiling run - 3. this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - if model_input.attn_metadata is None: - raise ValueError("model_input.attn_metadata cannot be None") - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( - not is_profile_run) and is_prefill_run - - def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor]) -> bool: - """Check if we need to send kv-cache to the other worker. - We need to send KV when - 1. current vLLM instance is KV cache producer/prefill vLLM instance - 2. this batch is not a profiling run - 3. 
this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - if model_input.attn_metadata is None: - raise ValueError("model_input.attn_metadata cannot be None") - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_producer and ( - not is_profile_run) and is_prefill_run - - -# NOTE: this is nn.Module so the profiler can properly capture/group -# kernels calls made within the graph -class CUDAGraphRunner(nn.Module): - - def __init__(self, model: nn.Module, backend_name: str, - attn_state: AttentionState, is_encoder_decoder_model: bool): - super().__init__() - self.model = model - self.backend_name = backend_name - self.attn_state = attn_state - - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - self._graph: Optional[torch.cuda.CUDAGraph] = None - self._is_encoder_decoder_model = is_encoder_decoder_model - - @property - def graph(self): - assert self._graph is not None - return self._graph - - def capture( - self, - input_ids: torch.Tensor, - inputs_embeds: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - memory_pool: Optional[Tuple[int, int]], - stream: torch.cuda.Stream, - **kwargs, - ): - assert self._graph is None - # Run the model a few times without capturing the graph. - # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.compile - for _ in range(_NUM_WARMUP_ITERS): - self.model( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - # Wait for the warm up operations to finish before proceeding with - # Graph Capture. - torch.cuda.synchronize() - # Capture the graph. - self._graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - output_hidden_or_intermediate_states = self.model( - input_ids=input_ids, - **({ - "inputs_embeds": inputs_embeds, - } if inputs_embeds is not None else {}), - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - - if isinstance(output_hidden_or_intermediate_states, torch.Tensor): - hidden_or_intermediate_states = weak_ref_tensor( - output_hidden_or_intermediate_states) - elif isinstance(output_hidden_or_intermediate_states, - IntermediateTensors): - hidden_or_intermediate_states = IntermediateTensors( - tensors={ - key: weak_ref_tensor(value) - for key, value in - output_hidden_or_intermediate_states.tensors.items() - }) - - del output_hidden_or_intermediate_states - # make sure `output_hidden_or_intermediate_states` is deleted - # in the graph's memory pool - gc.collect() - torch.cuda.synchronize() - - # Save the input and output buffers. 
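-        # These fixed buffers are the only addresses the captured graph ever
-        # touches; forward() just copies fresh data into them and replays the
-        # graph instead of re-launching kernels.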
-        self.input_buffers = {
-            "input_ids":
-            input_ids,
-            **({
-                "inputs_embeds": inputs_embeds,
-            } if inputs_embeds is not None else {}),
-            "positions":
-            positions,
-            "kv_caches":
-            kv_caches,
-            **self.attn_state.get_graph_input_buffers(
-                attn_metadata, self._is_encoder_decoder_model),
-            **kwargs,
-        }
-        if intermediate_inputs is not None:
-            self.input_buffers.update(intermediate_inputs.tensors)
-        if get_pp_group().is_last_rank:
-            self.output_buffers = {
-                "hidden_states": hidden_or_intermediate_states
-            }
-        else:
-            self.output_buffers = hidden_or_intermediate_states
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor],
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        **kwargs,
-    ) -> torch.Tensor:
-        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
-
-        # Copy the input tensors to the input buffers.
-        self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
-        if positions is not None:
-            # In some cases (e.g. MLA), positions from the attention metadata
-            # are reused but truncated to the original sequence length, so the
-            # tensor is not padded and only the valid prefix can be copied.
-            self.input_buffers["positions"][:positions.shape[0]].copy_(
-                positions, non_blocking=True)
-        if inputs_embeds is not None:
-            self.input_buffers["inputs_embeds"][:inputs_embeds.shape[0]].copy_(
-                inputs_embeds, non_blocking=True)
-
-        if self.backend_name != "NO_ATTENTION":
-            self.input_buffers["slot_mapping"].copy_(
-                attn_metadata.slot_mapping, non_blocking=True)
-
-        self.attn_state.prepare_graph_input_buffers(
-            self.input_buffers, attn_metadata, self._is_encoder_decoder_model)
-
-        if "seqlen_agnostic_capture_inputs" in self.input_buffers:
-            self.model.copy_inputs_before_cuda_graphs(self.input_buffers,
-                                                      **kwargs)
-
-        if "previous_hidden_states" in self.input_buffers:
-            self.input_buffers["previous_hidden_states"].copy_(
-                kwargs["previous_hidden_states"], non_blocking=True)
-
-        if intermediate_tensors is not None:
-            for key in intermediate_tensors.tensors:
-                if key not in ("model_execute_time", "model_forward_time"):
-                    self.input_buffers[key].copy_(intermediate_tensors[key],
-                                                  non_blocking=True)
-        if self._is_encoder_decoder_model:
-            self.input_buffers["encoder_input_ids"].copy_(
-                kwargs['encoder_input_ids'], non_blocking=True)
-            self.input_buffers["encoder_positions"].copy_(
-                kwargs['encoder_positions'], non_blocking=True)
-
-        # Run the graph.
-        self.graph.replay()
-        # Return the output tensor.
- if get_pp_group().is_last_rank: - return self.output_buffers["hidden_states"] - - return self.output_buffers diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py deleted file mode 100644 index 12047bc390737..0000000000000 --- a/vllm/worker/worker.py +++ /dev/null @@ -1,666 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A GPU worker class.""" -import gc -import os -from contextlib import nullcontext -from typing import Dict, List, Optional, Set, Tuple, Type, Union - -import torch -import torch.distributed - -import vllm.envs as envs -from vllm.attention.layer import Attention -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, - memory_profiling) -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class Worker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single GPU. The worker is responsible for - maintaining the KV cache and executing the model on the GPU. In case of - distributed inference, each worker is assigned a partition of the model. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.hf_config.model_type == - model_config.hf_config.model_type) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ("medusa", - "mlp_speculator", - "eagle", - "deepseek_mtp", - "glm4_moe_mtp", - "mimo_mtp", - "ernie_mtp", - "qwen3_next_mtp")) \ - else {"return_hidden_states": True} - - self.model_runner: GPUModelRunnerBase = ModelRunner( - vllm_config=self.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Buffers saved before sleep - self._sleep_saved_buffers: Dict[str, torch.Tensor] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - # only print profiler results on rank 0 - if self.local_rank == 0: - print(self.profiler.key_averages().table( - sort_by="self_cuda_time_total")) - - def sleep(self, level: int = 1) -> None: - free_bytes_before_sleep = torch.cuda.mem_get_info()[0] - - # Save the buffers before level 2 sleep - if level == 2: - model = self.model_runner.model - self._sleep_saved_buffers = { - name: buffer.cpu().clone() - for name, buffer in model.named_buffers() - } - - allocator = CuMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) - free_bytes_after_sleep, total = torch.cuda.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - used_bytes = total - free_bytes_after_sleep - assert freed_bytes >= 0, "Memory usage increased after sleeping." 
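-        # With level 1 the weights are offloaded to CPU and restored on
-        # wake_up; with level 2 they are discarded entirely, so only the
-        # buffers saved above survive and are copied back by wake_up.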
-        logger.info(
-            "Sleep mode freed %.2f GiB memory, "
-            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
-            used_bytes / GiB_bytes)
-
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        allocator = CuMemAllocator.get_instance()
-        allocator.wake_up(tags=tags)
-
-        # Restore the buffers after level 2 sleep
-        if len(self._sleep_saved_buffers):
-            model = self.model_runner.model
-            for name, buffer in model.named_buffers():
-                if name in self._sleep_saved_buffers:
-                    buffer.data.copy_(self._sleep_saved_buffers[name].data)
-            self._sleep_saved_buffers = {}
-
-    def init_device(self) -> None:
-        if self.device_config.device.type == "cuda":
-            # torch.distributed.all_reduce does not free the input tensor until
-            # the synchronization point. This causes the memory usage to grow
-            # as the number of all_reduce calls increases. This env var disables
-            # this behavior.
-            # Related issue:
-            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
-            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-
-            # This env var set by Ray causes exceptions with graph building.
-            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-            self.device = torch.device(f"cuda:{self.local_rank}")
-            torch.cuda.set_device(self.device)
-
-            _check_if_gpu_supports_dtype(self.model_config.dtype)
-            gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
-            self.baseline_snapshot = MemorySnapshot()
-        else:
-            raise RuntimeError(
-                f"Unsupported device type: {self.device_config.device}")
-        # Initialize the distributed environment.
-        init_worker_distributed_environment(self.vllm_config, self.rank,
-                                            self.distributed_init_method,
-                                            self.local_rank)
-        # Set random seed.
-        set_random_seed(self.model_config.seed)
-
-    def load_model(self):
-        if self.vllm_config.model_config.enable_sleep_mode:
-            allocator = CuMemAllocator.get_instance()
-            assert allocator.get_current_usage() == 0, (
-                "Sleep mode can only be "
-                "used for one instance per process.")
-            context = allocator.use_memory_pool(tag="weights")
-        else:
-            context = nullcontext()
-        with context:
-            self.model_runner.load_model()
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ) -> None:
-        self.model_runner.save_sharded_state(
-            path,
-            pattern=pattern,
-            max_size=max_size,
-        )
-
-    def save_tensorized_model(
-        self,
-        tensorizer_config: TensorizerConfig,
-    ) -> None:
-        self.model_runner.save_tensorized_model(
-            tensorizer_config=tensorizer_config, )
-
-    @torch.inference_mode()
-    def determine_available_kv_cache_memory(self,
-                                            total_gpu_memory: int) -> float:
-        if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
-            # We still need a profile run, which compiles the model for
-            # max_num_batched_tokens.
-            self.model_runner.profile_run()
-
-            GiB = lambda b: b / GiB_bytes
-            msg = (
-                f"Initial free memory "
-                f"{GiB(self.baseline_snapshot.free_memory):.2f} "
-                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for "
-                "KV Cache as specified by kv_cache_memory_bytes config and "
-                "skipped memory profiling. This does not respect the "
-                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
-                "config when you want manual control of KV cache memory "
-                "size. 
If OOM'ed, check the difference of initial free " - "memory between the current run and the previous run " - "where kv_cache_memory_bytes is suggested and update it " - "correspondingly.") - logger.info(msg) - return self.cache_config.kv_cache_memory_bytes - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - with memory_profiling( - self.baseline_snapshot, - weights_memory=self.model_runner.model_memory_usage) as result: - self.model_runner.profile_run() - - self.non_torch_memory = result.non_torch_increase - self.peak_activation_memory = result.torch_peak_increase - - self._assert_memory_footprint_increased_during_profiling() - - self.requested_memory = total_gpu_memory * \ - self.cache_config.gpu_memory_utilization - - self.available_kv_cache_memory = (self.requested_memory - - result.non_kv_cache_memory) - - msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(self.requested_memory / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(self.available_kv_cache_memory / GiB_bytes):.2f}GiB.") - - logger.info(msg) - return self.available_kv_cache_memory - - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculates the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() - available_kv_cache_memory = self.determine_available_kv_cache_memory( - total_gpu_memory) - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = self.get_cache_block_size_bytes() - if cache_block_size == 0: - num_gpu_blocks = 0 - num_cpu_blocks = 0 - else: - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - - # Final cleanup - gc.collect() - - return num_gpu_blocks, num_cpu_blocks - - def _assert_memory_footprint_increased_during_profiling(self): - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - free_gpu_memory, total = torch.cuda.mem_get_info() - cuda_memory = total - free_gpu_memory - assert self.baseline_snapshot.cuda_memory < cuda_memory, ( - "Error in memory profiling. " - f"Initial used memory {self.baseline_snapshot.cuda_memory}, " - f"currently used memory {cuda_memory}. 
" - f"This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - - This also warms up the model, which may record CUDA graphs. - """ - raise_if_cache_size_invalid( - num_gpu_blocks, self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len, - self.parallel_config.pipeline_parallel_size) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") - else: - context = nullcontext() - with context: - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - - # Layer pairings for cross-layer KV sharing. - # If an Attention layer `layer_name` is in the keys of this dict, it - # means this layer will perform attention using the keys and values - # from the KV cache of `shared_kv_cache_layers[layer_name]`. - shared_kv_cache_layers: dict[str, str] = {} - - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - - for layer_name, attn_module in attn_layers.items(): - if (kv_tgt_layer := - attn_module.kv_sharing_target_layer_name) is not None: - # The layer doesn't need its own KV cache and will use that of - # the target layer. We skip creating a KVCacheSpec for it, so - # that KV cache management logic will act as this layer does - # not exist, and doesn't allocate KV cache for the layer. This - # enables the memory saving of cross-layer kv sharing, allowing - # a given amount of memory to accommodate longer context lengths - # or enable more requests to be processed simultaneously. - shared_kv_cache_layers[layer_name] = kv_tgt_layer - - bind_kv_cache(self.compilation_config.static_forward_context, - self.gpu_cache, shared_kv_cache_layers) - - def _warm_up_model(self) -> None: - # warm up sizes that are not in cudagraph capture sizes, - # but users still want to compile for better performance, - # e.g. for the max-num-batched token size in chunked prefill. - warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() - if not self.model_config.enforce_eager: - warmup_sizes = [ - x for x in warmup_sizes if x not in - self.vllm_config.compilation_config.cudagraph_capture_sizes - ] - for size in sorted(warmup_sizes, reverse=True): - logger.info("Compile and warming up model for size %d", size) - self.model_runner._dummy_run(size) - - cuda_graph_memory_bytes = 0 - if not self.model_config.enforce_eager: - cuda_graph_memory_bytes = self.model_runner.capture_model( - self.gpu_cache) - - if (self.cache_config.kv_cache_memory_bytes is None - and hasattr(self, "peak_activation_memory")): - # Suggests optimal kv cache memory size if we rely on - # memory_profiling to guess the kv cache memory size which - # provides peak_activation_memory and a few other memory - # consumption. 
`memory_profiling` does not consider - # CUDAGraph memory size and may not utilize all gpu memory. - # Users may want fine-grained control to specify kv cache - # memory size. - GiB = lambda b: round(b / GiB_bytes, 2) - non_kv_cache_memory = (self.model_runner.model_memory_usage + - self.peak_activation_memory + - self.non_torch_memory + - cuda_graph_memory_bytes) - - # empirically observed that the memory profiling may - # slightly underestimate the memory consumption. - # So leave a small buffer (=150MiB) to avoid OOM. - redundancy_buffer_memory = 150 * (1 << 20) - kv_cache_memory_bytes_to_gpu_limit = ( - self.baseline_snapshot.free_memory - non_kv_cache_memory - - redundancy_buffer_memory) - kv_cache_memory_bytes_to_requested_limit = ( - int(self.requested_memory) - non_kv_cache_memory - - redundancy_buffer_memory) - - msg = ( - f"Free memory on device " - f"({GiB(self.baseline_snapshot.free_memory)}/" - f"{GiB(self.baseline_snapshot.total_memory)} GiB) on startup. " - f"Desired GPU memory utilization is " - f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). " - f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " - f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " - f"for peak activation, {GiB(self.non_torch_memory)} GiB " - f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " - f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " - f"config with `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_requested_limit}` to fit into " - f"requested memory, or `--kv-cache-memory=" - f"{kv_cache_memory_bytes_to_gpu_limit}` to fully " - f"utilize gpu memory. Current kv cache memory in use is " - f"{int(self.available_kv_cache_memory)} bytes.") - logger.info(msg) - - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - # Issue cache operations. 
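-        # Swap-in/out move KV blocks between CPU and GPU using the (src, dst)
-        # pairs built in prepare_worker_input; block copies stay on-device
-        # and are applied directly by CUDA kernels.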
- if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. - """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) - - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - -def init_worker_distributed_environment( - vllm_config: VllmConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - parallel_config = vllm_config.parallel_config - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, - current_platform.dist_backend) - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.decode_context_parallel_size) - - ensure_kv_transfer_initialized(vllm_config) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. - if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.") - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len, pipeline_parallel_size) -> None: - if is_attention_free and num_gpu_blocks != 0: - raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks} " - "blocks are allocated.") - if not is_attention_free and num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) - if not is_attention_free and max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") From 62b38dc8322f9f31fff7c680028b0b88306e1c39 Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Sat, 20 Sep 2025 20:29:12 -0700 Subject: [PATCH 519/584] [Doc] improve test-pipeline.yaml documentation (#25305) Signed-off-by: Huamin Li <3ericli@gmail.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9d38e571324b4..fe4796b35786c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -6,24 +6,28 @@ # to generate the final pipeline yaml file. # Documentation -# label(str): the name of the test. emoji allowed. -# fast_check(bool): whether to run this on each commit on fastcheck pipeline. -# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. 
-# fast_check_only(bool): run this test on fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. +# label(str): the name of the test. emojis allowed. +# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. +# fast_check_only(bool): run this test on the fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. +# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). # command(str): the single command to run for tests. incompatible with commands. -# commands(list): the list of commands to run for test. incompatbile with command. -# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] -# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 -# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. -# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, -# in this case, commands must be specified. the first command runs on first host, the second +# commands(list): the list of commands to run for the test. incompatible with command. +# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] +# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 +# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. +# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, +# in this case, commands must be specified. the first command runs on the first host, the second # command runs on the second host. -# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests -# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run. +# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. +# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB +# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. +# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. # When adding a test -# - If the test belong to an existing group, add it there +# - If the test belongs to an existing group, add it there # - If the test is short, add to any existing step # - If the test takes more than 10min, then it is okay to create a new step. # Note that all steps execute in parallel. 
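
As a concrete illustration of the `parallelism` field documented above: a step
with `parallelism: N` is launched N times, and each job can carve out its own
shard of the work from the $$BUILDKITE_PARALLEL_JOB and
$$BUILDKITE_PARALLEL_JOB_COUNT variables. A minimal sketch (illustrative only;
`select_shard` is a made-up helper, not code from this repo):

    import os

    def select_shard(tests: list[str]) -> list[str]:
        # Deterministically split `tests` across the parallel jobs of one
        # step; every job sees the same ordering, so the shards are disjoint
        # and together cover the full list.
        job = int(os.environ.get("BUILDKITE_PARALLEL_JOB", "0"))
        job_count = int(os.environ.get("BUILDKITE_PARALLEL_JOB_COUNT", "1"))
        return [t for i, t in enumerate(tests) if i % job_count == job]

In the actual pipeline, the step's commands consume these variables directly.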
From 1cd885bd540c3765e4c2d378b3167102dfa26ab5 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 20:49:09 -0700 Subject: [PATCH 520/584] [V0 Deprecation] Remove V0 model runner base & simplify worker base (#25328) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/attention/backends/abstract.py | 15 +- vllm/attention/backends/utils.py | 8 +- vllm/worker/model_runner_base.py | 307 ---------------------- vllm/worker/worker_base.py | 387 +--------------------------- 4 files changed, 11 insertions(+), 706 deletions(-) delete mode 100644 vllm/worker/model_runner_base.py diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index dfde67e1713c5..ab7ef2112b083 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -4,19 +4,14 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, - Protocol, Set, Tuple, Type, TypeVar) +from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple, + Type, TypeVar) import torch from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey from vllm.multimodal import MultiModalPlaceholderMap -if TYPE_CHECKING: - from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerInputBuilderBase) - class AttentionType: """ @@ -170,7 +165,7 @@ class AttentionState(ABC, Generic[T]): lifetime of the model runner.""" @abstractmethod - def __init__(self, runner: "ModelRunnerBase"): + def __init__(self, runner: Any): ... @abstractmethod @@ -210,7 +205,7 @@ class AttentionState(ABC, Generic[T]): ... @abstractmethod - def begin_forward(self, model_input: "ModelRunnerInputBase") -> None: + def begin_forward(self, model_input) -> None: """Prepare state for forward pass.""" ... 
@@ -219,7 +214,7 @@ class AttentionMetadataBuilder(ABC, Generic[T]): """Abstract class for attention metadata builders.""" @abstractmethod - def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None: + def __init__(self, input_builder) -> None: """Create the builder, remember some configuration and parameters.""" raise NotImplementedError diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 289cfa2177438..b28e6a4237cb6 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -5,8 +5,7 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import numpy as np import torch @@ -21,9 +20,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) -if TYPE_CHECKING: - from vllm.worker.model_runner_base import ModelRunnerBase - # Error string(s) for encoder/decoder # unsupported attention scenarios STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported " @@ -286,7 +282,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): class CommonAttentionState(AttentionState): - def __init__(self, runner: "ModelRunnerBase"): + def __init__(self, runner): self.runner = runner self._is_graph_capturing = False diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py deleted file mode 100644 index 1008b743619a4..0000000000000 --- a/vllm/worker/model_runner_base.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.interfaces import supports_transcription -from vllm.model_executor.models.interfaces_base import is_text_generation_model -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.tasks import GenerationTask, SupportedTask - -if TYPE_CHECKING: - from vllm.attention import AttentionMetadata - from vllm.attention.backends.abstract import AttentionBackend - from vllm.model_executor import SamplingMetadata - -logger = init_logger(__name__) - -T = TypeVar('T', bound="BroadcastableModelInput") - - -def _add_attn_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - attn_metadata: Optional["AttentionMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - AttentionMetadata fields. - """ - if attn_metadata is not None: - tensor_dict.update(attn_metadata.asdict_zerocopy()) - - -def _init_attn_metadata_from_tensor_dict( - attn_backend: "AttentionBackend", - tensor_dict: Dict[str, Any], -) -> Dict[str, Any]: - """ - Helper method to initialize AttentionMetadata based on an - AttentionBackend and broadcastable AttentionMetadata fields. - """ - # Extract the fields used to create AttentionMetadata. 
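-    # input_positions is read without popping so that the same entry remains
-    # available when the rest of the model input is rebuilt from tensor_dict.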
- valid_attn_kwargs = {} - for field in dataclasses.fields(attn_backend.get_metadata_cls()): - if field.name in tensor_dict: - if field.name == "input_positions": - valid_attn_kwargs[field.name] = tensor_dict[field.name] - else: - valid_attn_kwargs[field.name] = tensor_dict.pop(field.name) - - attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) - tensor_dict["attn_metadata"] = attn_metadata - return tensor_dict - - -def _init_sampling_metadata_from_tensor_dict( # type: ignore - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize SamplingMetadata based on broadcastable - SamplingMetadata fields. - """ - from vllm.model_executor import SamplingMetadata - - selected_token_indices = tensor_dict.pop("selected_token_indices", None) - # An empty SamplingMetadata to signal that the worker should skip - # sampling. - if selected_token_indices is not None: - tensor_dict["sampling_metadata"] = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - return tensor_dict - - -def _add_sampling_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - sampling_metadata: Optional["SamplingMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - SamplingMetadata fields. - """ - if sampling_metadata is not None: - tensor_dict["selected_token_indices"] = ( - sampling_metadata.selected_token_indices) - - -def _init_frozen_model_input_from_tensor_dict( - frozen_model_input_cls: Type["ModelRunnerInputBase"], - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize a frozen ModelInput based on broadcastable - """ - valid_tensor_kwargs = {} - for field in dataclasses.fields(frozen_model_input_cls): - val = tensor_dict.pop(field.name, None) - if val is not None: - valid_tensor_kwargs[field.name] = val - - frozen_model_input = frozen_model_input_cls(**valid_tensor_kwargs) - tensor_dict["frozen_model_input"] = frozen_model_input - return tensor_dict - - -class BroadcastableModelInput(ABC): - - @abstractmethod - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - """ - Extract broadcastable fields. Override for fields that require some - custom deserialization. - """ - raise NotImplementedError - - @classmethod - @abstractmethod - def from_broadcasted_tensor_dict( - cls: Type[T], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> T: - """ - Pop fields from the given tensor_dict and populate a new instance of - BroadcastableModelInput. - """ - raise NotImplementedError - - -@dataclasses.dataclass(frozen=True) -class ModelRunnerInputBase(BroadcastableModelInput): - """Local inputs to each worker's model runner. May contain - device-specific data. Different worker backends may have different methods - of converting from the global ExecuteModelRequest produced by the LLM - engine to the worker-local ModelRunnerInputBase objects. - - Model runners that support multi-GPU execution should define a - ModelRunnerInputBase subclass, add their required fields, and specify how to - serialize/deserialize a ModelInput for broadcast between workers. - """ - pass - - -class ModelRunnerInputBuilderBase(ABC, Generic[T]): - """A builder to create ModelRunnerInputBase objects. 
- """ - - @abstractmethod - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - raise NotImplementedError - - @abstractmethod - def add_seq_group(self, seq_group_metadata): - """TBA""" - raise NotImplementedError - - @abstractmethod - def build(self, *args, **kwargs) -> T: - """Build metadata with on-device tensors.""" - raise NotImplementedError - - -class ModelRunnerBase(ABC, Generic[T]): - """ - Model runner interface that abstracts a particular hardware and/or type of - model. Model execution may communicate data with model runners in other - processes, but it should not include control plane metadata communication. - - Each ModelRunnerBase subclass should define a corresponding - ModelRunnerInputBase subclass. - """ - - def __init__( - self, - vllm_config: VllmConfig, - ) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.observability_config = vllm_config.observability_config - - # Map of request_id -> generator used for seeded random sampling - generators: Dict[str, torch.Generator] = {} - - @abstractmethod - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> T: - """ - Make an instance of a ModelRunnerInputBase from the broadcasted tensor - dict. - """ - raise NotImplementedError - - @abstractmethod - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> T: - """ - Prepare the inputs to ModelRunnerBase.execute_model from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def get_model(self) -> nn.Module: - raise NotImplementedError - - def get_supported_generation_tasks(self) -> list[GenerationTask]: - model = self.get_model() - supported_tasks = list[GenerationTask]() - - if is_text_generation_model(model): - supported_tasks.append("generate") - - if supports_transcription(model): - if model.supports_transcription_only: - return ["transcription"] - - supported_tasks.append("transcription") - - return supported_tasks - - def get_supported_tasks(self) -> tuple[SupportedTask, ...]: - tasks = list[SupportedTask]() - - if self.model_config.runner_type == "generate": - tasks.extend(self.get_supported_generation_tasks()) - - return tuple(tasks) - - def execute_model( - self, - model_input: T, - kv_caches: Optional[List[torch.Tensor]], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """ - Execute the model on the given input. - """ - raise NotImplementedError - - def get_generators(self, finished_request_ids: Optional[List[str]] = None): - """ - Return dict of per-request generators used for random sampling. 
- """ - - # Clean up generators from completed requests - if finished_request_ids: - for request_id in finished_request_ids: - self.generators.pop(request_id, None) - - return self.generators - - -class ModelRunnerWrapperBase: - """ - The whole point of this class is to lazily initialize the model_runner. - """ - - def __init__( - self, - model_runner: ModelRunnerBase, - ) -> None: - self.model_runner: ModelRunnerBase = model_runner - - def __getattr__(self, attr): - return getattr(self.model_runner, attr) - - -class InputProcessingError(Exception): - """This exception is raised when an error occurs preparing the inputs for - a single sequence group. - This allows the engine to gracefully handle errors with a single sequence - group without having to fail the entire batch. - """ - - def __init__(self, request_id, message): - """request_id is the id of the offending sequence group""" - self.request_id = request_id - self.message = message - super().__init__(self.message) - - def __str__(self): - return "Failed to prepare inputs for sequence group with request id: " \ - f"{self.request_id}, Error: {self.message}" diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index d0a56f6ff4637..eaab976bf7f75 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,31 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses import os -import time -from abc import abstractmethod -from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, - TypeVar, Union) +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, + Union) import cloudpickle -import torch import torch.nn as nn -from vllm.config import (ObservabilityConfig, VllmConfig, - set_current_vllm_config) -from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group +from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, IntermediateTensors +from vllm.sequence import ExecuteModelRequest from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, run_method, update_environment_variables, warn_for_unimplemented_methods) -from vllm.worker.model_runner_base import (BroadcastableModelInput, - ModelRunnerBase, - ModelRunnerInputBase) logger = init_logger(__name__) @@ -141,356 +132,6 @@ class WorkerBase: return -class DelegateWorkerBase(WorkerBase): - """ - A class that delegates all methods to another WorkerBase instance. This is - useful for creating a WorkerBase that wraps another WorkerBase instance, - e.g. speculative decoding. 
- """ - worker: WorkerBase - - def __init__( - self, - *args, - **kwargs, - ) -> None: - vllm_config: VllmConfig = kwargs.get("vllm_config") - cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls) - self.worker = cls(*args, **kwargs) - - def init_device(self) -> None: - self.worker.init_device() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - return self.worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def load_model(self) -> None: - """Load model onto target device.""" - self.worker.load_model() - - def get_model(self) -> nn.Module: - return self.worker.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - return self.worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - return self.worker.get_cache_block_size_bytes() - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.worker.list_loras() - - def __getattr__(self, attr): - return getattr(self.worker, attr) - - -class LoRANotSupportedWorkerBase(WorkerBase): - """Partial implementation of WorkerBase that raises exceptions when LoRA - methods are invoked. - """ - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def remove_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def pin_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def list_loras(self) -> Set[int]: - raise ValueError(f"{type(self)} does not support LoRA") - - -@dataclasses.dataclass(frozen=True) -class WorkerInput: - """Local inputs to each worker. May contain device-specific data. These - fields should be broadcastable to other workers. - """ - - num_seq_groups: Optional[int] = None - blocks_to_swap_in: Optional[torch.Tensor] = None - blocks_to_swap_out: Optional[torch.Tensor] = None - blocks_to_copy: Optional[torch.Tensor] = None - virtual_engine: int = 0 - num_steps: int = 1 - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["WorkerInput"], - tensor_dict: Dict[str, Any], - ) -> "WorkerInput": - """ - Pop fields from the given tensor_dict and populate a new instance of - WorkerInput. - """ - return cls( - num_seq_groups=tensor_dict.pop("num_seq_groups"), - blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), - blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), - blocks_to_copy=tensor_dict.pop("blocks_to_copy"), - virtual_engine=tensor_dict["virtual_engine"], - num_steps=tensor_dict.pop("num_steps"), - ) - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - """ - Extract broadcastable fields. 
- """ - tensor_dict = { - "num_seq_groups": self.num_seq_groups, - "blocks_to_swap_in": self.blocks_to_swap_in, - "blocks_to_swap_out": self.blocks_to_swap_out, - "blocks_to_copy": self.blocks_to_copy, - "virtual_engine": self.virtual_engine, - "num_steps": self.num_steps, - } - - return tensor_dict - - -class LocalOrDistributedWorkerBase(WorkerBase): - """ - Partial implementation of WorkerBase that has a default `execute_model` - definition to perform metadata transfer between workers when in distributed - mode. Subclasses of this interface should use model runners that inherit - from ModelRunnerBase, and should only need to implement worker-local logic. - If custom control plane logic is needed to transfer metadata, or if the - model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. - """ - is_driver_worker: bool - model_runner: ModelRunnerBase - observability_config: Optional[ObservabilityConfig] = None - - @property - @abstractmethod - def do_metadata_broadcast(self) -> bool: - """ - Used by the default `execute_model` to check whether broadcast is - needed to transfer request inputs from the driver worker to other - workers in the TP group. If WorkerBase subclass only supports - single-worker execution, then this method should return False. - """ - raise NotImplementedError - - @property - @abstractmethod - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - """ - Gets the list of kv caches to pass to the worker's model runner. Each - element in the list is a kv cache corresponding to a particular virtual - engine (PP stream). Used by the default `execute_model`. If the worker's - model runner does not follow the ModelRunnerBase interface, then inherit - from WorkerBase instead. - """ - raise NotImplementedError - - @abstractmethod - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - """ - Prepare the inputs to WorkerBase.execute_worker from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def execute_worker(self, worker_input: WorkerInput) -> None: - """ - Process an execution request. - """ - raise NotImplementedError - - def _get_worker_input_from_broadcast( - self - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ Get the worker input from the broadcasted tensor dict. """ - assert self.do_metadata_broadcast - assert not self.is_driver_worker - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) - model_input = ( - self.model_runner.make_model_input_from_broadcasted_tensor_dict( - broadcast_data)) - - kwargs = extract_previous_hidden_states(broadcast_data) - - return model_input, worker_input, kwargs - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ Get the driver input and broadcast it to other workers. 
""" - assert self.is_driver_worker - - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - kwargs = extract_previous_hidden_states(execute_model_req) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_data.update(kwargs) - broadcast_tensor_dict(broadcast_data, src=0) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( # type: ignore - model_input, - async_callback=execute_model_req.async_callback) - - return model_input, worker_input, kwargs - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ - Prepare the inputs to ModelRunner and workers. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - return self._get_driver_input_and_broadcast(execute_model_req) - else: - return self._get_worker_input_from_broadcast() - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" - start_time = time.perf_counter() - - inputs = self.prepare_input(execute_model_req) - if inputs is None: - return None - - model_input, worker_input, kwargs = inputs - num_steps = worker_input.num_steps - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. 
- if worker_input.num_seq_groups == 0: - return [] - - intermediate_tensors = None - orig_model_execute_time = 0.0 - if not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - orig_model_execute_time = intermediate_tensors.tensors.get( - "model_execute_time", torch.tensor(0)).item() - - output = self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - num_steps=num_steps, - **kwargs, - ) - - model_execute_time = time.perf_counter() - start_time - if not get_pp_group().is_last_rank: - # output is IntermediateTensors - assert isinstance(output, IntermediateTensors) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - output.tensors["model_execute_time"] = torch.tensor( - model_execute_time + orig_model_execute_time) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) - return [None] - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time - and output is not None): - for o in output: - o.model_execute_time = (orig_model_execute_time + - model_execute_time) - - # output is List[SamplerOutput] - return output - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[List[SamplerOutput]]: - """ - Execute model in Single Program Multiple Data (SPMD) fashion. - All workers take the same request, prepare the input and - execute the model. - """ - assert execute_model_req is not None, ( - "_execute_model_spmd() requires each worker to take in an " - "ExecuteModelRequest") - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list)) - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. - if worker_input.num_seq_groups == 0: - return [] - - kwargs = extract_previous_hidden_states(execute_model_req) - - return self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - **kwargs, - ) - - class WorkerWrapperBase: """ This class represents one process in an executor/engine. It is responsible @@ -636,23 +277,3 @@ class WorkerWrapperBase: def __getattr__(self, attr): return getattr(self.worker, attr) - - -def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ - Dict[str, torch.Tensor]: - """If data contains previous_hidden_states, extract it. This returns a dict - which can be used directly as additional kwargs in any following - execute_model calls. This is used in draft models like EAGLE.""" - output = {} - - # When called from non-driver worker, data is dict but when called from - # driver worker, data is ExecuteModelRequest. 
- if isinstance(data, dict): - if "previous_hidden_states" in data: - output["previous_hidden_states"] = data["previous_hidden_states"] - elif data.previous_hidden_states is not None: - output["previous_hidden_states"] = data.previous_hidden_states\ - .hidden_states - - return output From 035fd2bd2cd2fb70f5834f5ca6c2ea30cdae9187 Mon Sep 17 00:00:00 2001 From: Wenlong Wang <wangwenlong2755@gmail.com> Date: Sat, 20 Sep 2025 20:55:10 -0700 Subject: [PATCH 521/584] [Multi Modal][Performance] Fused Q,K's apply_rope in more models (#25005) Signed-off-by: wwl2755 <wangwenlong2755@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/ernie45_vl.py | 15 +++++++++------ vllm/model_executor/models/glm4_1v.py | 17 ++++++++++------- vllm/model_executor/models/qwen2_vl.py | 16 ++++++++++------ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 3396c67f42b7b..0d4aced93ca1c 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -234,8 +234,9 @@ class Ernie4_5_VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: # from vllm_flash_attn.flash_attn_interface import ( @@ -261,8 +262,8 @@ class Ernie4_5_VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -281,6 +282,8 @@ class Ernie4_5_VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -291,8 +294,8 @@ class Ernie4_5_VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index cbf327ce02b6b..308b0cb602bc9 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -315,8 +315,10 @@ class Glm4vVisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.attn_backend == _Backend.FLASH_ATTN: # from vllm_flash_attn.flash_attn_interface import ( @@ -341,8 +343,8 @@ class Glm4vVisionAttention(nn.Module): ) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -361,6 +363,8 @@ class Glm4vVisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -371,9 +375,8 @@ class Glm4vVisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7f361678ba72e..dd4e7731e0b08 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -377,8 +377,10 @@ class Qwen2VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: if self.attn_backend == _Backend.ROCM_AITER_FA: @@ -402,8 +404,8 @@ class Qwen2VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. 
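The change repeated across these vision attention modules trades two rotary-embedding launches for one: Q and K are concatenated along the batch dimension, rotated together, then split back. Because the rotation acts independently along that dimension, the fused result is bit-identical; the toy rotate() below is a hypothetical stand-in for apply_rotary_pos_emb_vision:

import torch

def rotate(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in: any op applied independently along dim 0
    # satisfies the identity demonstrated here.
    return torch.flip(x, dims=[-1])

q = torch.randn(2, 4, 8)
k = torch.randn(2, 4, 8)

q_ref, k_ref = rotate(q), rotate(k)                 # two separate calls
qk_rotated = rotate(torch.cat([q, k], dim=0))       # one fused call
q_fused, k_fused = torch.chunk(qk_rotated, 2, dim=0)

assert torch.equal(q_ref, q_fused) and torch.equal(k_ref, k_fused)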
outputs = [] @@ -422,6 +424,8 @@ class Qwen2VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -432,8 +436,8 @@ class Qwen2VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output From 12dbd834cf33b539a50d65516e44327caff9d824 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 21:10:48 -0700 Subject: [PATCH 522/584] [V0 Deprecation] Remove from_seq_group methods (#25330) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/multimodal/base.py | 122 +------------------------ vllm/outputs.py | 195 +--------------------------------------- 2 files changed, 2 insertions(+), 315 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17b47..e0edb3e883ed6 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,14 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from collections.abc import Sequence from pathlib import Path -from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar - -if TYPE_CHECKING: - from vllm.sequence import SequenceGroupMetadata - -from .inputs import MultiModalKwargs, PlaceholderRange +from typing import Generic, NamedTuple, TypeVar _T = TypeVar("_T") @@ -53,120 +47,6 @@ class MultiModalPlaceholderMap: self.dest_ranges = [] self.dest_len = 0 - @classmethod - def from_seq_group( - cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: - """ - Returns the multi-modal items that intersect with the portion of a - prompt (``seq_group``) represented by ``positions``, as well as a - ``MultiModalPlaceholderMap`` that relates the multi-modal embedding - vectors to their corresponding placeholders. - - Examples: - - ``` - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| - - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... | - - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... 
| - - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] - - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| - - images = [] - src_ranges = [] - dest_ranges = [] - ``` - """ - seq_mm_data = seq_group.multi_modal_data - seq_mm_placeholders = seq_group.multi_modal_placeholders - - if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} - - placeholder_maps = dict[str, MultiModalPlaceholderMap]() - - for modality, placeholders in seq_mm_placeholders.items(): - placeholder_map = MultiModalPlaceholderMap() - - if positions: - placeholder_map.append_items_from_seq_group( - positions, - # Dummy, since we don't care about intersecting items - [None] * len(placeholders), - placeholders, - ) - - placeholder_maps[modality] = placeholder_map - - return seq_mm_data, placeholder_maps - - def append_items_from_seq_group( - self, - positions: range, - multi_modal_items: list[_T], - multi_modal_placeholders: Sequence[PlaceholderRange], - ) -> list[_T]: - """ - Adds the multi-modal items that intersect ```positions`` to this - placeholder map and returns the intersecting items. - """ - intersecting_items = [] - - if len(multi_modal_items) != len(multi_modal_placeholders): - raise ValueError( - "Multi-modal placeholders and items must have the same length." - ) - for placeholder_dict, mm_item in zip(multi_modal_placeholders, - multi_modal_items): - placeholder = range( - placeholder_dict.offset, - placeholder_dict.offset + placeholder_dict.length, - ) - intersection = range( - max(positions.start, placeholder.start), - min(positions.stop, placeholder.stop), - ) - - if not intersection: - # Skip this multi-modal item. - continue - - token_embedding_range = range( - intersection.start - positions.start, - intersection.stop - positions.start, - ) - - multimodal_embedding_range = range( - intersection.start - placeholder.start + self.src_len, - intersection.stop - placeholder.start + self.src_len, - ) - - intersecting_items.append(mm_item) - self.dest_ranges.append(token_embedding_range) - self.src_ranges.append(multimodal_embedding_range) - self.src_len += len(placeholder) - - self.dest_len += len(positions) - return intersecting_items - def extend(self, other: "MultiModalPlaceholderMap"): """ Adds the placeholders from another ``MultiModalPlaceholderMap`` to this diff --git a/vllm/outputs.py b/vllm/outputs.py index 64bcfd472f2ad..4d8206bb2d830 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time from collections.abc import MutableSequence from collections.abc import Sequence as GenericSequence from dataclasses import dataclass @@ -14,9 +13,7 @@ from vllm.logger import init_logger from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind -from vllm.sequence import (RequestMetrics, SequenceGroup, SequenceGroupBase, - SequenceStatus) +from vllm.sequence import RequestMetrics logger = init_logger(__name__) @@ -171,170 +168,6 @@ class RequestOutput: else: self.outputs.append(next_completion) - @classmethod - def from_seq_group( - cls, seq_group: SequenceGroup, use_cache: bool, - seq_id_to_seq_group: dict[str, SequenceGroupBase] - ) -> Optional["RequestOutput"]: - finished = seq_group.is_finished() - - if seq_group.request_id in seq_id_to_seq_group: - group: 
SequenceGroupBase = seq_id_to_seq_group[ - seq_group.request_id] - assembled_seq_group = group.maybe_assemble_group(seq_group) - if finished: - group.finish_seq(seq_group) - if assembled_seq_group is None: - return None - - # clear finished seq in seq_id_to_seq_group - if len(group.to_be_finished) == 0: - for sub_request_id in list(group.seq_id_to_index.keys()): - if sub_request_id in seq_id_to_seq_group: - del seq_id_to_seq_group[sub_request_id] - - return cls.from_seq_group(assembled_seq_group, use_cache, - seq_id_to_seq_group) - - sampling_params = seq_group.sampling_params - if sampling_params is None: - raise ValueError( - "Sampling parameters are missing for a CompletionRequest.") - - if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and ( - not finished): - return None - - # Init cache (if needed) - if use_cache and seq_group.cached_request_output is None: - seq_group.cached_request_output = RequestOutput( # type: ignore - request_id="", - prompt=None, - prompt_token_ids=[], - prompt_logprobs=None, - outputs=[], - finished=False) - - top_n_seqs = seq_group.get_seqs() - - # Create the outputs. - # NOTE: We need omit logprobs here explicitly because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - include_logprobs = sampling_params.logprobs is not None - text_buffer_length = sampling_params.output_text_buffer_length - delta = sampling_params.output_kind == RequestOutputKind.DELTA - - outputs = [] - include_prompt = True - # num_cached_tokens should be the same for all the sequences - num_cached_tokens = None - for i, seq in enumerate(top_n_seqs): - output_text = seq.get_output_text_to_return( - text_buffer_length, delta) - - output_token_ids = seq.get_output_token_ids_to_return(delta) - num_output_tokens = 1 if isinstance(output_token_ids, - int) else len(output_token_ids) - num_cached_tokens = seq.data.get_num_cached_tokens() - - output_logprobs = seq.output_logprobs if include_logprobs else None - - if delta: - # Slice logprobs delta if applicable - if output_logprobs: - # num_output_tokens can be 0 when n > 1 and request finishes - # before the others - if num_output_tokens > 0: - output_logprobs = output_logprobs[-num_output_tokens:] - else: - output_logprobs = None - # Don't include prompt if this is after the first output - # containing decode token ids - if include_prompt and seq.get_output_len() > num_output_tokens: - include_prompt = False - - if use_cache: - # Get cached output object - cached_outputs = seq_group.cached_request_output.outputs # type: ignore - if i >= len(cached_outputs): - cached_outputs.append( - CompletionOutput(index=i, - text="", - token_ids=[], - cumulative_logprob=None, - logprobs=None, - finish_reason=None, - stop_reason=None)) - output = cached_outputs[i] - - # Init cached output object - assert output.index == i - output.text = output_text - - if isinstance(output_token_ids, int): - output.token_ids.clear() - output.token_ids.append(output_token_ids) - else: - output.token_ids = output_token_ids - - output.cumulative_logprob = seq.get_cumulative_logprob() \ - if include_logprobs else None - output.logprobs = output_logprobs - output.finish_reason = SequenceStatus.get_finished_reason( - seq.status) - output.stop_reason = seq.stop_reason - - else: - output = CompletionOutput( - top_n_seqs.index(seq), output_text, [output_token_ids] - if isinstance(output_token_ids, int) else output_token_ids, - seq.get_cumulative_logprob() if include_logprobs else None, - output_logprobs, - 
SequenceStatus.get_finished_reason(seq.status), - seq.stop_reason) - - outputs.append(output) - - # Every sequence in the sequence group should have the same prompt. - if include_prompt: - prompt = seq_group.prompt - prompt_token_ids = seq_group.prompt_token_ids - encoder_prompt = seq_group.encoder_prompt - encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids - prompt_logprobs = seq_group.prompt_logprobs - else: - prompt = None - prompt_token_ids = None - encoder_prompt = None - encoder_prompt_token_ids = None - prompt_logprobs = None - finished_time = time.time() if finished else None - seq_group.set_finished_time(finished_time) - - init_kwargs = { - "request_id": seq_group.request_id, - "prompt": prompt, - "prompt_token_ids": prompt_token_ids, - "prompt_logprobs": prompt_logprobs, - "outputs": outputs, - "finished": finished, - "metrics": seq_group.metrics, - "lora_request": seq_group.lora_request, - "encoder_prompt": encoder_prompt, - "encoder_prompt_token_ids": encoder_prompt_token_ids, - "num_cached_tokens": num_cached_tokens, - "multi_modal_placeholders": seq_group.multi_modal_placeholders - } - - if use_cache: - request_output = seq_group.cached_request_output - request_output.__init__(**init_kwargs) # type: ignore - else: - request_output = cls(**init_kwargs) # type: ignore - - return request_output - def __repr__(self) -> str: return (f"RequestOutput(request_id={self.request_id}, " f"prompt={self.prompt!r}, " @@ -371,19 +204,6 @@ class PoolingRequestOutput(Generic[_O]): self.finished = finished self.outputs = outputs - @staticmethod - def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": - pooled_data = seq_group.pooled_data - assert pooled_data is not None - - data = pooled_data.to(dtype=torch.float32, device="cpu") - output = PoolingOutput(data) - prompt_token_ids = seq_group.prompt_token_ids - finished = seq_group.is_finished() - - return PoolingRequestOutput(seq_group.request_id, output, - prompt_token_ids, finished) - def __repr__(self): return (f"{type(self).__name__}(request_id={self.request_id!r}, " f"outputs={self.outputs!r}, " @@ -391,19 +211,6 @@ class PoolingRequestOutput(Generic[_O]): f"finished={self.finished})") -class RequestOutputFactory: - - @staticmethod - def create(seq_group: SequenceGroup, - seq_id_to_seq_group: dict[str, SequenceGroupBase], - use_cache: bool = False): - if seq_group.pooled_data is not None: - return PoolingRequestOutput.from_seq_group(seq_group) - else: - return RequestOutput.from_seq_group(seq_group, use_cache, - seq_id_to_seq_group) - - @dataclass class EmbeddingOutput: """The output data of one embedding output of a request. 
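The MultiModalPlaceholderMap logic removed earlier in this patch boils down to clipping each placeholder range against the positions scheduled in the current step, then recording where the overlap lands in token space and in multimodal-embedding space. A sketch of that arithmetic, ignoring the running src_len/dest_len offsets the real map accumulates:

def intersect(positions: range, placeholder: range):
    lo = max(positions.start, placeholder.start)
    hi = min(positions.stop, placeholder.stop)
    if lo >= hi:
        return None  # no overlap: the multi-modal item is skipped
    token_range = range(lo - positions.start, hi - positions.start)
    embedding_range = range(lo - placeholder.start, hi - placeholder.start)
    return token_range, embedding_range

# A placeholder occupying positions 4..8, with this step covering 6..12:
token_range, embedding_range = intersect(range(6, 12), range(4, 8))
assert token_range == range(0, 2)
assert embedding_range == range(2, 4)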
From 7ed82d1974837957a3bfb6d576b9cffba24d31ae Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 20 Sep 2025 21:26:35 -0700 Subject: [PATCH 523/584] [V0 Deprecation] Remove V0 MP executor (#25329) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/executor/mp_distributed_executor.py | 244 -------------------- vllm/executor/multiproc_worker_utils.py | 279 ----------------------- vllm/v1/executor/multiproc_executor.py | 40 +++- 3 files changed, 33 insertions(+), 530 deletions(-) delete mode 100644 vllm/executor/mp_distributed_executor.py delete mode 100644 vllm/executor/multiproc_worker_utils.py diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py deleted file mode 100644 index 136dca54e6e52..0000000000000 --- a/vllm/executor/mp_distributed_executor.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, List, Optional, Union - -import cloudpickle - -from vllm.executor.executor_base import DistributedExecutorBase -from vllm.executor.multiproc_worker_utils import ( - ProcessWorkerWrapper, ResultHandler, WorkerMonitor, - set_multiprocessing_worker_envs) -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, - make_async, run_method, update_environment_variables) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class MultiprocessingDistributedExecutor(DistributedExecutorBase): - """Python multiprocessing-based distributed executor""" - - uses_ray: bool = False - - def _check_cuda(self) -> None: - """Check that the number of GPUs is sufficient for the parallel - configuration. Separate from _init_executor to reduce the number of - indented blocks. - """ - parallel_config = self.parallel_config - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - if tensor_parallel_size > cuda_device_count: - raise RuntimeError( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - if world_size > cuda_device_count: - raise RuntimeError( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - def _init_executor(self) -> None: - - from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - self._check_cuda() - - # Create the parallel GPU workers. - world_size = self.parallel_config.world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - - # Set multiprocessing envs that are common to V0 and V1 - set_multiprocessing_worker_envs(self.parallel_config) - - # Multiprocessing-based executor does not support multi-node setting. - # Since it only works for single node, we can use the loopback address - # 127.0.0.1 for communication. 
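The executor being deleted here partitioned ranks the way the loop just below does: global rank 0 is the driver, every other rank divisible by the tensor-parallel size drives its own TP group, and the remaining ranks only ever receive broadcasts. A compact restatement of that classification:

def classify(rank: int, tensor_parallel_size: int) -> str:
    if rank == 0:
        return "driver"
    if rank % tensor_parallel_size == 0:
        return "tp_driver"
    return "non_driver"

assert [classify(r, 2) for r in range(4)] == [
    "driver", "non_driver", "tp_driver", "non_driver"]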
- distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) - - self.workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[ProcessWorkerWrapper] = [] - - if world_size == 1: - self.worker_monitor = None - else: - result_handler = ResultHandler() - for rank in range(1, world_size): - worker = ProcessWorkerWrapper(result_handler, - WorkerWrapperBase, - self.vllm_config, rank) - self.workers.append(worker) - if rank % tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) - else: - self.non_driver_workers.append(worker) - - self.worker_monitor = WorkerMonitor(self.workers, result_handler) - result_handler.start() - self.worker_monitor.start() - - # Set up signal handlers to shut down the executor cleanly - # sometimes gc does not work well - - self.driver_worker = WorkerWrapperBase(self.vllm_config, 0) - - all_kwargs = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - for i in range(world_size): - local_rank = i - rank = i - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=(not self.parallel_config) - or (rank % self.parallel_config.tensor_parallel_size == 0), - ) - all_kwargs.append(kwargs) - self._run_workers("init_worker", all_kwargs) - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - self.driver_exec_model = make_async(self.driver_worker.execute_model) - self.pp_locks: Optional[List[asyncio.Lock]] = None - - def shutdown(self): - if (worker_monitor := getattr(self, "worker_monitor", - None)) is not None: - worker_monitor.close() - - def _driver_execute_model( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - return self.driver_worker.execute_model(execute_model_req) - - def _run_workers( - self, - method: Union[str, Callable], - *args, - async_run_tensor_parallel_workers_only: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> List[Any]: - """Runs the given method on all workers. - - Args: - async_run_tensor_parallel_workers_only: If True the method will be - run only in the remote TP workers, not the driver worker. - It will also be run asynchronously and return a list of futures - rather than blocking on the results. - """ - if isinstance(method, str): - sent_method = method - else: - sent_method = cloudpickle.dumps(method) - del method - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if async_run_tensor_parallel_workers_only: - # Run only non-driver workers and just return futures. - return [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.non_driver_workers - ] - - # Start all remote workers first. 
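The ordering flagged by the comment above matters for latency: _run_workers enqueues the remote calls first so they overlap with the driver's inline call, and only afterwards drains the futures. A thread-pool stand-in for the worker processes shows the shape:

from concurrent.futures import ThreadPoolExecutor

def run_workers(pool: ThreadPoolExecutor, method, num_remote: int) -> list:
    # Kick off remote ranks first, then run the driver inline in parallel.
    futures = [pool.submit(method, rank) for rank in range(1, num_remote + 1)]
    driver_output = method(0)
    return [driver_output] + [future.result() for future in futures]

with ThreadPoolExecutor(max_workers=2) as pool:
    assert run_workers(pool, lambda rank: rank * 10, 2) == [0, 10, 20]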
- worker_outputs = [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.workers - ] - - driver_worker_output = run_method(self.driver_worker, sent_method, - args, kwargs) - - # Get the results of the workers. - return [driver_worker_output - ] + [output.get() for output in worker_outputs] - - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - if self.worker_monitor is not None and not self.worker_monitor.is_alive( - ): - raise RuntimeError("Worker processes are not running") - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - for result in parallel_worker_tasks: - result.get() - - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if not self.tp_driver_workers: - return await self.driver_exec_model(execute_model_req) - - if self.pp_locks is None: - # This locks each pipeline parallel stage so multiple virtual - # engines can't execute on the same stage at the same time - # We create the locks here to avoid creating them in the constructor - # which uses a different asyncio loop. - self.pp_locks = [ - asyncio.Lock() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - tasks = [ - asyncio.create_task( - _run_task_with_lock(self.driver_exec_model, self.pp_locks[0], - execute_model_req)) - ] - for pp_rank, driver_worker in enumerate(self.tp_driver_workers, - start=1): - tasks.append( - asyncio.create_task( - _run_task_with_lock(driver_worker.execute_method_async, - self.pp_locks[pp_rank], - "execute_model", execute_model_req))) - results = await asyncio.gather(*tasks) - - # Only the last PP stage has the final results. 
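The pp_locks created above give each pipeline stage its own asyncio.Lock, so different virtual engines can progress concurrently as long as they target different stages. A loose, hedged sketch of that pattern:

import asyncio

async def run_locked(lock: asyncio.Lock, stage: int, log: list) -> None:
    async with lock:
        log.append(stage)  # stand-in for executing one pipeline stage

async def main() -> list:
    pp_locks = [asyncio.Lock() for _ in range(2)]
    log: list = []
    # Two tasks contend for stage 0; the stage-1 task is independent.
    await asyncio.gather(*(run_locked(pp_locks[stage], stage, log)
                           for stage in (0, 1, 0)))
    return log

assert sorted(asyncio.run(main())) == [0, 0, 1]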
- return results[-1] - - async def _start_worker_execution_loop(self): - coros = [ - worker.execute_method_async("start_worker_execution_loop") - for worker in self.non_driver_workers - ] - return await asyncio.gather(*coros) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py deleted file mode 100644 index 48b3479ed7997..0000000000000 --- a/vllm/executor/multiproc_worker_utils.py +++ /dev/null @@ -1,279 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import threading -import uuid -from dataclasses import dataclass -from multiprocessing import Queue -from multiprocessing.connection import wait -from multiprocessing.process import BaseProcess -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union - -import torch - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context, - run_method) - -logger = init_logger(__name__) - -T = TypeVar('T') - -_TERMINATE = "TERMINATE" # sentinel - -JOIN_TIMEOUT_S = 2 - - -@dataclass -class Result(Generic[T]): - """Result of task dispatched to worker""" - - task_id: uuid.UUID - value: Optional[T] = None - exception: Optional[BaseException] = None - - -class ResultFuture(threading.Event, Generic[T]): - """Synchronous future for non-async case""" - - def __init__(self): - super().__init__() - self.result: Optional[Result[T]] = None - - def set_result(self, result: Result[T]): - self.result = result - self.set() - - def get(self) -> T: - self.wait() - assert self.result is not None - if self.result.exception is not None: - raise self.result.exception - return self.result.value # type: ignore[return-value] - - -def _set_future_result(future: Union[ResultFuture, asyncio.Future], - result: Result): - if isinstance(future, ResultFuture): - future.set_result(result) - return - loop = future.get_loop() - if not loop.is_closed(): - if result.exception is not None: - loop.call_soon_threadsafe(future.set_exception, result.exception) - else: - loop.call_soon_threadsafe(future.set_result, result.value) - - -class ResultHandler(threading.Thread): - """Handle results from all workers (in background thread)""" - - def __init__(self) -> None: - super().__init__(daemon=True) - self.result_queue = get_mp_context().Queue() - self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {} - - def run(self): - for result in iter(self.result_queue.get, _TERMINATE): - future = self.tasks.pop(result.task_id) - _set_future_result(future, result) - # Ensure that all waiters will receive an exception - for task_id, future in self.tasks.items(): - _set_future_result( - future, - Result(task_id=task_id, - exception=ChildProcessError("worker died"))) - - def close(self): - self.result_queue.put(_TERMINATE) - - -class WorkerMonitor(threading.Thread): - """Monitor worker status (in background thread)""" - - def __init__(self, workers: List['ProcessWorkerWrapper'], - result_handler: ResultHandler): - super().__init__(daemon=True) - self.workers = workers - self.result_handler = result_handler - self._close = False - - def run(self) -> None: - # Blocks until any worker exits - dead_sentinels = wait([w.process.sentinel for w in self.workers]) - if not self._close: - self._close = True - - # Kill / cleanup all workers - for worker in self.workers: - process = worker.process - if process.sentinel in dead_sentinels: - 
process.join(JOIN_TIMEOUT_S) - if process.exitcode is not None and process.exitcode != 0: - logger.error("Worker %s pid %s died, exit code: %s", - process.name, process.pid, process.exitcode) - # Cleanup any remaining workers - if logger: - logger.info("Killing local vLLM worker processes") - for worker in self.workers: - worker.kill_worker() - # Must be done after worker task queues are all closed - self.result_handler.close() - - for worker in self.workers: - worker.process.join(JOIN_TIMEOUT_S) - - def close(self): - if self._close: - return - self._close = True - logger.info("Terminating local vLLM worker processes") - for worker in self.workers: - worker.terminate_worker() - # Must be done after worker task queues are all closed - self.result_handler.close() - - -class ProcessWorkerWrapper: - """Local process wrapper for vllm.worker.Worker, - for handling single-node multi-GPU tensor parallel.""" - - def __init__(self, result_handler: ResultHandler, - worker_factory: Callable[[VllmConfig, int], Any], - vllm_config: VllmConfig, rank: int) -> None: - self.mp = get_mp_context() - self._task_queue = self.mp.Queue() - self.result_queue = result_handler.result_queue - self.tasks = result_handler.tasks - self.process: BaseProcess = self.mp.Process( # type: ignore[attr-defined] - target=_run_worker_process, - name="VllmWorkerProcess", - kwargs=dict( - worker_factory=worker_factory, - task_queue=self._task_queue, - result_queue=self.result_queue, - vllm_config=vllm_config, - rank=rank, - ), - daemon=True) - - self.process.start() - - def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], - method: Union[str, bytes], args, kwargs): - task_id = uuid.uuid4() - self.tasks[task_id] = future - try: - self._task_queue.put((task_id, method, args, kwargs)) - except SystemExit: - raise - except BaseException as e: - del self.tasks[task_id] - raise ChildProcessError("worker died") from e - - def execute_method(self, method: Union[str, bytes], *args, **kwargs): - future: ResultFuture = ResultFuture() - self._enqueue_task(future, method, args, kwargs) - return future - - async def execute_method_async(self, method: Union[str, bytes], *args, - **kwargs): - future = asyncio.get_running_loop().create_future() - self._enqueue_task(future, method, args, kwargs) - return await future - - def terminate_worker(self): - try: - self._task_queue.put(_TERMINATE) - except ValueError: - self.process.kill() - self._task_queue.close() - - def kill_worker(self): - self._task_queue.close() - self.process.kill() - - -def _run_worker_process( - worker_factory: Callable[[VllmConfig, int], Any], - task_queue: Queue, - result_queue: Queue, - vllm_config: VllmConfig, - rank: int, -) -> None: - """Worker process event loop""" - - # Add process-specific prefix to stdout and stderr - process_name = get_mp_context().current_process().name - decorate_logs(process_name) - - # Initialize worker - worker = worker_factory(vllm_config, rank) - del worker_factory - - # Accept tasks from the engine in task_queue - # and return task output in result_queue - logger.info("Worker ready; awaiting tasks") - try: - for items in iter(task_queue.get, _TERMINATE): - output = None - exception = None - task_id, method, args, kwargs = items - try: - output = run_method(worker, method, args, kwargs) - except SystemExit: - raise - except KeyboardInterrupt: - break - except BaseException as e: - logger.exception( - "Exception in worker %s while processing method %s.", - process_name, method) - exception = e - result_queue.put( - 
Result(task_id=task_id, value=output, exception=exception)) - except KeyboardInterrupt: - pass - except Exception: - logger.exception("Worker failed") - - # Flush TunableOp results when TunableOp is enabled and - # online (in situ) tuning is enabled. - # Offline tuning API (record_untuned_is_enabled()) only - # available in PyTorch 2.6 or later. - if torch.cuda.is_available(): - import torch.cuda.tunable as tunable - if (tunable.is_enabled() and tunable.tuning_is_enabled() - and not tunable.record_untuned_is_enabled()): - tunable.write_file() - - logger.info("Worker exiting") - - -def set_multiprocessing_worker_envs(parallel_config): - """ Set up environment variables that should be used when there are workers - in a multiprocessing environment. This should be called by the parent - process before worker processes are created""" - - _maybe_force_spawn() - - # Configure thread parallelism if OMP_NUM_THREADS isn't set - # - # Helps to avoid CPU contention. The default of spawning a thread per - # core combined with multiprocessing for each GPU can have a negative - # impact on performance. The contention is amplified when running in a - # container where CPU limits can cause throttling. - default_omp_num_threads = 1 - if "OMP_NUM_THREADS" not in os.environ and ( - current_parallelism := - torch.get_num_threads()) > default_omp_num_threads: - logger.warning( - "Reducing Torch parallelism from %d threads to %d to avoid " - "unnecessary CPU contention. Set OMP_NUM_THREADS in the " - "external environment to tune this value as needed.", - current_parallelism, default_omp_num_threads) - os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) - torch.set_num_threads(default_omp_num_threads) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3aa373f12b609..2aa732f34bcc8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing +import os import pickle import queue import signal @@ -19,6 +20,7 @@ from threading import Thread from typing import Any, Callable, Optional, Union, cast import cloudpickle +import torch import vllm.envs as envs from vllm.config import VllmConfig @@ -28,14 +30,12 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, get_pp_group, get_tp_group) -from vllm.executor.multiproc_worker_utils import ( - set_multiprocessing_worker_envs) from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import worker_receiver_cache_from_config -from vllm.utils import (decorate_logs, get_distributed_init_method, - get_loopback_ip, get_mp_context, get_open_port, - set_process_title) +from vllm.utils import (_maybe_force_spawn, decorate_logs, + get_distributed_init_method, get_loopback_ip, + get_mp_context, get_open_port, set_process_title) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.executor.utils import get_and_update_mm_cache @@ -67,8 +67,8 @@ class MultiprocExecutor(Executor): f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" f"_parallel_size ({pp_parallel_size}). 
") - # Set multiprocessing envs that are common to V0 and V1 - set_multiprocessing_worker_envs(self.parallel_config) + # Set multiprocessing envs + set_multiprocessing_worker_envs() # Multiprocessing-based executor does not support multi-node setting. # Since it only works for single node, we can use the loopback address @@ -698,3 +698,29 @@ class WorkerProc: process_name += f"_EP{ep_rank}" set_process_title(name=process_name) decorate_logs(process_name) + + +def set_multiprocessing_worker_envs(): + """ Set up environment variables that should be used when there are workers + in a multiprocessing environment. This should be called by the parent + process before worker processes are created""" + + _maybe_force_spawn() + + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) From cf56cf78b47e5f9b6a81ce0d50a94f9291922315 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Sun, 21 Sep 2025 13:08:07 +0800 Subject: [PATCH 524/584] [V1] Add sliding window support to Flex Attention backend (#24089) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/v1/attention/test_attention_backends.py | 204 +++++++++++++----- vllm/v1/attention/backends/flex_attention.py | 90 ++++++-- 2 files changed, 227 insertions(+), 67 deletions(-) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 0b7e103beca63..8a4fc15791b08 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -1,15 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for v1 attention backends without GPUModelRunner dependency.""" +from functools import partial +from typing import Optional, Union import pytest import torch +from torch.nn.attention.flex_attention import create_block_mask, flex_attention from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata, create_standard_kv_cache_spec, create_vllm_config, get_attention_backend) +from vllm.config import ModelConfig +from vllm.platforms import current_platform from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, set_kv_cache_layout) @@ -183,13 +188,19 @@ class MockAttentionLayer: self._v_scale_float = 1.0 -def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, - layer_names: list[str], vllm_config, - device: torch.device, - common_attn_metadata: CommonAttentionMetadata, - query: torch.Tensor, key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor) -> torch.Tensor: +def run_attention_backend( + backend: _Backend, + kv_cache_spec: FullAttentionSpec, 
+ layer_names: list[str], + vllm_config, + device: torch.device, + common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + sliding_window: Optional[int] = None, +) -> torch.Tensor: """Run attention computation using the specified backend's AttentionImpl.""" # Handle special case for FLEX_ATTENTION_SLOW @@ -253,7 +264,7 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, scale=scale, num_kv_heads=num_kv_heads, alibi_slopes=None, - sliding_window=None, + sliding_window=sliding_window, kv_cache_dtype="auto", ) @@ -275,13 +286,16 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, return output -@pytest.mark.parametrize("batch_spec_name", [ - "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium", "large_decode", "large_prefill", - "single_decode", "single_prefill" -]) -@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) -def test_backend_correctness(batch_spec_name: str, model: str): +def _test_backend_correctness( + batch_spec: BatchSpec, + model: str, + backend_to_test: list[Union[_Backend, str]], + mask_mod, + *, + block_size: int = 16, + atol: float = 1e-2, + rtol: float = 1e-2, +): """ Test that all backends produce similar outputs to a reference implementation using torch.nn.functional.scaled_dot_product_attention. @@ -297,9 +311,10 @@ def test_backend_correctness(batch_spec_name: str, model: str): simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. """ - batch_spec = BATCH_SPECS[batch_spec_name] + current_platform.seed_everything(42) vllm_config = create_vllm_config(model_name=model, max_model_len=max(batch_spec.seq_lens), + block_size=block_size, num_gpu_blocks=8192) device = torch.device("cuda:0") @@ -314,6 +329,7 @@ def test_backend_correctness(batch_spec_name: str, model: str): num_kv_heads = vllm_config.model_config.get_num_kv_heads( vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() + sliding_window = vllm_config.model_config.get_sliding_window() dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) block_size = vllm_config.cache_config.block_size scale = 1.0 / (head_size**0.5) @@ -361,22 +377,21 @@ def test_backend_correctness(batch_spec_name: str, model: str): # Create causal mask: query token i attends to positions 0 to # (context_len + i) kv_len = s_len - offset = context_len - attn_mask = torch.full((q_len, kv_len), - float('-inf'), - device=device, - dtype=dtype) - for i in range(q_len): - attn_mask[i, :offset + i + 1] = 0.0 - sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( - q_sdpa_in, - k_sdpa_in, - v_sdpa_in, - attn_mask=attn_mask, - scale=scale, - enable_gqa=True) - # Convert back to (L, H, D) + final_mask_mod = partial(mask_mod, context_len=context_len) + block_mask = create_block_mask(final_mask_mod, + B=None, + H=None, + Q_LEN=q_len, + KV_LEN=kv_len, + device=device) + sdpa_out_i = flex_attention(q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + block_mask=block_mask, + scale=scale, + enable_gqa=True) + all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0)) # Inputs for vLLM backends are just the new tokens @@ -412,7 +427,7 @@ def test_backend_correctness(batch_spec_name: str, model: str): # 4. 
Run vLLM backends and compare # Note: flex_attention has known Triton kernel compatibility issues # with test infrastructures - for backend_name in BACKENDS_TO_TEST: + for backend_name in backend_to_test: # FlashAttentionm + FlexAttention: # [2, num_blocks, block_size, num_kv_heads, head_size] # FlashInfer: @@ -427,12 +442,19 @@ def test_backend_correctness(batch_spec_name: str, model: str): 2, 3).contiguous().transpose(2, 3) set_kv_cache_layout("HND") - backend_output = run_attention_backend(backend_name, kv_cache_spec, - ["placeholder"], vllm_config, - device, common_attn_metadata, - query_vllm, key_vllm, - value_vllm, - kv_cache_for_backend) + backend_output = run_attention_backend( + backend_name, + kv_cache_spec, + ["placeholder"], + vllm_config, + device, + common_attn_metadata, + query_vllm, + key_vllm, + value_vllm, + kv_cache_for_backend, + sliding_window=sliding_window, + ) # Check shape and dtype consistency assert backend_output.shape == sdpa_output.shape, ( @@ -446,18 +468,102 @@ def test_backend_correctness(batch_spec_name: str, model: str): f"[{backend_name}] produced non-finite values") # Check numerical similarity - rtol = 1e-2 - atol = 5e-3 + def error_msg(msg: str, backend_name: str): + return (f"[{backend_name}] output differs from SDPA baseline. " + f"{msg}") - max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() - max_rel_diff = torch.max( - torch.abs(backend_output - sdpa_output) / - torch.abs(sdpa_output)).item() - all_close = torch.allclose(backend_output, + torch.testing.assert_close(backend_output, sdpa_output, rtol=rtol, - atol=atol) + atol=atol, + msg=partial(error_msg, + backend_name=backend_name)) - assert all_close, ( - f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) +def test_causal_backend_correctness(batch_spec_name: str, model: str): + """Test backend's correctness with causal attention.""" + + def causal_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + *, + context_len: int, + ): + return (q_idx + context_len) >= kv_idx + + batch_spec = BATCH_SPECS[batch_spec_name] + LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION] + if is_torch_equal_or_newer("2.9.0.dev0") else []) + SMALL_BLOCK_BACKENDS = [ + x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS + ] + _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, + causal_mask_mod) + + # Fast FlexAttention needs to run with block_size=128 + if LARGE_BLOCK_BACKENDS: + _test_backend_correctness(batch_spec, + model, + LARGE_BLOCK_BACKENDS, + causal_mask_mod, + block_size=128) + + +SLIDING_WINDOW_BACKENDS_TO_TEST = [ + _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION, + _Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW" +] + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_medium", "large_decode", + "large_prefill" +]) +@pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"]) +def test_sliding_window_backend_correctness(batch_spec_name: str, model: str): + """Test backend's correctness with sliding window attention.""" + + def sliding_window_mask_mod( + b: torch.Tensor, + h: 
torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + *, + context_len: int, + sliding_window: int, + ): + causal_mask = q_idx + context_len >= kv_idx + window_mask = q_idx + context_len - kv_idx < sliding_window + return causal_mask & window_mask + + batch_spec = BATCH_SPECS[batch_spec_name] + model_config = ModelConfig(model=model, + max_model_len=max(batch_spec.seq_lens)) + sliding_window = model_config.get_sliding_window() + sliding_window_mask_mod_fn = partial(sliding_window_mask_mod, + sliding_window=sliding_window) + + LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION] + if is_torch_equal_or_newer("2.9.0.dev0") else []) + SMALL_BLOCK_BACKENDS = [ + x for x in SLIDING_WINDOW_BACKENDS_TO_TEST + if x not in LARGE_BLOCK_BACKENDS + ] + _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, + sliding_window_mask_mod_fn) + + # Fast FlexAttention needs to run with block_size=128 + if LARGE_BLOCK_BACKENDS: + _test_backend_correctness(batch_spec, + model, + LARGE_BLOCK_BACKENDS, + sliding_window_mask_mod_fn, + block_size=128) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 662d3984554ad..c3358bfa74e91 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -9,7 +9,7 @@ import torch import torch._dynamo.decorators import torch.nn.functional as F from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature, - _score_mod_signature, + _score_mod_signature, and_masks, create_block_mask, flex_attention) @@ -292,6 +292,7 @@ class FlexAttentionMetadata: q_block_size: int = 16 kv_block_size: int = 16 transformed_score_mod: Optional[_score_mod_signature] = None + sliding_window: Optional[int] = None def _convert_physical_to_logical( self, @@ -380,6 +381,53 @@ class FlexAttentionMetadata: return final_mask_mod + def get_sliding_window_mask_mod(self) -> _mask_mod_signature: + """Creates the sliding window mask_mod function for FlexAttention. + + Note that the sliding window mask here is bidirectional, we need + to mask it with the bidirectional/causal mask for encoder/decoder. + """ + + if self.sliding_window is None: + raise ValueError( + "sliding_window must be set for sliding window attention") + + def sliding_window_mask_mod(b: torch.Tensor, h: torch.Tensor, + q_idx: torch.Tensor, kv_idx: torch.Tensor): + return torch.abs(q_idx - kv_idx) < self.sliding_window + + def final_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + physical_kv_idx: torch.Tensor, + ) -> torch.Tensor: + (is_valid, logical_q_idx, + logical_kv_idx) = self._convert_physical_to_logical( + self.doc_ids, q_idx, physical_kv_idx) + return torch.where( + is_valid, + sliding_window_mask_mod(b, h, logical_q_idx, logical_kv_idx), + False, + ) + + return final_mask_mod if self.causal else sliding_window_mask_mod + + def get_mask_mod(self): + # Stage-1: initialize the base mask_mod + # (causal mask for decoder or bidirectional mask for encoder) + if self.causal: + mask_mod = self.get_causal_mask_mod() + else: + mask_mod = self.get_bidirectional_mask_mod() + # stage-2: add external mask_mod for special attention during + # forwarding runtime to create the combined mask_mod. 
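+        # Note: and_masks (torch.nn.attention.flex_attention) returns a
+        # mask_mod that is the elementwise logical AND of its inputs, so the
+        # combined mask admits a (q_idx, kv_idx) pair only if both the base
+        # causal/bidirectional mask and the sliding window mask allow it.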
+ if self.sliding_window is not None: + # Add sliding window mask for sliding window attention + sliding_window_mask_mod = self.get_sliding_window_mask_mod() + mask_mod = and_masks(mask_mod, sliding_window_mask_mod) + return mask_mod + def get_transformed_score_mod(self) -> Optional[_score_mod_signature]: """Creates the transformed score_mod function for FlexAttention. @@ -472,12 +520,9 @@ class FlexAttentionMetadata: return BlockMask.from_kv_blocks(**block_mask_kwargs) def build_block_mask(self) -> BlockMask: - if self.causal: - mask_mod = self.get_causal_mask_mod() - kv_len = self.total_cache_tokens - else: - mask_mod = self.get_bidirectional_mask_mod() - kv_len = self.num_actual_tokens + mask_mod = self.get_mask_mod() + kv_len = (self.total_cache_tokens + if self.causal else self.num_actual_tokens) return create_block_mask_compiled( mask_mod, None, @@ -498,11 +543,7 @@ class FlexAttentionMetadata: self.doc_ids = _offsets_to_doc_ids_tensor(self.query_start_loc) self.num_blocks = self.total_cache_tokens // self.block_size - if self.causal: - self.mask_mod = self.get_causal_mask_mod() - else: - self.mask_mod = self.get_bidirectional_mask_mod() - + self.mask_mod = self.get_mask_mod() self.transformed_score_mod = self.get_transformed_score_mod() if self.direct_build and self.causal: @@ -607,7 +648,7 @@ class FlexAttentionMetadataBuilder( class FlexAttentionImpl(AttentionImpl): - sliding_window: Optional[tuple[int, int]] + sliding_window: Optional[int] alibi_slopes: Optional[torch.Tensor] logits_soft_cap: Optional[float] @@ -641,11 +682,9 @@ class FlexAttentionImpl(AttentionImpl): "FlexAttention does not support alibi slopes yet.") else: self.alibi_slopes = None - if sliding_window is not None: - raise NotImplementedError( - "FlexAttention does not support sliding window yet.") - else: - self.sliding_window = (-1, -1) + + self.sliding_window = sliding_window + self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap if self.logits_soft_cap is not None: @@ -712,6 +751,21 @@ class FlexAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens + if attn_metadata.sliding_window != self.sliding_window: + attn_metadata.sliding_window = self.sliding_window + if attn_metadata.direct_build: + # TODO: Support skipping the computation of sliding window + # in direct block mask building code path. + logger.warning_once( + "Using direct block mask building with sliding window, " + "which is suboptimal now. 
Performance may be degraded.") + # update mask mod in attention metadata + attn_metadata.mask_mod = attn_metadata.get_mask_mod() + attn_metadata.block_mask = ( + attn_metadata._build_block_mask_direct()) + else: + attn_metadata.block_mask = attn_metadata.build_block_mask() + if not attn_metadata.causal: assert self.attn_type == AttentionType.ENCODER_ONLY From 30d08911f7cf78287f8da003ddcc99f6ef196f9f Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Sun, 21 Sep 2025 04:05:20 -0700 Subject: [PATCH 525/584] [MM][Perf] Minor Optimization on Qwen3-VL `fast_pos_embed_interpolate` (#25337) Signed-off-by: Roger Wang <hey@rogerw.io> --- vllm/model_executor/models/qwen3_vl.py | 121 +++++++++++-------------- 1 file changed, 53 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 17375ff0959d7..ca232e03767b1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -270,6 +270,7 @@ class Qwen3_VisionTransformer(nn.Module): self.temporal_patch_size = vision_config.temporal_patch_size self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes self.use_data_parallel = use_data_parallel + self.num_grid_per_side = int(self.num_position_embeddings**0.5) # NOTE: This is used for creating empty tensor for all_gather for # DP ViT. Here out_hidden_size is enlarged due to deepstack @@ -377,82 +378,68 @@ class Qwen3_VisionTransformer(nn.Module): rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb - def fast_pos_embed_interpolate(self, grid_thw): - num_grid_per_side = int(self.num_position_embeddings**0.5) + def fast_pos_embed_interpolate(self, + grid_thw: list[list[int]]) -> torch.Tensor: - idx_list = [[] for _ in range(4)] - weight_list = [[] for _ in range(4)] + num_grid_per_side = self.num_grid_per_side + m_size = self.spatial_merge_size + hidden_dim = self.pos_embed.embedding_dim + outputs = [] for t, h, w in grid_thw: h_idxs = torch.linspace(0, num_grid_per_side - 1, h, - dtype=torch.float32) + dtype=torch.float32, + device=self.device) w_idxs = torch.linspace(0, num_grid_per_side - 1, w, - dtype=torch.float32) + dtype=torch.float32, + device=self.device) - h_idxs_floor = h_idxs.to(torch.long) - w_idxs_floor = w_idxs.to(torch.long) - h_idxs_ceil = torch.clamp(h_idxs.to(torch.long) + 1, - max=num_grid_per_side - 1) - w_idxs_ceil = torch.clamp(w_idxs.to(torch.long) + 1, - max=num_grid_per_side - 1) + h_floor = h_idxs.to(torch.long) + w_floor = w_idxs.to(torch.long) + h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1) + w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1) - dh = h_idxs - h_idxs_floor - dw = w_idxs - w_idxs_floor + dh = h_idxs - h_floor + dw = w_idxs - w_floor - idx_list[0].extend(((h_idxs_floor * num_grid_per_side)[None].T + - w_idxs_floor[None]).flatten().tolist() * t) - idx_list[1].extend(((h_idxs_floor * num_grid_per_side)[None].T + - w_idxs_ceil[None]).flatten().tolist() * t) - idx_list[2].extend(((h_idxs_ceil * num_grid_per_side)[None].T + - w_idxs_floor[None]).flatten().tolist() * t) - idx_list[3].extend(((h_idxs_ceil * num_grid_per_side)[None].T + - w_idxs_ceil[None]).flatten().tolist() * t) + w00 = ((1 - dh)[:, None] * (1 - dw)[None, :]).reshape(-1) + w01 = ((1 - dh)[:, None] * dw[None, :]).reshape(-1) + w10 = (dh[:, None] * (1 - dw)[None, :]).reshape(-1) + w11 = (dh[:, None] * dw[None, :]).reshape(-1) - weight_list[0].extend( - ((1 - dh)[None].T * (1 - dw)[None]).flatten().tolist() * t) - 
weight_list[1].extend( - ((1 - dh)[None].T * dw[None]).flatten().tolist() * t) - weight_list[2].extend( - (dh[None].T * (1 - dw)[None]).flatten().tolist() * t) - weight_list[3].extend( - (dh[None].T * dw[None]).flatten().tolist() * t) + idx00 = (h_floor[:, None] * num_grid_per_side + + w_floor[None, :]).reshape(-1) + idx01 = (h_floor[:, None] * num_grid_per_side + + w_ceil[None, :]).reshape(-1) + idx10 = (h_ceil[:, None] * num_grid_per_side + + w_floor[None, :]).reshape(-1) + idx11 = (h_ceil[:, None] * num_grid_per_side + + w_ceil[None, :]).reshape(-1) - device = self.pos_embed.weight.device - dtype = self.pos_embed.weight.dtype + indices = torch.stack([idx00, idx01, idx10, idx11], dim=0) + weights = torch.stack([w00, w01, w10, w11], + dim=0).to(dtype=self.dtype, + device=self.device) + weights = weights.unsqueeze(-1) - p0 = self.pos_embed( - torch.tensor( - idx_list[0], dtype=torch.long, device=device)) * torch.tensor( - weight_list[0], dtype=dtype, device=device)[:, None] - p1 = self.pos_embed( - torch.tensor( - idx_list[1], dtype=torch.long, device=device)) * torch.tensor( - weight_list[1], dtype=dtype, device=device)[:, None] - p2 = self.pos_embed( - torch.tensor( - idx_list[2], dtype=torch.long, device=device)) * torch.tensor( - weight_list[2], dtype=dtype, device=device)[:, None] - p3 = self.pos_embed( - torch.tensor( - idx_list[3], dtype=torch.long, device=device)) * torch.tensor( - weight_list[3], dtype=dtype, device=device)[:, None] + embeds = self.pos_embed(indices) + weighted_embeds = embeds * weights + p0, p1, p2, p3 = weighted_embeds.unbind(dim=0) + combined = p0 + p1 + p2 + p3 - patch_pos_embeds = p0 + p1 + p2 + p3 - patch_pos_embeds = patch_pos_embeds.split( - [t * h * w for t, h, w in grid_thw]) - patch_pos_embeds_permute = [] - m_size = self.spatial_merge_size - for pos_embed, (t, h, w) in zip(patch_pos_embeds, grid_thw): - pos_embed = pos_embed.view(t, h // m_size, m_size, w // m_size, - m_size, -1).permute(0, 1, 3, 2, 4, - 5).flatten(0, 4) - patch_pos_embeds_permute.append(pos_embed) - patch_pos_embeds = torch.cat(patch_pos_embeds_permute) - return patch_pos_embeds + combined = combined.view(h * w, hidden_dim) + repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous() + repeated = repeated.view(t, h // m_size, m_size, w // m_size, + m_size, hidden_dim) + repeated = repeated.permute(0, 1, 3, 2, 4, + 5).reshape(-1, hidden_dim) + outputs.append(repeated) + + return torch.cat(outputs, dim=0) def compute_attn_mask_seqlen( self, @@ -477,12 +464,9 @@ class Qwen3_VisionTransformer(nn.Module): hidden_states = hidden_states + pos_embeds rotary_pos_emb = self.rot_pos_emb(grid_thw) - if isinstance(grid_thw, list): - grid_thw_tensor = torch.tensor(grid_thw, - device=hidden_states.device, - dtype=torch.int32) - else: - grid_thw_tensor = grid_thw + grid_thw_tensor = torch.tensor(grid_thw, + device=self.device, + dtype=torch.int32) cu_seqlens = torch.repeat_interleave( grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], @@ -1224,7 +1208,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list, rope_type="rope_3d") else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. 
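         # (The split sizes below are derived per image from its
         # (t, h, w) grid, so no device tensor needs to be read back.)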
# Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync @@ -1526,4 +1511,4 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, language_model="language_model", connector="model.visual.merger", tower_model="model.visual.", - ) \ No newline at end of file + ) From 9aea7373fffe71c239b08a89edbbc46bbb560f45 Mon Sep 17 00:00:00 2001 From: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:36:47 +0200 Subject: [PATCH 526/584] [Bugfix] Typos in error message for missing model config file (#25339) Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com> --- vllm/transformers_utils/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index cafc43f6b7673..52e2c18a77847 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -524,10 +524,10 @@ def get_config( else: raise ValueError( "Could not detect config format for no config file found. " - "With config_format 'auto', ensure your model has either" - "config.json (HF format) or params.json (Mistral format)." - "Otherwise please specify your_custom_config_format" - "in engine args for customized config parser") + "With config_format 'auto', ensure your model has either " + "config.json (HF format) or params.json (Mistral format). " + "Otherwise please specify your_custom_config_format " + "in engine args for customized config parser.") except Exception as e: error_message = ( From 65a5910ce35f889740bddb2e19dad35c83278873 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sun, 21 Sep 2025 19:41:02 +0800 Subject: [PATCH 527/584] [Optimization] Cache chat template result when processor fails to be loaded (#25341) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/entrypoints/chat_utils.py | 71 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c2c0ad74ef431..df49119d86420 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -421,6 +421,51 @@ def resolve_mistral_chat_template( return None +_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]() +""" +Used in `_try_get_processor_chat_template` to avoid calling +`cached_get_processor` again if the processor fails to be loaded. + +This is needed because `lru_cache` does not cache when an exception happens. 
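+
+A minimal sketch of the pattern (``expensive_load`` is a hypothetical
+stand-in for the processor load):
+
+    cache: dict[str, Optional[str]] = {}
+
+    def load_once(key: str) -> Optional[str]:
+        if key in cache:
+            return cache[key]            # hit: even a past failure (None)
+        try:
+            value = expensive_load(key)  # may raise
+        except Exception:
+            value = None                 # record the failure as None...
+        cache[key] = value               # ...so it is never retried
+        return value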
+""" + + +def _try_get_processor_chat_template( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + model_config: ModelConfig, +) -> Optional[str]: + cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) + if cache_key in _PROCESSOR_CHAT_TEMPLATES: + return _PROCESSOR_CHAT_TEMPLATES[cache_key] + + try: + processor = cached_get_processor( + tokenizer.name_or_path, + processor_cls=( + PreTrainedTokenizer, + PreTrainedTokenizerFast, + ProcessorMixin, + ), + trust_remote_code=model_config.trust_remote_code, + ) + if ( + isinstance(processor, ProcessorMixin) + and hasattr(processor, "chat_template") + and (chat_template := processor.chat_template) is not None + ): + _PROCESSOR_CHAT_TEMPLATES[cache_key] = chat_template + return chat_template + except Exception: + logger.debug( + "Failed to load AutoProcessor chat template for %s", + tokenizer.name_or_path, + exc_info=True, + ) + + _PROCESSOR_CHAT_TEMPLATES[cache_key] = None + return None + + def resolve_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], chat_template: Optional[str], @@ -434,28 +479,10 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - try: - processor = cached_get_processor( - tokenizer.name_or_path, - processor_cls=( - PreTrainedTokenizer, - PreTrainedTokenizerFast, - ProcessorMixin, - ), - trust_remote_code=model_config.trust_remote_code, - ) - if ( - isinstance(processor, ProcessorMixin) - and hasattr(processor, "chat_template") - and processor.chat_template is not None - ): - return processor.chat_template - except Exception: - logger.debug( - "Failed to load AutoProcessor chat template for %s", - tokenizer.name_or_path, - exc_info=True, - ) # noqa: E501 + chat_template = _try_get_processor_chat_template(tokenizer, + model_config) + if chat_template is not None: + return chat_template # 3rd priority: AutoTokenizer chat template try: From 26e673fe9303ad759a47e19a087764393c69109f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sun, 21 Sep 2025 08:52:15 -0700 Subject: [PATCH 528/584] [V0 Deprecation] Remove V0 Sequence class & Sampler (#25332) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- tests/conftest.py | 2 +- .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/test_pixtral.py | 2 +- .../generation/vlm_utils/model_utils.py | 2 +- .../multimodal/generation/vlm_utils/types.py | 2 +- tests/models/utils.py | 2 +- tests/tokenization/test_detokenize.py | 140 +- tests/tool_use/test_jamba_tool_parser.py | 2 +- tests/tool_use/test_qwen3coder_tool_parser.py | 2 +- tests/tool_use/test_seed_oss_tool_parser.py | 2 +- tests/tool_use/test_xlam_tool_parser.py | 2 +- tests/v1/engine/test_output_processor.py | 2 +- vllm/executor/executor_base.py | 2 +- vllm/executor/ray_distributed_executor.py | 2 +- vllm/inputs/__init__.py | 13 +- vllm/inputs/registry.py | 67 +- vllm/model_executor/__init__.py | 4 +- .../model_executor/layers/logits_processor.py | 91 -- vllm/model_executor/layers/sampler.py | 1198 --------------- vllm/model_executor/models/medusa.py | 60 +- vllm/model_executor/models/mlp_speculator.py | 78 +- vllm/model_executor/models/phi4flash.py | 6 +- vllm/model_executor/sampling_metadata.py | 594 +------- vllm/sequence.py | 1322 +---------------- vllm/transformers_utils/detokenizer.py | 162 -- vllm/worker/worker_base.py | 2 +- 27 files 
changed, 69 insertions(+), 3696 deletions(-) delete mode 100644 vllm/model_executor/layers/sampler.py delete mode 100644 vllm/transformers_utils/detokenizer.py diff --git a/tests/conftest.py b/tests/conftest.py index f14b1e8780ad9..dc70c98359598 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,10 +48,10 @@ from vllm.distributed import (cleanup_dist_env_and_memory, initialize_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.sequence import Logprob from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils import set_default_torch_num_threads diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index f2e6fbfad6e80..c1305e0ae31ce 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -7,8 +7,8 @@ from typing import Optional import pytest from transformers import AutoModelForSpeechSeq2Seq +from vllm.logprobs import SampleLogprobs from vllm.lora.request import LoRARequest -from vllm.sequence import SampleLogprobs from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner) diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 67d35213d6422..77e2b90dd5e96 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -12,10 +12,10 @@ from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm.assets.image import ImageAsset +from vllm.logprobs import SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.image import convert_image_mode, rescale_image_size from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput, PromptImageInput, VllmRunner) diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index cb3cc1d3d330d..715b08ef90e54 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -13,8 +13,8 @@ from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from transformers import AutoProcessor from vllm import SamplingParams, TextPrompt, TokensPrompt +from vllm.logprobs import Logprob, SampleLogprobs from vllm.multimodal import MultiModalDataBuiltins -from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test from ...utils import check_logprobs_close diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 8b7d051218f14..ba55450ec8a90 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature, GenerationConfig, GenerationMixin) from transformers.video_utils import VideoMetadata -from vllm.sequence import SampleLogprobs +from vllm.logprobs import SampleLogprobs from vllm.utils import is_list_of from .....conftest import HfRunner, ImageAsset, ImageTestAssets diff --git 
a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 9451131960885..e39ca40fbbf5e 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -12,7 +12,7 @@ from transformers import AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import RunnerOption -from vllm.sequence import SampleLogprobs +from vllm.logprobs import SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset, diff --git a/tests/models/utils.py b/tests/models/utils.py index 76c6e4823a12c..5da2382cef814 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -12,7 +12,7 @@ from transformers import PretrainedConfig from vllm.config import ModelConfig, ModelDType, RunnerOption from vllm.inputs import InputContext -from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs +from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from .registry import HF_EXAMPLE_MODELS diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index bd2b91073d568..fe6c313d2966f 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -8,10 +8,7 @@ import pytest from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.inputs import token_inputs -from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer, @@ -217,138 +214,3 @@ def test_oov_decode(tokenizer, fast): assert decoded_text == '' assert out_ids == [len(tokenizer)] - - -@pytest.fixture -def detokenizer(tokenizer_name: str) -> Detokenizer: - tokenizer = get_tokenizer( - tokenizer_name, - tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto", - trust_remote_code=False, - revision=None, - ) - - return Detokenizer(tokenizer) - - -@pytest.fixture(name="complete_sequence_token_ids") -def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer) -> list[int]: - return tokenizer(complete_sequence, add_special_tokens=False).input_ids - - -def create_sequence(prompt_token_ids=None): - prompt_token_ids = prompt_token_ids or [] - return Sequence( - seq_id=0, - inputs=token_inputs(prompt_token_ids), - block_size=16, - ) - - -def create_dummy_logprobs( - complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]: - return [{ - token_id: Logprob(logprob=0.0), - token_id + 1: Logprob(logprob=0.1) - } for token_id in complete_sequence_token_ids] - - -def create_dummy_prompt_logprobs( - complete_sequence_token_ids: list[int] -) -> list[Optional[dict[int, Any]]]: - # logprob for the first prompt token is None. 
- logprobs: list[Optional[dict[int, Any]]] = [None] - logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:]) - return logprobs - - -@pytest.mark.parametrize("complete_sequence", TRUTH) -@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) -def test_decode_sequence_logprobs(complete_sequence: str, - complete_sequence_token_ids: list[int], - detokenizer: Detokenizer, - skip_special_tokens: bool): - """Verify Detokenizer decodes logprobs correctly.""" - sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - logprobs=2) - - # Run sequentially. - seq = create_sequence() - dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token: list[str] = [] - sequential_logprobs_text_other_token: list[str] = [] - for new_token, logprobs in zip(complete_sequence_token_ids, - dummy_logprobs): - seq.append_token_id(new_token, logprobs) - detokenizer.decode_sequence_inplace(seq, sampling_params) - sequential_logprobs_text_chosen_token.append( - seq.output_logprobs[-1][new_token].decoded_token) - sequential_logprobs_text_other_token.append( - seq.output_logprobs[-1][new_token + 1].decoded_token) - sequential_result = seq.output_text - - assert sequential_result == "".join(sequential_logprobs_text_chosen_token) - assert sequential_result != "".join(sequential_logprobs_text_other_token) - - if not skip_special_tokens: - # Text for logprobs for the chosen token should be the same as the - # generated text. Note that this will only be true if we skip - # special tokens. - assert sequential_result == complete_sequence - - -@pytest.mark.parametrize("complete_sequence", TRUTH) -@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -def test_decode_prompt_logprobs(complete_sequence: str, - complete_sequence_token_ids: list[int], - detokenizer: Detokenizer): - - # We want to use skip_special_tokens=False here but Mistral tokenizers - # don't support that. - if complete_sequence not in SPECIAL_TOKS_TRUTH: - skip_special_tokens = True - elif not isinstance(detokenizer.tokenizer, MistralTokenizer): - skip_special_tokens = False - else: - pytest.skip("MistralTokenizers don't support " - "skip_special_tokens=False") - return - """Verify Detokenizer decodes prompt logprobs correctly.""" - sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - prompt_logprobs=1) - - # Run sequentially. - seq = create_sequence(complete_sequence_token_ids) - seq_group = SequenceGroup(request_id="1", - seqs=[seq], - sampling_params=sampling_params, - arrival_time=0.0) - dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids) - detokenizer.decode_prompt_logprobs_inplace(seq_group, - dummy_logprobs, - position_offset=0) - # First logprob is None. - decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[ - 1:] # type: ignore - - # decoded_prompt_logprobs doesn't contain the first token. - token_ids = complete_sequence_token_ids - tokenizer = detokenizer.tokenizer - text_full = tokenizer.decode(token_ids, - skip_special_tokens=skip_special_tokens) - text_first = tokenizer.decode(token_ids[0], - skip_special_tokens=skip_special_tokens) - text = text_full[len(text_first):] - - # Text for logprobs for the chosen token should be the same as the - # prompt text. Note that the first logprob is None. 
- assert text == "".join([ - logprobs[token_id].decoded_token - for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) - ]) - assert text != "".join([ - logprobs[token_id + 1].decoded_token - for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs) - ]) diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 35153139350bf..57ace1fa22ac9 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -12,7 +12,7 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import JambaToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer MODEL = "ai21labs/Jamba-tiny-dev" diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index ccb2acf512caf..f06fb2b9f2f04 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ToolCall) from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser) -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index c276a598aa68c..118c7534622e2 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer # Use a common model that is likely to be available diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index 0bc22e4f1031c..c07ca0f56d6b0 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import xLAMToolParser -from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer # Use a common model that is likely to be available diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index a9632ce54eac8..bdb40be99aa3f 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -12,9 +12,9 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, STOP_STRINGS, DummyOutputProcessorTestVectors, MockEngineCore) +from vllm.logprobs import 
PromptLogprobs, SampleLogprobs from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.output_processor import (OutputProcessor, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b75b94ad0acc2..fd4b992c3821b 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -15,10 +15,10 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.tasks import SupportedTask from vllm.utils import make_async +from vllm.v1.outputs import SamplerOutput from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 78d0ee6c1e3fc..84747575b4960 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -17,12 +17,12 @@ from vllm.executor.msgspec_utils import encode_hook from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster, ray) from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform from vllm.ray.ray_env import get_env_vars_to_copy from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, make_async) +from vllm.v1.outputs import SamplerOutput if ray is not None: from ray.actor import ActorHandle diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index e9db2a0dc13a8..46f49aaa013da 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -7,15 +7,7 @@ from .data import (DataPrompt, DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, build_explicit_enc_dec_prompt, embeds_inputs, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import (DummyData, InputContext, InputProcessingContext, - InputRegistry) - -INPUT_REGISTRY = InputRegistry() -""" -The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used -by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the -target model. 
-""" +from .registry import InputContext, InputProcessingContext __all__ = [ "DataPrompt", @@ -36,9 +28,6 @@ __all__ = [ "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", - "INPUT_REGISTRY", - "DummyData", "InputContext", "InputProcessingContext", - "InputRegistry", ] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f0b392e9767ae..b5316b6d0574c 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +from typing import TYPE_CHECKING, Any, Union import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin @@ -15,16 +15,9 @@ from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: from vllm.config import ModelConfig - from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict, - MultiModalRegistry) - from vllm.sequence import SequenceData from vllm.transformers_utils.tokenizer import AnyTokenizer else: ModelConfig = Any - MultiModalDataDict = Any - MultiModalPlaceholderDict = Any - MultiModalRegistry = Any - SequenceData = Any AnyTokenizer = Any _T = TypeVar("_T") @@ -191,61 +184,3 @@ class InputProcessingContext(InputContext): f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc - - -class DummyData(NamedTuple): - """ - Dummy data used for profiling. - - Note: This is only used in V0. - """ - - seq_data: SequenceData - multi_modal_data: Optional[MultiModalDataDict] = None - multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - - -class InputRegistry: - """ - Note: This is only used in V0. - """ - - def dummy_data_for_profiling( - self, - model_config: ModelConfig, - seq_len: int, - mm_registry: MultiModalRegistry, - is_encoder_data: bool = False, - ) -> DummyData: - """ - Create dummy data for profiling the memory usage of a model. - - The model is identified by ``model_config``. 
- """ - # Avoid circular import - from vllm.multimodal.cache import processor_only_cache_from_config - from vllm.sequence import SequenceData - - if not model_config.is_multimodal_model: - seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - return DummyData(seq_data=seq_data) - - cache = processor_only_cache_from_config(model_config, mm_registry) - - # Encoder dummy data does not contain multi-modal data - if is_encoder_data: - enc_data = mm_registry.get_encoder_dummy_data(model_config, - seq_len, - cache=cache) - seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids) - return DummyData(seq_data=seq_data) - - dec_data = mm_registry.get_decoder_dummy_data(model_config, - seq_len, - cache=cache) - - return DummyData( - seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data.get_data(), - multi_modal_placeholders=dec_data.multi_modal_placeholders, - ) diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 55dfe8088c8f3..a59aebfac4ff9 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -3,13 +3,11 @@ from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) -from vllm.model_executor.sampling_metadata import (SamplingMetadata, - SamplingMetadataCache) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ "SamplingMetadata", - "SamplingMetadataCache", "set_random_seed", "BasevLLMParameter", "PackedvLLMParameter", diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 8a4ac214443eb..8226437cb1898 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,13 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that compute logits from hidden_stats.""" -import inspect -from concurrent.futures import ThreadPoolExecutor from typing import Optional import torch -import vllm.envs as envs from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.custom_op import CustomOp @@ -16,11 +13,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None -if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: - _logits_processor_threadpool = ThreadPoolExecutor( - envs.VLLM_LOGITS_PROCESSOR_THREADS) - @CustomOp.register("logits_processor") class LogitsProcessor(CustomOp): @@ -60,15 +52,10 @@ class LogitsProcessor(CustomOp): hidden_states: torch.Tensor, sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[torch.Tensor] = None, - prune_hidden_states: bool = True, ) -> Optional[torch.Tensor]: if self.logits_as_input: logits = hidden_states else: - if sampling_metadata is not None and prune_hidden_states: - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) - # Get the logits for the next tokens. logits = self._get_logits(hidden_states, lm_head, embedding_bias) if logits is not None: @@ -79,12 +66,6 @@ class LogitsProcessor(CustomOp): if self.scale != 1.0: logits *= self.scale - - # Apply logits processors (if any). 
- if sampling_metadata is not None and \ - sampling_metadata.seq_groups is not None: - logits = _apply_logits_processors(logits, sampling_metadata) - return logits def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor: @@ -125,75 +106,3 @@ class LogitsProcessor(CustomOp): s += f", org_vocab_size={self.org_vocab_size}" s += f", scale={self.scale}, logits_as_input={self.logits_as_input}" return s - - -def _prune_hidden_states( - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios - # (warmup, profile_run) we might not have selected_token_indices, - # so we skip pruning. - if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select( - 0, sampling_metadata.selected_token_indices) - else: - return hidden_states - - -def _apply_logits_processors( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - found_logits_processors = False - logits_processed = 0 - logits_row_ids_and_logits_row_futures = [] - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - logits_processors = sampling_params.logits_processors - if logits_processors: - found_logits_processors = True - - for seq_id, logits_row_idx in zip(seq_ids, - seq_group.sample_indices): - logits_row = logits[logits_row_idx] - past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids - prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - - if _logits_processor_threadpool is not None: - logits_row_ids_and_logits_row_futures.append( - (logits_row_idx, - _logits_processor_threadpool.submit( - _apply_logits_processors_single_seq, logits_row, - logits_processors, past_tokens_ids, - prompt_tokens_ids))) - else: - logits[logits_row_idx] = \ - _apply_logits_processors_single_seq( - logits_row, logits_processors, past_tokens_ids, - prompt_tokens_ids) - - logits_processed += len(seq_group.sample_indices) + len( - seq_group.prompt_logprob_indices) - - for logits_row_idx, future in logits_row_ids_and_logits_row_futures: - logits[logits_row_idx] = future.result() - - if found_logits_processors: - # verifies that no rows in logits were missed unexpectedly - assert logits_processed == logits.shape[0] - return logits - - -def _apply_logits_processors_single_seq(logits_row, logits_processors, - past_tokens_ids, - prompt_tokens_ids) -> torch.Tensor: - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, logits_row) - return logits_row diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py deleted file mode 100644 index 9d93cad2420ad..0000000000000 --- a/vllm/model_executor/layers/sampler.py +++ /dev/null @@ -1,1198 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A layer that samples the next tokens from the model's outputs.""" -import itertools -from collections.abc import Iterator -from dataclasses import dataclass -from importlib.util import find_spec -from math import inf -from typing import Optional, Union - -import msgspec -import torch -import torch.nn as nn - -import vllm.envs as envs -from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs -from 
vllm.model_executor.layers.utils import apply_penalties -from vllm.model_executor.sampling_metadata import (SamplingMetadata, - SamplingTensors, - SequenceGroupToSample) -from vllm.sampling_params import SamplingType -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, SequenceOutput) - -if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): - # yapf: disable - from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) - - # yapf: enable -else: - flashinfer_top_k_top_p_sampling = None - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -def get_sampler() -> torch.nn.Module: - if envs.VLLM_USE_V1: - # Lazy import: the v1 package isn't distributed - from vllm.v1.sample.sampler import Sampler as V1Sampler - return V1Sampler() - return Sampler() - - -# (num_token_ids, num_parent_ids) per sequence group. -SampleResultType = list[tuple[list[int], list[int]]] - -# Types of temporary data structures used for -# computing sample_result -SampleMetadataType = dict[SamplingType, tuple[list[int], - list[SequenceGroupToSample]]] -MultinomialSamplesType = dict[SamplingType, torch.Tensor] -SampleResultsDictType = dict[int, tuple[list[int], list[int]]] - - -# Encapsulates temporary data structures for computing -# sample_result. -# -# * For multi-step scheduling: must be returned -# by `Sampler.forward()` and used later to compute the pythonized -# sample_result -# -# * For single-step scheduling: consumed immediately -# inside `Sampler.forward()` to compute pythonized sample_result. -@dataclass -class SampleResultArgsType: - sample_metadata: SampleMetadataType - multinomial_samples: MultinomialSamplesType - sample_results_dict: SampleResultsDictType - sampling_metadata: SamplingMetadata - greedy_samples: Optional[torch.Tensor] - - -# Union of non-deferred (single-step scheduling) -# vs deferred (multi-step scheduling) -# sample result types -MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] - -# Abbreviation of the _sample() return type -SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] - - -class SamplerOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """For each sequence group, we generate a list of SequenceOutput object, - each of which contains one possible candidate for the next token. - - This data structure implements methods, so it can be used like a list, but - also has optional fields for device tensors. - """ - - outputs: list[CompletionSequenceGroupOutput] - - # On-device tensor containing probabilities of each token. - sampled_token_probs: Optional[torch.Tensor] = None - - # On-device tensor containing the logprobs of each token. - logprobs: Optional["torch.Tensor"] = None - - # Holds either (1) the pythonized sampler result (single-step scheduling) - # or (2) what will be arguments for later deferred pythonization of the - # sampler result (muliti-step scheduling) - deferred_sample_results_args: Optional[SampleResultArgsType] = None - - # On-device tensor containing the sampled token ids. - sampled_token_ids: Optional[torch.Tensor] = None - # CPU tensor containing the sampled token ids. Used during multi-step to - # return the sampled token ids from last rank to AsyncLLMEngine to be - # 'broadcasted' to all other PP ranks for next step. 
- sampled_token_ids_cpu: Optional[torch.Tensor] = None - - # On-device tensor containing the sampled token embeddings (embeddings - # corresponding to the sampled token ids). Used when prompt embeddings are - # specified in lieu of prompt token ids or text. - sampled_token_embeds: Optional[torch.Tensor] = None - - # Optional last hidden states from the model. - hidden_states: Optional[torch.Tensor] = None - - # Optional prefill hidden states from the model - # (used for models like EAGLE). - prefill_hidden_states: Optional[torch.Tensor] = None - - # Time taken in the forward pass for this across all workers - model_forward_time: Optional[float] = None - - # Time taken in the model execute function. This will include model forward, - # block/sync across workers, cpu-gpu sync time and sampling time. - model_execute_time: Optional[float] = None - - def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput: - return self.outputs[idx] - - def __setitem__(self, idx: int, value): - self.outputs[idx] = value - - def __iter__(self) -> Iterator[CompletionSequenceGroupOutput]: - return iter(self.outputs) - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs - - def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise. - """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None - else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else - self.sampled_token_ids.shape) - return (f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr})") - - -class Sampler(nn.Module): - """Samples the next tokens from the model's outputs. - - This layer does the following: - 1. Discard the hidden states that are not used for sampling (i.e., all - tokens except the final one in each prompt). - 2. Compute the logits for the next tokens. - 3. Apply presence, frequency and repetition penalties. - 4. Apply temperature scaling. - 5. Apply top-p and top-k truncation. - 6. Sample the next tokens. - Here, each sequence group within the batch can have different sampling - parameters (e.g., sampling method, temperature, top-p, top-k, etc.). - - The structure of the logits tensor is coupled with the seq_groups in - sampling_metadata. Typically, each sequence in each seq_group has one row in - logits for the next token to be sampled; however, for a seq_group with a - prompt request with the prompt_logprobs sampling parameter, there are rows - in logits for each token in the input prompt. - """ - - def __init__(self): - super().__init__() - - # Whether or not the SamplerOutput should have on-device tensors - # containing the sampled token ids and probabilities. This is used by - # speculative decoding and when prompt embeddings are specified. - self.include_gpu_probs_tensor = False - self.should_modify_greedy_probs_inplace = False - - def _init_sampling_tensors( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ): - """The goal here is to reuse sampling tensors between similar decode - runs. This is possible because sampling logic does not change between - decodes of the same sequences. - """ - _, vocab_size = logits.shape - - # First free any existing stored sampling tensors. - # This is necessary because some sampling tensors may - # have pinned memory. 
- self._sampling_tensors = None - - # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, - do_min_p) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) - - self._sampling_tensors = sampling_tensors - self._do_penalties = do_penalties - self._do_top_p_top_k = do_top_p_top_k - self._do_min_p = do_min_p - - def forward( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - """ - Single-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Pythonize sampling result & logprobs tensor - - Multi-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Defer Pythonization of sampling result & logprobs - tensor - * Encapsulate arguments required for deferred Pythonization - in the - [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput] - structure - - Args: - logits: (num_tokens, vocab_size). - sampling_metadata: Metadata for sampling. - """ - assert logits is not None - _, vocab_size = logits.shape - - # Prepare sampling tensors with pinned memory to avoid blocking. - if not sampling_metadata.reuse_sampling_tensors: - self._init_sampling_tensors(logits, sampling_metadata) - elif self._do_penalties: - # In this case, the sampling tensors logic depends on - # "output_tokens" of a sequence. As a result, we cannot - # reuse sampling tensors, since "output_tokens" changes - # between decode runs. - self._init_sampling_tensors(logits, sampling_metadata) - - assert self._sampling_tensors is not None - sampling_tensors = self._sampling_tensors - do_penalties = self._do_penalties - do_top_p_top_k = self._do_top_p_top_k - do_min_p = self._do_min_p - - logits = _apply_min_tokens_penalty(logits, sampling_metadata) - - # Apply presence and frequency penalties. - if do_penalties: - logits = apply_penalties(logits, sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties) - - # Use float32 to apply temperature scaling. - # Use in-place division to avoid creating a new tensor. - logits = logits.to(torch.float) - logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - - if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) - - if do_min_p: - logits = _apply_min_p(logits, sampling_tensors.min_ps) - - # We use float32 for probabilities and log probabilities. - # Compute the probabilities. - probs = torch.softmax(logits, dim=-1, dtype=torch.float) - # Compute the log probabilities. - logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) - - # Sample the next tokens. - maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=self.include_gpu_probs_tensor, - modify_greedy_probs=self._should_modify_greedy_probs_inplace, - ) - - if self.include_gpu_probs_tensor: - # Since we will defer sampler result Pythonization, - # preserve GPU-side tensors in support of later - # deferred pythonization of logprobs - assert maybe_sampled_tokens_tensor is not None - on_device_tensors = (probs, logprobs, maybe_sampled_tokens_tensor) - else: - # Since Pythonization has already happened, don't preserve - # GPU-side tensors. 
- on_device_tensors = None - - # Get the logprobs query results. - prompt_logprobs = None - sample_logprobs = None - if not sampling_metadata.skip_sampler_cpu_output: - # Pythonize logprobs now (GPU -> CPU); do not defer. - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results) - - return _build_sampler_output( - maybe_deferred_sample_results, - sampling_metadata, - prompt_logprobs, - sample_logprobs, - on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) - - @property - def _should_modify_greedy_probs_inplace(self) -> bool: - """Whether or not the sampler should modify the probability distribution - of greedily-sampled tokens such that multinomial sampling would sample - the greedily-sampled token. - - In other words, if True then we set the probability of the greedily- - sampled token to 1. - - This is used by speculative decoding, which requires that the sampling - method be encoded into the probability distribution. - """ - return self.should_modify_greedy_probs_inplace - - -def _apply_min_tokens_penalty( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet - """ - # list of indices in logits that will be set to -inf - logits_to_penalize: list[tuple[int, int]] = [] - logits_applied = 0 - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - - sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len( - seq_group.prompt_logprob_indices) - if not seq_group.do_sample: - continue - - start_idx = sample_indices[0] - min_tokens = sampling_params.min_tokens - token_ids_to_penalize = sampling_params.all_stop_token_ids - if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize: list[int] = [] - for j, seq_id in enumerate(seq_ids): - seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids_array) < min_tokens: - seqs_to_penalize.append(j) - - if seqs_to_penalize: - # convert to the index into logits - seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] - # itertools.product pairs each seq index with every token id - logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize)) - - if logits_to_penalize: - # use zip and * to group indices along each dimension - # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) ) - logits[tuple(zip(*logits_to_penalize))] = -float("inf") - - # verifies that no rows in logits were missed unexpectedly - assert logits_applied == logits.shape[0] - return logits - - -def _apply_top_k_top_p( - logits: torch.Tensor, - p: torch.Tensor, - k: torch.Tensor, -) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(dim=-1, descending=False) - - # Apply top-k. - top_k_mask = logits_sort.size(1) - k.to(torch.long) - # Get all the top_k values. - top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) - top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill_(top_k_mask, -float("inf")) - - # Apply top-p. - probs_sort = logits_sort.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) - top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) - # at least one - top_p_mask[:, -1] = False - logits_sort.masked_fill_(top_p_mask, -float("inf")) - - # Re-sort the probabilities. 
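# ---------------------------------------------------------------------------
# Why the scatter below restores the original token order: `sort` returns
# indices such that logits_sort[..., j] == logits[..., logits_idx[..., j]],
# so scattering logits_sort back through logits_idx applies the inverse
# permutation. A tiny standalone check (illustrative only, not part of this
# patch):
#
#   import torch
#   x = torch.tensor([[3., 1., 2.]])
#   x_sort, idx = x.sort(dim=-1)        # tensor([[1., 2., 3.]])
#   restored = torch.empty_like(x_sort).scatter_(dim=-1, index=idx, src=x_sort)
#   assert torch.equal(restored, x)     # original order recovered
# ---------------------------------------------------------------------------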
- logits = torch.empty_like(logits_sort).scatter_(dim=-1, - index=logits_idx, - src=logits_sort) - return logits - - -def _apply_min_p( - logits: torch.Tensor, - min_p: torch.Tensor, -) -> torch.Tensor: - """ - Adapted from - https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 - """ - probs = torch.softmax(logits, dim=-1) - top_probs, _ = probs.max(dim=-1, keepdim=True) - scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs - tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill_(tokens_to_remove, -float("inf")) - - return logits - - -def _greedy_sample( - selected_seq_groups: list[SequenceGroupToSample], - samples: torch.Tensor, -) -> SampleResultType: - """Run greedy sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - samples: (num_selected_samples,) A tensor of samples. The length of - samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - samples_lst = samples.tolist() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, ( - "Greedy sampling should have only one seq.") - parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples_lst[sample_idx]] - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -def _random_sample( - selected_seq_groups: list[SequenceGroupToSample], - random_samples: torch.Tensor, -) -> SampleResultType: - """Run random sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - random_samples: (num_selected_samples,) A tensor of samples. The - length of samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - # Find the maximum n value of the prompt phase requests. - random_samples = random_samples.cpu() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - num_parent_seqs = len(seq_ids) - if is_prompt: - # Prompt phase. - parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[ - sample_idx, :sampling_params.n].tolist() - else: - # Generation phase. - parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[sample_idx:sample_idx + - num_parent_seqs, 0].tolist() - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. 
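# ---------------------------------------------------------------------------
# The trick used in _multinomial below: with i.i.d. q_i ~ Exp(1), the index
# argmax_i(p_i / q_i) is distributed proportionally to p_i (the exponential
# form of the Gumbel-max trick), so categorical sampling reduces to an
# elementwise divide plus an argmax and avoids the GPU<->CPU sync of
# torch.multinomial. A quick empirical check (illustrative only):
#
#   import torch
#   probs = torch.tensor([0.1, 0.2, 0.7])
#   counts = torch.zeros(3)
#   for _ in range(10_000):
#       q = torch.empty_like(probs).exponential_()   # q_i ~ Exp(1)
#       counts[torch.argmax(probs / q)] += 1
#   print(counts / counts.sum())                     # approximately probs
# ---------------------------------------------------------------------------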
-def _multinomial( - probs: torch.Tensor, - num_samples: int, - seq_groups: Optional[list[SequenceGroupToSample]] = None, -) -> torch.Tensor: - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - q = torch.empty_like(probs) - if seq_groups is None: - q.exponential_() - else: - sample_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group.seq_ids - stride = len(seq_ids) * num_samples - assert seq_group.generator is not None - q[sample_idx:sample_idx + - stride].exponential_(generator=seq_group.generator) - sample_idx += stride - return probs.div_(q).argmax(dim=1).view(-1, num_samples) - - -def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, - num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - top_ks = top_ks.repeat_interleave(num_samples) - top_ps = top_ps.repeat_interleave(num_samples) - batch_next_token_ids = flashinfer_top_k_top_p_sampling( - probs, - top_ks, - top_ps, - ) - return batch_next_token_ids.view(-1, num_samples) - - -def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType) -> SampleResultType: - '''This function consumes GPU-side sampler results and computes - Pythonized CPU-side sampler results (GPU -> CPU sync.) - - Single-step scheduling: this function is invoked at sampling-time - for immediate Pythonization. - - Multi-step scheduling: Pythonization is deferred until after multiple - GPU-side steps have been completed. - - Args: - sample_result_args: GPU-side inputs to the Pythonization process - - Returns: - Pythonized sampler results - ''' - - ( - sample_metadata, - sampling_metadata, - greedy_samples, - multinomial_samples, - sample_results_dict, - ) = ( - sample_result_args.sample_metadata, - sample_result_args.sampling_metadata, - sample_result_args.greedy_samples, - sample_result_args.multinomial_samples, - sample_result_args.sample_results_dict, - ) - - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - (seq_group_id, seq_groups) = sample_metadata[sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample(seq_groups, - multinomial_samples[sampling_type]) - sample_results_dict.update(zip(seq_group_id, sample_results)) - - return [ - sample_results_dict.get(i, ([], [])) - for i in range(len(sampling_metadata.seq_groups)) - ] - - -def _sample_with_torch( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - '''Torch-oriented _sample() implementation. 
- - Single-step scheduling: - * Perform GPU-side sampling computation - * Immediately Pythonize sampling result - - Multi-step scheduling: - * Perform GPU-side sampling computation - * Defer Pythonization & preserve GPU-side - tensors required for Pythonization - ''' - - categorized_seq_group_ids: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - categorized_sample_indices = sampling_metadata.categorized_sample_indices - for i, seq_group in enumerate(sampling_metadata.seq_groups): - sampling_params = seq_group.sampling_params - sampling_type = sampling_params.sampling_type - categorized_seq_group_ids[sampling_type].append(i) - - sample_results_dict: SampleResultsDictType = {} - sample_metadata: SampleMetadataType = {} - multinomial_samples: MultinomialSamplesType = {} - greedy_samples: Optional[torch.Tensor] = None - - # Create output tensor for sampled token ids. - if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device) - else: - sampled_token_ids_tensor = None - - # Counterintuitively, having two loops here is actually faster. - # The first loop can run without waiting on GPU<->CPU sync. - for sampling_type in SamplingType: - sample_indices = categorized_sample_indices[sampling_type] - num_tokens = len(sample_indices) - if num_tokens == 0: - continue - - seq_group_id = categorized_seq_group_ids[sampling_type] - seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_id] - sample_metadata[sampling_type] = (seq_group_id, seq_groups) - long_sample_indices = sample_indices.long() - if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], - dim=-1) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[ - long_sample_indices] = greedy_samples.unsqueeze(-1) - - if modify_greedy_probs: - # If required, modify the probabilities such that sampling from - # the modified distribution would always sample the argmax - # token id. - _modify_greedy_probs_inplace(logprobs, probs, - long_sample_indices, - greedy_samples) - - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - max_n_in_batch = 1 - for seq_group in seq_groups: - if seq_group.is_prompt: - sampling_params = seq_group.sampling_params - max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else - seq_groups) - - if flashinfer_top_k_top_p_sampling is not None: - logger.warning("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") - - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], - max_n_in_batch, - seq_groups=seq_groups_arg) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = \ - multinomial_samples[sampling_type].to(torch.long) - - else: - raise ValueError(f"Unsupported sampling type: {sampling_type}") - - # Encapsulate arguments for computing Pythonized sampler - # results, whether deferred or otherwise. - maybe_deferred_args = SampleResultArgsType( - sampling_metadata=sampling_metadata, - sample_metadata=sample_metadata, - multinomial_samples=multinomial_samples, - greedy_samples=greedy_samples, - sample_results_dict=sample_results_dict) - - if not sampling_metadata.skip_sampler_cpu_output: - # GPU<->CPU sync happens here. 
- # This also converts the sampler output to a Python object. - # Return Pythonized sampler result & sampled token ids - return get_pythonized_sample_results( - maybe_deferred_args), sampled_token_ids_tensor - else: - # Defer sampler result Pythonization; return deferred - # Pythonization args & sampled token ids - return ( - maybe_deferred_args, - sampled_token_ids_tensor, - ) - - -def _sample( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - """ - Args: - probs: (num_query_tokens_in_batch, num_vocab) - logprobs: (num_query_tokens_in_batch, num_vocab) - sampling_metadata: The metadata for a batch for sampling. - sampling_tensors: Tensors that include sampling related metadata. - - Returns: - (next_token_ids, parent_seq_ids) for each seq group in a batch. - If sampling is skipped, it returns ([], []) - sampled_token_ids_tensor: A tensor of sampled token ids. - """ - return _sample_with_torch( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=include_gpu_probs_tensor, - modify_greedy_probs=modify_greedy_probs, - ) - - -def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: - """ - This function calculates the ranks of the chosen tokens in a logprob tensor. - - Args: - x (torch.Tensor): 2D logprob tensor of shape (N, M) - where N is the no. of tokens and M is the vocab dim. - indices (torch.Tensor): List of chosen token indices. - - Returns: - torch.Tensor: 1D tensor of shape (N,) where N is the no. of tokens. - Each element in the returned tensor represents the rank - of the chosen token in the input logprob tensor. - """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), - indices] - result = (x > vals[:, None]) - del vals - return result.sum(1).add_(1) - - -def get_logprobs( - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sample_results: SampleResultType, -) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: - """Return sample logprobs and prompt logprobs. - - The logic consists of 3 parts. - - Select indices to compute logprob from, ranks of token ids, and - the top k token ids from logprobs. - - Compute prompt logprobs if required. - - Compute sample logprobs if required. - - Args: - logprobs: (num_query_tokens_across_batch, num_vocab). Each query token's - logprob per vocab. Sequence groups' query tokens are batched in a - single flattened tensor. For example, assuming there are N - seq groups, it is sorted by prefill tokens for seq_group_1 (if - prompt logprob is enabled), decode tokens for seq_group_1 (if - sampling is required), prefill tokens for seq_group_2, ... - sampling_metadata: The sampling metadata. - sample_results: (num_seq_groups) The tuple of (next_token_ids, - parent_ids) for each sequence group. When beam search is enabled, - sample_results can contain different number of seq_ids from - sampling_metadata.seq_groups. It is because beam search creates - 2 * BEAM_WIDTH number of samples (whereas there are only up to - BEAM_WIDTH number of seq_ids). - - Returns: - A tuple of prompt and sample logprobs per sequence group in a batch. - """ - # The index of query token to calculate logprobs. It includes both - # prompt and sample logprob indices. - query_indices: list[int] = [] - # The next token ids to get the logprob value from. 
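# ---------------------------------------------------------------------------
# How _get_ranks above computes ranks: a token's rank is one plus the number
# of vocab entries with a strictly larger logprob, so rank 1 means the chosen
# token was the most likely one. A worked example (illustrative only):
#
#   import torch
#   logprobs = torch.tensor([[0.0, -1.0, -2.0],
#                            [-3.0, -0.5, -1.5]])
#   chosen = torch.tensor([2, 1])                    # chosen token per row
#   vals = logprobs[torch.arange(2), chosen]
#   ranks = (logprobs > vals[:, None]).sum(1) + 1
#   print(ranks.tolist())                            # [3, 1]
# ---------------------------------------------------------------------------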
-    next_token_ids: list[int] = []
-    # The largest requested number of logprobs. We find as many logprobs as
-    # the largest num_logprobs requested in this API. If no logprobs are
-    # requested, it stays -1.
-    largest_num_logprobs = -1
-
-    # Select indices to compute logprob from, ranks of token ids, and the top
-    # k token ids from logprobs.
-    for (seq_group, sample_result) in zip(sampling_metadata.seq_groups,
-                                          sample_results):
-        sampling_params = seq_group.sampling_params
-
-        # Update indices and tokens for prompt logprobs.
-        if (seq_group.is_prompt
-                and sampling_params.prompt_logprobs is not None):
-            largest_num_logprobs = max(largest_num_logprobs,
-                                       sampling_params.prompt_logprobs)
-            next_prompt_tokens = _get_next_prompt_tokens(seq_group)
-            query_indices.extend(seq_group.prompt_logprob_indices)
-            next_token_ids.extend(next_prompt_tokens)
-
-        # Update indices and next tokens for sample logprob.
-        if seq_group.do_sample:
-            token_ids, parent_seq_ids = sample_result
-            # NOTE: We cannot directly use sample_indices because
-            # sample_indices only contain parent seq_ids of a previous step.
-            # The current step may have different number of seq_ids, and
-            # we can obtain it from `sample_result[1]`.
-            query_idx = seq_group.sample_indices[0]
-            query_indices.extend(
-                [query_idx + parent_id for parent_id in parent_seq_ids])
-            next_token_ids.extend(token_ids)
-
-            if sampling_params.logprobs is not None:
-                largest_num_logprobs = max(largest_num_logprobs,
-                                           sampling_params.logprobs)
-
-    assert len(next_token_ids) == len(query_indices)
-
-    if len(query_indices) == 0:
-        empty_sampled_logprob: SampleLogprobs = []
-        empty_prompt_logprob: Optional[PromptLogprobs] = None
-        num_seq_groups = len(sampling_metadata.seq_groups)
-        return [empty_prompt_logprob
-                ] * num_seq_groups, [empty_sampled_logprob] * num_seq_groups
-
-    selected_logprobs, ranks = None, None
-    top_logprobs, top_token_ids = None, None
-
-    # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can
-    # skip the whole logprob calculation.
-    if largest_num_logprobs >= 0:
-        query_indices_gpu = torch.tensor(query_indices, device=logprobs.device)
-        next_token_ids_gpu = torch.tensor(next_token_ids,
-                                          device=logprobs.device)
-
-        # (num_selected_query_tokens, num_logprobs). Note that query_indices can
-        # contain duplicates if beam search is enabled.
-        selected_logprobs = logprobs[[
-            query_indices_gpu,
-            next_token_ids_gpu,
-        ]]
-        ranks = _get_ranks(
-            logprobs[query_indices_gpu],
-            next_token_ids_gpu,
-        )
-        assert selected_logprobs.shape[0] == ranks.shape[0]
-
-        # We need to compute top-k only if some request asked for logprobs > 0.
-        if largest_num_logprobs > 0:
-            # Logprobs of topk tokens for a batch of sequence groups.
-            # (num_query_tokens_across_batch).
-            top_logprobs, top_token_ids = torch.topk(logprobs,
-                                                     largest_num_logprobs,
-                                                     dim=-1)
-            top_logprobs = top_logprobs.to('cpu')
-            top_token_ids = top_token_ids.to('cpu')
-
-        selected_logprobs = selected_logprobs.to('cpu')
-        ranks = ranks.to('cpu')
-
-    # Find prompt/sample logprobs.
- prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] - sample_logprobs_per_seq_group: list[SampleLogprobs] = [] - top_logprob_idx = 0 - selected_logprobs_idx = 0 - - for seq_group, sample_result in zip(sampling_metadata.seq_groups, - sample_results): - (prompt_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_prompt_logprob_if_needed( - seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, - selected_logprobs_idx, top_logprob_idx) - prompt_logprobs_per_seq_group.append(prompt_logprobs) - - (sampled_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_sampled_logprob_if_needed( - seq_group, sample_result, selected_logprobs, ranks, top_token_ids, - top_logprobs, selected_logprobs_idx, top_logprob_idx) - sample_logprobs_per_seq_group.append(sampled_logprobs) - - return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group - - -def _get_prompt_logprob_if_needed( - seq_group: SequenceGroupToSample, - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the prompt logprob from a sequence group if needed.""" - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - - # Find prompt logprobs - prompt_logprobs: Optional[PromptLogprobs] = None - if is_prompt and sampling_params.prompt_logprobs is not None: - prompt_logprobs = [] - num_logprobs = sampling_params.prompt_logprobs - next_prompt_tokens = _get_next_prompt_tokens(seq_group) - # Pre-select indexes and create a list. It is faster than calling .item - # repetitively. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - - for idx, token_id in enumerate(next_prompt_tokens): - # Calculate the prompt logprob of the real prompt tokens. - # {token_id: (logprob, rank_from_vocab)} - prompt_logprobs_dict: dict[int, tuple[float, int]] = { - token_id: (selected_logprob_items[idx], rank_items[idx]) - } - - # Add top K prompt logprobs along with its rank. - if num_logprobs > 0: - top_ids = top_token_ids[ - top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, - top_ranks) - }) - prompt_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - }) - # + 1 to go to the next prompt token. - top_logprob_idx += 1 - - # + len(next_prompt_tokens) to go to the next prompt. 
- selected_logprobs_idx += len(next_prompt_tokens) - return prompt_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _get_sampled_logprob_if_needed( - seq_group: SequenceGroupToSample, - sample_result: tuple[list[int], list[int]], - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the sample logprob if needed.""" - seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.logprobs - sampled_logprobs: SampleLogprobs = [] - next_token_ids, parent_seq_ids = sample_result - - if seq_group.do_sample: - assert len(next_token_ids) > 0 - if num_logprobs is None: - for next_token_id in next_token_ids: - # Use a dummy logprob - sampled_logprobs.append({next_token_id: Logprob(inf)}) - else: - # Pre-select items from tensor. tolist() is faster than repetitive - # `.item()` calls. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids)): - # Get the logprob of a sampled token. - sampled_logprobs_dict = { - next_token_id: - (selected_logprob_items[idx], rank_items[idx]) - } - if num_logprobs is not None and num_logprobs > 0: - # Get top K logprobs. - top_ids = top_token_ids[top_logprob_idx + - parent_id, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks) - }) - - sampled_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in - sampled_logprobs_dict.items() - }) - - # NOTE: This part of code is not intuitive. `selected_logprobs` include - # logprobs for the current step, which has len(next_token_ids) tokens - # per sequence group. `logprobs` includes logprobs from the previous - # steps, which has len(seq_ids) tokens per sequence group. - - # Iterate to the next sequence group in a batch. - selected_logprobs_idx += len(next_token_ids) - # Iterate to the next sequence group in a batch. - top_logprob_idx += len(seq_ids) - return sampled_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor) -> None: - """Modify the probability distributions of the greedily-sampled tokens such - that each sampled token has a "probability" of 1.0. This is required by - speculative decoding, which depends on the sampling method being encoded - within the probability distribution for correctness. - - # Why do we only need to do this for greedy sampling? - - vLLM's sampler performs the following steps for greedy or multinomial - (random) sampling: - 1. Get logits from model. - 2. Modify logits according to per-sequence sampling parameters. - - Multiply by temperature, top-k and top-p masking, penalize tokens - according to their frequency, etc. - 3. Sample a token. - - Random sampling simply samples from the modified probability - distribution. - - Greedy sampling performs `argmax` to obtain the token with the - highest likelihood. 
- - Ignoring greedy sampling for a moment, we find that the computed probability - distribution has the following property: we can sample from it independently - and find that the token sampled by the Sampler has a frequency corresponding - to how often we see it in our sampling. In other words, for tokens sampled - with vLLM's random SamplingType, the computed probability distribution - encodes the sampling methodology completely. - - Greedy sampling does not normally have this property. vLLM modifies logits - according to sampling params, then performs `argmax`, then returns the - sampled token and the computed probability distribution. If we sample from - the distribution, we'll find the likelihood of the greedily-sampled token - is not always 1.0. - - Since lossless speculative decoding requires that the sampling methodology - be encoded within the probability distribution, we are motivated to modify - the probability distribution such that the sampled token has probability 1 - when speculative decoding is used. - - NOTE: Alternatively, we could use an extremely low temperature to achieve - greedy sampling using multinomial computation and unite the codepaths. This - has implications on the overall design of the sampler, e.g. how to record - accurate logprobs for the user, so this improvement is deferred to later. - """ - # NOTE: logprobs are not modified so they can be returned to the user. - probs[sample_indices, :] = 0 - probs[sample_indices, greedy_samples] = 1.0 - - -def _build_sampler_output( - maybe_deferred_sample_results: MaybeDeferredSampleResultType, - sampling_metadata: SamplingMetadata, - prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], - sample_logprobs: Optional[list[SampleLogprobs]], - on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, - torch.Tensor]], - skip_sampler_cpu_output: bool = False, -) -> SamplerOutput: - """Construct Python objects with the output of sampling. - - Args: - on_device_tensors: Tuple containing on-device tensors with the - probabilities used in sampling and the sampled token ids. This - allows post-processing without copies to CPU/serialization, e.g. in - speculative decoding rejection sampling. - """ - sampler_output: list[CompletionSequenceGroupOutput] = [] - - if skip_sampler_cpu_output: - assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) - deferred_sample_results_args = maybe_deferred_sample_results - else: - assert prompt_logprobs is not None - assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - assert len(sampling_metadata.seq_groups) \ - == len(maybe_deferred_sample_results) \ - == len(prompt_logprobs) \ - == len(sample_logprobs) - deferred_sample_results_args = None - - for (seq_group, sample_result, group_prompt_logprobs, - group_sample_logprobs) in zip(sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, sample_logprobs): - seq_ids = seq_group.seq_ids - next_token_ids, parent_ids = sample_result - seq_outputs: list[SequenceOutput] = [] - for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs): - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - logprobs)) - sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, - group_prompt_logprobs)) - - # If not specified, store None values in SamplerOutput. 
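# ---------------------------------------------------------------------------
# What _modify_greedy_probs_inplace above does to each selected row: it
# overwrites the distribution with a one-hot vector on the argmax token, so
# multinomial sampling from `probs` reproduces greedy sampling exactly.
# A worked example (illustrative only):
#
#   import torch
#   probs = torch.tensor([[0.6, 0.3, 0.1],
#                         [0.2, 0.5, 0.3]])
#   greedy = probs.argmax(dim=-1)                    # tensor([0, 1])
#   probs[:] = 0.0
#   probs[torch.arange(2), greedy] = 1.0             # rows are now one-hot
# ---------------------------------------------------------------------------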
- if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, - sampled_token_ids) = on_device_tensors - else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, - None) - - return SamplerOutput( - outputs=sampler_output, - sampled_token_probs=sampled_token_probs, - sampled_token_ids=sampled_token_ids, - logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args) - - -def _get_next_prompt_tokens( - seq_group: SequenceGroupToSample) -> tuple[int, ...]: - """Get a list of next prompt tokens to compute logprob from a - given sequence group. - - It is used to compute prompt logprob. Imagine you have logprob for each - query token. Query token needs to know the next prompt token id to compute - prompt logprob. This is a helper to obtain next prompt token ids. - - This API has to be used only when the caller knows seq_group is in prefill - stage. - - Returns: - A list of next prompt tokens to compute logprob. - """ - assert seq_group.is_prompt, ( - "Caller should ensure the sequence group is in a prefill stage.") - seq_ids = seq_group.seq_ids - query_len = seq_group.query_len - assert query_len is not None - # prompt has only 1 seq id. - assert len(seq_ids) == 1 - seq_data = seq_group.seq_data[seq_ids[0]] - computed_len = seq_data.get_num_computed_tokens() - prompt_tokens = seq_data.prompt_token_ids - # +1 because we are looking for a next prompt token. - next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, - len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[ - next_token_index_start:next_token_index_end] - return next_prompt_tokens diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 6ba8ad372c95a..b0a96fca2ff8a 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -2,18 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Optional import torch import torch.nn as nn from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from .utils import maybe_prefix @@ -105,8 +102,10 @@ class Medusa(nn.Module): return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: list[torch.Tensor], - sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + self, + hidden_states: list[torch.Tensor], + sampling_metadata, + ) -> list[torch.Tensor]: logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): @@ -130,57 +129,6 @@ class Medusa(nn.Module): return logits_lst - def sample( - self, - logits: list[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: - logits = torch.stack(logits, dim=0).float() - logprobs = torch.log_softmax(logits, dim=-1) - token_ids = logits.argmax(-1) # support only top-1 for now - probs = torch.softmax(logits, dim=-1) - - token_id_list = [] - token_prob_list = [] - token_logprob_list = [] - - for idx, seq_group in enumerate(sampling_metadata.seq_groups): - token_id_list.append(token_ids[:, seq_group.sample_indices]) - 
token_prob_list.append(probs[:, seq_group.sample_indices]) - token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - - outputs: list[Optional[SamplerOutput]] = [] - for idx in range(len(sampling_metadata.seq_groups)): - outputs.append( - SamplerOutput( - outputs=None, - sampled_token_probs=token_prob_list[idx].squeeze(1), - logprobs=token_logprob_list[idx].squeeze(1), - sampled_token_ids=token_id_list[idx].squeeze(1), - )) - - return outputs - - def generate_proposals( - self, - previous_hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[list[SamplerOutput]]: - # During preemption, we may receive an empty tensor (batch_size=0) - if previous_hidden_states.size(0) == 0: - # Return None to signal the Top1Proposer that no proposals - # were generated for this batch, allowing it to handle this - # special case appropriately - return None - - return self.sample( - logits=self.compute_logits( - hidden_states=self.forward(previous_hidden_states), - sampling_metadata=sampling_metadata, - ), - sampling_metadata=sampling_metadata, - ) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index c6a97388dc188..d057eb49a62d1 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -8,9 +8,7 @@ import torch import torch.nn as nn from vllm.config import VllmConfig -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -141,55 +139,57 @@ class MLPSpeculator(nn.Module): self.config = config self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0) - self.sampler = get_sampler() - def generate_proposals( - self, - input_ids: torch.Tensor, - previous_hidden_states: torch.Tensor, - num_predict_tokens: int, - sampling_metadata: SamplingMetadata, - ) -> list[SamplerOutput]: - if num_predict_tokens > self.max_speculative_tokens: - raise ValueError(f"Max speculative tokens for model is " - f"{self.max_speculative_tokens}, but " - f"{num_predict_tokens} were requested") + # NOTE(woosuk): This method is commented out because it is old code + # using V0. We should either port it to V1 or remove it. 
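# ---------------------------------------------------------------------------
# A minimal sketch of the per-head update the commented-out method below
# performed: embed the last sampled token, project the previous hidden state,
# take a weighted sum of the two, then LayerNorm plus activation to produce
# the next head's input. Shapes, the GELU activation, and the 0.5 weight are
# illustrative assumptions, not the model's actual configuration.
#
#   import torch
#   b, d, vocab = 2, 8, 16
#   emb = torch.nn.Embedding(vocab, d)
#   proj = torch.nn.Linear(d, d, bias=False)
#   ln = torch.nn.LayerNorm(d)
#   prev = torch.randn(b, 1, d)                  # b x 1 x d hidden states
#   last_tokens = torch.randint(vocab, (b, 1))   # b x 1 token ids
#   z = emb(last_tokens)                         # b x 1 x d
#   states = proj(prev) + 0.5 * z                # weighted add of state and emb
#   states = torch.nn.functional.gelu(ln(states))
# ---------------------------------------------------------------------------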
- # b x 1 x d - previous_hidden_states = previous_hidden_states.unsqueeze(1) + # def generate_proposals( + # self, + # input_ids: torch.Tensor, + # previous_hidden_states: torch.Tensor, + # num_predict_tokens: int, + # sampling_metadata: SamplingMetadata, + # ) -> list[SamplerOutput]: + # if num_predict_tokens > self.max_speculative_tokens: + # raise ValueError(f"Max speculative tokens for model is " + # f"{self.max_speculative_tokens}, but " + # f"{num_predict_tokens} were requested") - if self.scale_input: - previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2 + # # b x 1 x d + # previous_hidden_states = previous_hidden_states.unsqueeze(1) - # b x 1 - last_tokens = input_ids.unsqueeze(1) + # if self.scale_input: + # previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2 - next_tokens = [] + # # b x 1 + # last_tokens = input_ids.unsqueeze(1) - for head_index in range(num_predict_tokens): + # next_tokens = [] - # Project and predict - z = self.emb[head_index](last_tokens) # b k d - states = self.proj[head_index](previous_hidden_states) + # for head_index in range(num_predict_tokens): - # Weighted add of state_weight*state and emb_weight*z - # Let subsequent LN take care of denominator - # state_weight is close to 1, so shouldn't be any precision issues - states.add_(z, alpha=self.emb_weight / self.state_weight) + # # Project and predict + # z = self.emb[head_index](last_tokens) # b k d + # states = self.proj[head_index](previous_hidden_states) - states = self.activation(self.ln[head_index](states)) # b k d - previous_hidden_states = states - # TODO: not yet supporting top_k_tokens_per_head - states = states.flatten(0, 1) + # # Weighted add of state_weight*state and emb_weight*z + # # Let subsequent LN take care of denominator + # # state_weight is close to 1, so shouldn't be any precision issues + # states.add_(z, alpha=self.emb_weight / self.state_weight) - logits = self.logits_processor(self.head[head_index], states, - sampling_metadata) + # states = self.activation(self.ln[head_index](states)) # b k d + # previous_hidden_states = states + # # TODO: not yet supporting top_k_tokens_per_head + # states = states.flatten(0, 1) - output = self.sampler(logits, sampling_metadata) - last_tokens = output.sampled_token_ids - next_tokens.append(output) + # logits = self.logits_processor(self.head[head_index], states, + # sampling_metadata) - return next_tokens + # output = self.sampler(logits, sampling_metadata) + # last_tokens = output.sampled_token_ids + # next_tokens.append(output) + + # return next_tokens def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index c4548ee168bd7..aa7c434a44aeb 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -697,16 +697,12 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - # If the shape is the same, it means that we have already - # prune hidden states manually. 
- prune_hidden_states = hidden_states.size( - 0) != sampling_metadata.selected_token_indices.size(0) processed_logits = self.logits_processor( self.lm_head, hidden_states, sampling_metadata, self.embedding_bias, - prune_hidden_states=prune_hidden_states) + ) return processed_logits def load_weights( diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 2315f9dad5a5a..8c4548ff7f7dc 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,597 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from array import array -from dataclasses import dataclass -from typing import Optional - -import torch - -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, - SequenceGroupMetadata) -from vllm.utils import (PyObjectCache, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad) - -_SAMPLING_EPS = 1e-5 - - -@dataclass -class SequenceGroupToSample: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Sequence ids for the sequence group in a previous step. - seq_ids: list[int] - sampling_params: SamplingParams - # seq_id -> sequence data. - seq_data: dict[int, SequenceData] - # The length of the sequence (all tokens seen in the past + new token to - # compute attention) of the sequence group. None if it is in a decode - # stage. - seq_len: Optional[int] - # The length of new query tokens to compute in the current step. None if it - # is in a decode stage. The length of query_len <= seq_len if chunked - # prefill is enabled. - query_len: Optional[int] - # A random number generator for sampling. - generator: Optional[torch.Generator] - # True if the sequence group is in prefill stage. False if it is in a - # decode stage. - is_prompt: bool - # Query token indices from logits. to compute prompt logprob. Empty if - # prompt logprob is not required. - prompt_logprob_indices: list[int] - # Sample token indices from logits. Empty if sampling is not required. 
- sample_indices: list[int] - - @property - def do_sample(self): - return len(self.sample_indices) > 0 - - def __post_init__(self): - if len(self.prompt_logprob_indices) > 0: - assert self.sampling_params.prompt_logprobs is not None - if self.is_prompt: - assert self.seq_len is not None - assert self.query_len is not None - - -def gen_seq_group_to_sample_builder(num_seqs: int): - return lambda: SequenceGroupToSample( - seq_ids=[0] * num_seqs, - sampling_params=None, - seq_data=None, # type: ignore - seq_len=0, - query_len=0, - generator=None, - is_prompt=True, - prompt_logprob_indices=[], - sample_indices=[], - ) - - -class SamplingMetadataCache: - """Used to cache SamplingMetadata objects between scheduler iterations""" - - def __init__(self): - self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {} - - def get_cached_seq_group_to_sample(self, num_seqs): - if num_seqs not in self._seq_group_to_sample_cache: - self._seq_group_to_sample_cache[num_seqs] = PyObjectCache( - gen_seq_group_to_sample_builder(num_seqs)) - - obj = self._seq_group_to_sample_cache[num_seqs].get_object() - return obj - - def reset(self): - for cache in self._seq_group_to_sample_cache.values(): - cache.reset() - class SamplingMetadata: - """Metadata for input sequences. Used in sampler. - - The usage is as follows; - ``` - hidden_states = execute_model(...) - logits = hidden_states[sampling_metadata.selected_token_indices] - sample(logits) - - def sample(logits): - # Use categorized_sample_indices for sampling.... - ``` - - Args: - seq_groups: List of batched sequence groups. - selected_token_indices: (num_query_tokens_to_logprob). Indices to find - logits from the initial model output hidden states. - categorized_sample_indices: SamplingType -> token indices to sample. - Each token indices is 2D tensor of (num_indices, num_indices) where - the first item means the sample index within the returned logit - (before pruning padding), and the second item means the sample - index after pruning using selected_token_indices. - For example, if the returned logit is [1, 2, 3], and we select - [1, 2] for sampling, the pruned logit will be [2, 3]. In this case, - The first tuple is [1, 2] (sampled index within original logit), - and the second tuple is [0, 1] (sampled index within pruned logit). - num_prompts: Number of prompt sequence groups in seq_groups. - skip_sampler_cpu_output: Indicates if we want to skip the GPU=>CPU - serialization of token outputs. - reuse_sampling_tensors: Indicates if we want to reuse sampling - tensors that are part of the sampler forward pass. Currently, - it is mainly used for multi-step decode. 
- - """ - - def __init__( - self, - seq_groups: list[SequenceGroupToSample], - selected_token_indices: torch.Tensor, - categorized_sample_indices: dict[SamplingType, torch.Tensor], - num_prompts: int, - skip_sampler_cpu_output: bool = False, - reuse_sampling_tensors: bool = False, - ) -> None: - self.seq_groups = seq_groups - self.selected_token_indices = selected_token_indices - self.categorized_sample_indices = categorized_sample_indices - self.num_prompts = num_prompts - self.skip_sampler_cpu_output = skip_sampler_cpu_output - self.reuse_sampling_tensors = reuse_sampling_tensors - - @staticmethod - def prepare( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - pin_memory: bool, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, - ) -> "SamplingMetadata": - ( - seq_groups, - selected_token_indices, - categorized_sample_indices, - num_prompts, - ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens, - device, generators, cache) - selected_token_indices = async_tensor_h2d( - selected_token_indices, - dtype=torch.long, - target_device=device, - pin_memory=pin_memory, - ) - categorized_sample_indices = { - t: - async_tensor_h2d( - seq_ids, - dtype=torch.int, - target_device=device, - pin_memory=pin_memory, - ) - for t, seq_ids in categorized_sample_indices.items() - } - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - num_prompts=num_prompts, - ) - return sampling_metadata - - def __repr__(self) -> str: - return ( - "SamplingMetadata(" - f"seq_groups={self.seq_groups}, " - f"selected_token_indices={self.selected_token_indices}, " - f"categorized_sample_indices={self.categorized_sample_indices})") - - -def _prepare_seq_groups( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, -) -> tuple[ - list[SequenceGroupToSample], - list[int], - dict[SamplingType, list[int]], - int, -]: - """Prepare sequence groups and indices for sampling. - - Args: - seq_group_metadata_list: A list of sequence group to batch. - seq_lens: A list of sequence lens per sequence group. - Index of prompt len should match with seq_group_metadata_list. - query_lens: A list of query lengths. Prompt lens include the length - of entire prompt tokens, and it could be shorter. - device: A device to use for random number generators, - `SequenceGroupToSample.generator`. - generators: A store of per-request random number generators used - for seeded requests. - - Returns: - seq_groups: A list of sequence group to sample. - selected_token_indices: See the definition from `SamplingMetadata`. - categorized_sample_indices: See the definition from `SamplingMetadata`. - num_prompts: Total number of prompts from `seq_group_metadata_list`. - """ - # Batched sequence groups for the current model forward stsep. - seq_groups: list[SequenceGroupToSample] = [] - # A list of token indices to sample/compute logprob. It is used to - # prune the outcome logits from the model for the performance. - selected_token_indices: list[int] = [] - # Used for selected_token_indices. 
- model_output_idx = 0 - - # Sampling type -> ( - # indices to sample/prompt logprob within pruned output logits, - # indices to sample within pruned logits) - categorized_sample_indices: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - # Index of logits to compute logprob. Logits include both prompt logprob - # and sample logprob indices. - logit_idx = 0 - # Total number of prompts from given sequence groups. - num_prompts = 0 - - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = seq_group_metadata.seq_data.keys() - - if cache is not None: - sample_obj = cache.get_cached_seq_group_to_sample(len(seq_ids)) - - for j, seq_id in enumerate(seq_ids): - sample_obj.seq_ids[j] = seq_id - - sample_obj.prompt_logprob_indices.clear() - sample_obj.sample_indices.clear() - - sampling_params = seq_group_metadata.sampling_params - is_prompt = seq_group_metadata.is_prompt - generator: Optional[torch.Generator] = None - # If the current seq group is in decode stage, it is None. - seq_len: Optional[int] = None - query_len: Optional[int] = None - prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices - if cache is not None else []) - sample_indices: list[int] = (sample_obj.sample_indices - if cache is not None else []) - do_sample = seq_group_metadata.do_sample - - if seq_group_metadata.is_prompt: - if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) - if generators is not None: - generators[seq_group_metadata.request_id] = generator - - num_prompts += 1 - num_prefill_sample = len(seq_ids) - assert num_prefill_sample == 1 - assert query_lens is not None and seq_lens is not None - query_len, seq_len = query_lens[i], seq_lens[i] - # If we need sampling, exclude num_prefill_sample tokens from - # prompt logprob. - prompt_logprob_len = (query_len - num_prefill_sample - if do_sample else query_len) - sample_len = num_prefill_sample if do_sample else 0 - else: - # Decode - prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None and len( - query_lens) > 0 else 1 - sample_len = len(seq_ids) * query_len if do_sample else 0 - - if sampling_params.seed is not None and generators is not None: - generator = generators.get(seq_group_metadata.request_id) - - # Update indices to select from the model output. - """ - This blocks computes selected_token_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - """ - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + prompt_logprob_len)) - model_output_idx += prompt_logprob_len - if do_sample: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + sample_len)) - model_output_idx += sample_len - - # We now find indices for logprob computation and sampling. - """ - This block computes categorized_sample_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - def sample(logits): - # Use categorized_sample_indices for sampling. - # prompt_logprob_indices to find prompt logprob indices. - # sample_indices to find sample indices. 
- """ - - if sampling_params.prompt_logprobs is not None: - prompt_logprob_indices.extend( - range(logit_idx, logit_idx + prompt_logprob_len)) - logit_idx += prompt_logprob_len - if do_sample: - sample_indices.extend(range(logit_idx, logit_idx + sample_len)) - categorized_sample_indices[sampling_params.sampling_type].extend( - list(range(logit_idx, logit_idx + sample_len))) - logit_idx += sample_len - - if cache is not None: - sample_obj.sampling_params = sampling_params - sample_obj.seq_data = seq_group_metadata.seq_data - sample_obj.seq_len = seq_len - sample_obj.query_len = query_len - sample_obj.generator = generator - sample_obj.is_prompt = is_prompt - else: - sample_obj = SequenceGroupToSample( - seq_ids=list(seq_ids), - sampling_params=sampling_params, - seq_data=seq_group_metadata.seq_data, - seq_len=seq_len, - query_len=query_len, - generator=generator, - is_prompt=is_prompt, - prompt_logprob_indices=list(prompt_logprob_indices), - sample_indices=list(sample_indices), - ) - - seq_groups.append(sample_obj) - - if cache is not None: - cache.reset() - - return (seq_groups, selected_token_indices, categorized_sample_indices, - num_prompts) - - -@dataclass -class SamplingTensors: - """Tensors for sampling.""" - - temperatures: torch.Tensor - top_ps: torch.Tensor - top_ks: torch.Tensor - min_ps: torch.Tensor - presence_penalties: torch.Tensor - frequency_penalties: torch.Tensor - repetition_penalties: torch.Tensor - prompt_tokens: torch.Tensor - output_tokens: torch.Tensor - - @classmethod - def from_sampling_metadata( - cls, - sampling_metadata: "SamplingMetadata", - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: list[array] = [] - output_tokens: list[array] = [] - top_ks: list[int] = [] - temperatures: list[float] = [] - top_ps: list[float] = [] - min_ps: list[float] = [] - presence_penalties: list[float] = [] - frequency_penalties: list[float] = [] - repetition_penalties: list[float] = [] - do_penalties = False - do_top_p_top_k = False - do_min_p = False - - assert sampling_metadata.seq_groups is not None - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - temperature = sampling_params.temperature - p = sampling_params.presence_penalty - f = sampling_params.frequency_penalty - r = sampling_params.repetition_penalty - top_p = sampling_params.top_p - min_p = sampling_params.min_p - - # k should not be greater than the vocab size. - top_k = min(sampling_params.top_k, vocab_size) - top_k = vocab_size if top_k < 1 else top_k - if temperature < _SAMPLING_EPS: - # NOTE: Zero temperature means deterministic sampling - # (i.e., greedy sampling or beam search). - # Set the temperature to 1 to avoid division by zero. 
- temperature = 1.0 - if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS - or top_k != vocab_size): - do_top_p_top_k = True - if not do_min_p and min_p > _SAMPLING_EPS: - do_min_p = True - if not do_penalties and (abs(p) >= _SAMPLING_EPS - or abs(f) >= _SAMPLING_EPS - or abs(r - 1.0) >= _SAMPLING_EPS): - do_penalties = True - - is_prompt = seq_group.is_prompt - if is_prompt and sampling_params.prompt_logprobs is not None: - # For tokens in the prompt that we only need to get - # their logprobs - query_len = seq_group.query_len - assert query_len is not None - prefill_len = len(seq_group.prompt_logprob_indices) - temperatures += [temperature] * prefill_len - top_ps += [top_p] * prefill_len - top_ks += [top_k] * prefill_len - min_ps += [min_p] * prefill_len - presence_penalties += [0] * prefill_len - frequency_penalties += [0] * prefill_len - repetition_penalties += [1] * prefill_len - - if seq_group.do_sample: - sample_lens = len(seq_group.sample_indices) - assert sample_lens >= len(seq_ids) - temperatures += [temperature] * sample_lens - top_ps += [top_p] * sample_lens - top_ks += [top_k] * sample_lens - min_ps += [min_p] * sample_lens - presence_penalties += [p] * sample_lens - frequency_penalties += [f] * sample_lens - repetition_penalties += [r] * sample_lens - - if do_penalties: - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): - prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - output_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - if seq_group.do_sample: - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(seq_data.prompt_token_ids_array) - output_tokens.append(seq_data.output_token_ids_array) - - sampling_tensors = SamplingTensors.from_lists( - temperatures, - top_ps, - top_ks, - min_ps, - presence_penalties, - frequency_penalties, - repetition_penalties, - prompt_tokens, - output_tokens, - vocab_size, - device, - dtype, - ) - return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) - - @classmethod - def from_lists( - cls, - temperatures: list[float], - top_ps: list[float], - top_ks: list[int], - min_ps: list[float], - presence_penalties: list[float], - frequency_penalties: list[float], - repetition_penalties: list[float], - prompt_tokens: list[array], - output_tokens: list[array], - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> "SamplingTensors": - # Note that the performance will be very bad without - # pinned memory. 
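# ---------------------------------------------------------------------------
# Why pinned memory matters below: only page-locked host memory lets the
# non_blocking host-to-device copies at the end of from_lists run as true
# asynchronous DMA transfers; from pageable memory they silently degrade to
# blocking copies. A minimal sketch (illustrative only):
#
#   import torch
#   if torch.cuda.is_available():
#       cpu_t = torch.ones(1024, pin_memory=True)    # page-locked host tensor
#       gpu_t = cpu_t.to("cuda", non_blocking=True)  # async copy
#       torch.cuda.synchronize()                     # wait before using gpu_t
# ---------------------------------------------------------------------------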
- pin_memory = is_pin_memory_available() - - do_penalties = prompt_tokens or output_tokens - - if do_penalties: - prompt_t = make_tensor_with_pad( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - output_t = make_tensor_with_pad( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - else: - empty_tensor = torch.empty(0, device=device, dtype=torch.long) - prompt_t = empty_tensor - output_t = empty_tensor - - temperatures_t = torch.tensor( - temperatures, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ps_t = torch.tensor( - top_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - min_ps_t = torch.tensor( - min_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - presence_penalties_t = torch.tensor( - presence_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - frequency_penalties_t = torch.tensor( - frequency_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - repetition_penalties_t = torch.tensor( - repetition_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ks_t = torch.tensor( - top_ks, - device="cpu", - dtype=torch.int, - pin_memory=pin_memory, - ) - # Because the memory is pinned, we can do non-blocking - # transfer to device. - - return cls( - temperatures=temperatures_t.to(device=device, non_blocking=True), - top_ps=top_ps_t.to(device=device, non_blocking=True), - top_ks=top_ks_t.to(device=device, non_blocking=True), - min_ps=min_ps_t.to(device=device, non_blocking=True), - presence_penalties=presence_penalties_t.to(device=device, - non_blocking=True), - frequency_penalties=frequency_penalties_t.to(device=device, - non_blocking=True), - repetition_penalties=repetition_penalties_t.to(device=device, - non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True), - output_tokens=output_t.to(device=device, non_blocking=True), - ) + # Placeholder until it can be safely removed. 
+ pass diff --git a/vllm/sequence.py b/vllm/sequence.py index 24114c0bb792e..a6c194fbac0b2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,28 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sequence and its related classes.""" -import copy -import enum -from abc import ABC, abstractmethod -from array import array -from collections import defaultdict -from collections.abc import Mapping -from collections.abc import Sequence as GenericSequence -from dataclasses import dataclass, field -from functools import reduce -from typing import TYPE_CHECKING, Any, Callable, Optional, Union +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional, Union import msgspec import torch -from vllm.inputs import SingletonInputs -from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs -from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import RequestOutputKind, SamplingParams - if TYPE_CHECKING: - from vllm.lora.request import LoRARequest from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) else: @@ -34,50 +19,6 @@ VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 -def array_full(token_id: int, count: int): - """[`array`][] equivalent of [numpy.full][].""" - return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count - - -class SequenceStatus(enum.IntEnum): - """Status of a sequence.""" - WAITING = 0 - RUNNING = 1 - SWAPPED = 2 - # Note: anything after SWAPPED (2) will be considered - # as a finished status. - FINISHED_STOPPED = 3 - FINISHED_LENGTH_CAPPED = 4 - FINISHED_ABORTED = 5 - FINISHED_IGNORED = 6 - - @staticmethod - def is_finished(status: "SequenceStatus") -> bool: - return status > SequenceStatus.SWAPPED - - @staticmethod - def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: - if status == SequenceStatus.FINISHED_STOPPED: - finish_reason = "stop" - elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: - finish_reason = "length" - elif status == SequenceStatus.FINISHED_ABORTED: - finish_reason = "abort" - elif status == SequenceStatus.FINISHED_IGNORED: - # The ignored sequences are the sequences whose prompt lengths - # are longer than the model's length cap. Therefore, the stop - # reason should also be "length" as in OpenAI API. - finish_reason = "length" - else: - finish_reason = None - return finish_reason - - -class SequenceStage(enum.Enum): - PREFILL = enum.auto() - DECODE = enum.auto() - - @dataclass class RequestMetrics: """Metrics associated with a request. @@ -107,971 +48,12 @@ class RequestMetrics: model_execute_time: Optional[float] = None -class SequenceDataDelta( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta SequenceData to send to workers per step.""" - # A new token to be appended to existing SequenceData. - new_output_token_ids: list[int] - # Overwriting existing `cumulative_logprob` - new_cumulative_logprob: float - # Overwriting existing `num_computed_tokens`. - new_num_computed_tokens: int - # Overwriting existing `stage`. - new_stage: SequenceStage - - -class SequenceData(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Data associated with a sequence.""" - # NOTE: we cannot use Union[list, array] because msgspec cannot support - # union of 2 list types. 
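# A minimal sketch (stdlib only, hypothetical token ids) of the
# array-based storage the fields below rely on; the "l" typecode keeps
# token ids compact and makes appending a generated token cheap.
from array import array

ids = array("l", [101, 7592])    # prompt token ids
ids.append(1037)                 # one generated output token
assert list(ids) == [101, 7592, 1037]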
- _prompt_token_ids: array - _output_token_ids: array = msgspec.field( - default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) - - _prompt_embeds: Optional[torch.Tensor] = None - _output_embeds: Optional[torch.Tensor] = None - - ### The below fields should not be passed as an argument ### - _cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: tuple[int, - ...] = msgspec.field(default_factory=tuple) - # The number of tokens that are computed (that run against the model). - _num_computed_tokens: int = 0 - # The number of tokens with prefix cache hit. - _num_cached_tokens: int = 0 - _stage: SequenceStage = SequenceStage.PREFILL - _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) - _cached_all_token_embeds: Optional[torch.Tensor] = None - - # It is used to get delta input. It is reset when `get_delta_and_reset` - # is called. - _new_appended_tokens: list[int] = msgspec.field(default_factory=list) - - # It is used to compute mrope_position_ids. - _mrope_position_delta: Optional[int] = None - - @staticmethod - def from_prompt_token_counts( - *token_counts: tuple[int, int]) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - by concatenating prompt token sequences. - - Each tuple represents one token sequence, expressed in the form - `(token_id, count)`. - """ - if len(token_counts) == 0: - return SequenceData.from_seqs([]) - - prompt_token_ids_arr = reduce( - array.__iadd__, - (array_full(token_id, count) for token_id, count in token_counts), - ) - - return SequenceData(prompt_token_ids_arr) - - @staticmethod - def from_seqs( - prompt_token_ids: GenericSequence[int], - output_token_ids: Optional[GenericSequence[int]] = None, - *, - prompt_embeds: Optional[torch.Tensor] = None, - ) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - from prompt and output token sequences. - """ - prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - prompt_token_ids) - - if output_token_ids is None: - return SequenceData(prompt_token_ids_arr, - _prompt_embeds=prompt_embeds) - - output_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - output_token_ids) - - return SequenceData(prompt_token_ids_arr, - _output_token_ids=output_token_ids_arr, - _prompt_embeds=prompt_embeds) - - def __post_init__(self) -> None: - assert self._prompt_token_ids.typecode == "l" - assert self._output_token_ids.typecode == "l" - self._prompt_token_ids_tuple: tuple[int, ...] 
= tuple( - self._prompt_token_ids) - self._update_cached_all_tokens() - if self._prompt_embeds is not None: - self._update_cached_all_token_embeds() - - def _update_cached_all_tokens(self): - assert isinstance(self._prompt_token_ids, array) - assert isinstance(self._output_token_ids, array) - self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + - self._output_token_ids) - - def _update_cached_all_token_embeds(self): - assert isinstance(self._prompt_embeds, torch.Tensor) - self._cached_all_token_embeds: torch.Tensor = self._prompt_embeds - if self._output_embeds is not None: - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, self._output_embeds), dim=0) - - @property - def cumulative_logprob(self) -> float: - """The cumulative log probability of the output.""" - return self._cumulative_logprob - - @property - def prompt_token_ids(self) -> tuple[int, ...]: - """The token IDs of the prompt.""" - return self._prompt_token_ids_tuple - - @prompt_token_ids.setter - def prompt_token_ids(self, new_prompt_token_ids) -> None: - raise NotImplementedError - - @property - def prompt_token_ids_array(self) -> array: - """Return the prompt token ids in array type. - - Note that the array is in "l" type, and it may not be compatible - with torch.long (which is always 8 bytes). So beware of the usage. - """ - return self._prompt_token_ids - - @property - def output_token_ids(self) -> tuple[int, ...]: - """The token IDs of the output.""" - return tuple(self._output_token_ids) - - @output_token_ids.setter - def output_token_ids(self, - new_output_token_ids: GenericSequence[int]) -> None: - self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - new_output_token_ids) - self._update_cached_all_tokens() - - @property - def output_embeds(self) -> Optional[torch.Tensor]: - return self._output_embeds - - @output_embeds.setter - def output_embeds(self, new_output_token_embeds: torch.Tensor) -> None: - self._output_embeds = new_output_token_embeds - self._update_cached_all_token_embeds() - - @property - def output_token_ids_array(self) -> array: - """Return the output token ids in array type. - - Note that the array is in "l" type, and it may not be compatible - with torch.long (which is always 8 bytes). So beware of the usage.
- """ - assert isinstance(self._output_token_ids, array) - return self._output_token_ids - - @property - def prompt_embeds(self) -> Optional[torch.Tensor]: - return self._prompt_embeds - - @prompt_embeds.setter - def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None: - self._prompt_embeds = prompt_embeds - self._update_cached_all_token_embeds() - - @property - def mrope_position_delta(self) -> Optional[int]: - return self._mrope_position_delta - - @mrope_position_delta.setter - def mrope_position_delta(self, new_mrope_position_delta): - self._mrope_position_delta = new_mrope_position_delta - - def append_token_id(self, - token_id: int, - logprob: float, - token_embed: Optional[torch.Tensor] = None) -> None: - self._output_token_ids.append(token_id) - self._new_appended_tokens.append(token_id) - self._cached_all_token_ids.append(token_id) - self._cumulative_logprob += logprob - if token_embed is not None: - # Do not pass in with batch or sequence dimensions - assert token_embed.ndim == 1 - token_embed = token_embed.detach().cpu().unsqueeze(0) - if self._output_embeds is None: - self._output_embeds = token_embed - else: - self._output_embeds = torch.cat( - (self._output_embeds, token_embed), dim=0) - assert self._cached_all_token_embeds is not None - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, - token_embed.to(device=self._cached_all_token_embeds.device)), - dim=0) - - def get_len(self) -> int: - return len(self._output_token_ids) + len(self._prompt_token_ids) - - def get_prompt_len(self) -> int: - return len(self._prompt_token_ids) - - def get_output_len(self) -> int: - return len(self._output_token_ids) - - def get_token_ids(self) -> list[int]: - return self._cached_all_token_ids - - def get_token_embeddings(self) -> Optional[torch.Tensor]: - return self._cached_all_token_embeds - - def get_prefix_token_ids( - self, num_tokens: int - ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: - """Get prefix tokens, and make the return value hashable""" - prompt_length = self.get_prompt_len() - if num_tokens > prompt_length: - return (self._prompt_token_ids_tuple, - tuple(self._output_token_ids[:num_tokens - prompt_length])) - else: - return (self._prompt_token_ids_tuple[:num_tokens], None) - - def get_num_computed_tokens(self) -> int: - """Return the number of prefill tokens that are already computed.""" - return self._num_computed_tokens - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - self._num_computed_tokens += num_new_computed_tokens - assert self._num_computed_tokens <= self.get_len(), ( - self._num_computed_tokens, self.get_len()) - # If all tokens are computed, it means it is in decoding phase. - if self.get_num_uncomputed_tokens() == 0: - self._stage = SequenceStage.DECODE - - def get_num_cached_tokens(self) -> int: - """Return the number of tokens with prefix cache hit.""" - return self._num_cached_tokens - - def update_num_cached_tokens(self, num_cached_tokens: int): - """Update the number of tokens with prefix cache hit.""" - self._num_cached_tokens = num_cached_tokens - - def reset_state_for_recompute(self) -> None: - """Reset the number of computed tokens from this sequence. It is - supposed to be called when a sequence needs to be started from - the beginning again (e.g., sequence is preempted). 
- """ - self._num_computed_tokens = 0 - self._stage = SequenceStage.PREFILL - self._new_appended_tokens = [] - - def get_num_uncomputed_tokens(self) -> int: - """Return the number of prefill tokens that are not computed.""" - # we use `get_len()` which includes prompt_len + output_len instead - # of prompt_len here. This is because during recompute we need to - # prefill for both prompt and output. - return self.get_len() - self.get_num_computed_tokens() - - def get_last_token_id(self) -> int: - if not self._output_token_ids: - return self._prompt_token_ids[-1] - return self._output_token_ids[-1] - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.prompt_token_ids - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.output_token_ids - - def get_delta_and_reset(self) -> SequenceDataDelta: - delta = SequenceDataDelta(self._new_appended_tokens, - self._cumulative_logprob, - self.get_num_computed_tokens(), self.stage) - # Reset delta state. - self._new_appended_tokens = [] - return delta - - def apply_delta(self, delta: SequenceDataDelta): - self._num_computed_tokens = delta.new_num_computed_tokens - self._cumulative_logprob = delta.new_cumulative_logprob - self._stage = delta.new_stage - self._output_token_ids.extend(delta.new_output_token_ids) - self._cached_all_token_ids.extend(delta.new_output_token_ids) - - @property - def stage(self) -> SequenceStage: - return self._stage - - def __repr__(self) -> str: - return (f"SequenceData(" - f"prompt_token_ids={self._prompt_token_ids}, " - f"prompt_embeds.shape=" - f"{getattr(self._prompt_embeds, 'shape', None)}, " - f"output_token_ids={self.output_token_ids}, " - f"cumulative_logprob={self.cumulative_logprob}, " - f"get_num_computed_tokens={self.get_num_computed_tokens()})") - - -class Sequence: - """Stores the data, status, and block information of a sequence. - - The sequence is constructed from the - [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only) - or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] - (for encoder-decoder) instance passed in through the `inputs` - constructor argument. - - Args: - seq_id: The ID of the sequence. - inputs: The inputs of the sequence. - block_size: The block size of the sequence. Should be the same as the - block size used by the block manager and cache engine. - eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. - lora_request: LoRA request. 
- """ - - def __init__( - self, - seq_id: int, - inputs: SingletonInputs, - block_size: int, - eos_token_id: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.seq_id = seq_id - self.inputs = inputs - self.block_size = block_size - self.eos_token_id = eos_token_id - self.lora_request = lora_request - - self.data = SequenceData.from_seqs( - self.prompt_token_ids, - prompt_embeds=self.inputs["prompt_embeds"] - if self.inputs["type"] == "embeds" else None) - self.output_logprobs: SampleLogprobs = [] - self.output_text = "" - - self.status = SequenceStatus.WAITING - self.stop_reason: Union[int, str, None] = None - - # These are used to keep track of delta outputs - self._last_output_token_ids_offset: int = 0 - self._last_output_text_offset: int = 0 - - # Used for incremental detokenization - self.prefix_offset = 0 - self.read_offset = 0 - # Input + output tokens - self.tokens: Optional[list[str]] = None - - @property - def n_blocks(self) -> int: - return (self.get_len() + self.block_size - 1) // self.block_size - - @property - def prompt(self) -> Optional[str]: - if self.inputs["type"] == "embeds": - return None - return self.inputs.get("prompt") - - @property - def prompt_token_ids(self) -> list[int]: - if self.inputs["type"] == "embeds": - return [0] * len(self.inputs["prompt_embeds"]) - return self.inputs["prompt_token_ids"] - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_kwargs"].get_data() - - return MultiModalKwargs() - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_placeholders"] - - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def get_output_text_to_return(self, buffer_length: int, - delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. - truncate = buffer_length and not self.is_finished() - if not delta: - return self.output_text[:-buffer_length] if truncate else ( - self.output_text) - length = len(self.output_text) - if truncate: - length -= buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - def get_output_token_ids_to_return( - self, delta: bool) -> Union[GenericSequence[int], int]: - """If delta is True, only new tokens since the last call to - this method are returned""" - if not delta: - return self.get_output_token_ids() - - output_len = self.get_output_len() - - # Get the number of new tokens - num_new_tokens = output_len - self._last_output_token_ids_offset - self._last_output_token_ids_offset = output_len - - # Return new tokens - if num_new_tokens == 1: - # Optimization for single decode token case - # (which is what we have most of the time) - return self.data._cached_all_token_ids[-1] - - if num_new_tokens == 0: - return [] - - return self.data._cached_all_token_ids[-num_new_tokens:] - - def hash_of_block(self, logical_idx: int) -> int: - # TODO This can produce incorrect hash when block size > prompt size - - # Compute the number of tokens in the sequence - # TODO: The current hashing function is O(L^2). We should optimize - # this in the future. 
- num_tokens = self.num_hashed_tokens_of_block(logical_idx) - hashed_tokens = self.data.get_prefix_token_ids(num_tokens) - return hash((hashed_tokens, self.lora_int_id)) - - def extra_hash(self) -> Optional[int]: - """ - This function computes an extra hash for a sequence, specifically - designed for prefix caching mode. The final sequence hash is determined - by applying token_ids from the sequence's blocks. - """ - if self.lora_int_id == 0: - return None - - # NOTE: If there are additional factors influencing the block aside from - # token_ids, include them as input parameters to the hash. - return hash(self.lora_int_id) - - def num_hashed_tokens_of_block(self, logical_idx: int): - return logical_idx * self.block_size + self.block_size - - def reset_state_for_recompute(self): - """Reset the sequence states for recomputation.""" - self.data.reset_state_for_recompute() - - def append_token_id(self, - token_id: int, - logprobs: dict[int, Logprob], - token_embed: Optional[torch.Tensor] = None) -> None: - assert token_id in logprobs - self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id].logprob, - token_embed) - - def get_len(self) -> int: - return self.data.get_len() - - def get_prompt_len(self) -> int: - return self.data.get_prompt_len() - - def get_output_len(self) -> int: - return self.data.get_output_len() - - def get_token_ids(self) -> list[int]: - return self.data.get_token_ids() - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.data.get_prompt_token_ids() - - def get_last_token_id(self) -> int: - return self.data.get_last_token_id() - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.data.get_output_token_ids() - - def get_cumulative_logprob(self) -> float: - return self.data.cumulative_logprob - - def is_finished(self) -> bool: - return SequenceStatus.is_finished(self.status) - - def fork(self, new_seq_id: int) -> "Sequence": - new_seq = copy.deepcopy(self) - new_seq.seq_id = new_seq_id - return new_seq - - def get_num_new_tokens(self) -> int: - """Get the number of new tokens to be computed. - - Returns: - The new number of tokens to be computed. I.e., 1 for decode, or - the remaining prompt size for prefill. - """ - if self.data.stage == SequenceStage.DECODE: - return 1 - return self.data.get_num_uncomputed_tokens() - - def get_num_computed_tokens(self) -> int: - return self.data.get_num_computed_tokens() - - def is_prefill(self) -> bool: - return self.data.stage == SequenceStage.PREFILL - - def __repr__(self) -> str: - return (f"Sequence(seq_id={self.seq_id}, " - f"status={self.status.name}, " - f"num_blocks={self.n_blocks})") - - -class SequenceGroupState(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Mutable state tied to a specific sequence group""" - - # for multi-step decoding - num_steps: int = 1 - current_step: int = 0 - - @property - def remaining_steps(self) -> int: - return self.num_steps - self.current_step - - -class SequenceGroup: - """A group of sequences that are generated from the same prompt. - - Args: - request_id: The ID of the request. - seqs: The list of sequences. - sampling_params: The sampling parameters used to generate the outputs. - arrival_time: The arrival time of the request. - lora_request: LoRA request. - pooling_params: The parameters used to generate the pooler - for a pooling model. - pooled_data: The extracted hidden states from a pooling model. - encoder_seq: Optional, the single encoder sequence. 
Should be None - unless you are working with an encoder/decoder model. - trace_headers: OpenTelemetry trace headers. - priority: User-defined priority of the request. - draft_size: The number of speculative tokens plus one from the target - model; equal to max number of tokens a step can generate - for single-draft speculative decoding but larger than - that for multi-draft SD (currently not supported). - """ - - def __init__(self, - request_id: str, - seqs: list[Sequence], - arrival_time: float, - sampling_params: Optional[SamplingParams] = None, - lora_request: Optional[LoRARequest] = None, - pooling_params: Optional[PoolingParams] = None, - pooled_data: Optional[torch.Tensor] = None, - encoder_seq: Optional[Sequence] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - draft_size: int = 1) -> None: - self.request_id = request_id - self.seqs = seqs - self.first_seq = seqs[0] - self.arrival_time = arrival_time - self.is_single_seq = len(seqs) == 1 - self.seqs_dict = {seq.seq_id: seq for seq in seqs} - - self.sampling_params = sampling_params - self.metrics = RequestMetrics(arrival_time=arrival_time, - last_token_time=arrival_time, - first_scheduled_time=None, - first_token_time=None, - time_in_queue=None) - self.last_token_latency = 0.0 - self.lora_request = lora_request - self.prompt_logprobs: Optional[PromptLogprobs] = None - self.state = SequenceGroupState() - self.pooling_params = pooling_params - self.pooled_data = pooled_data - self.encoder_seq = encoder_seq - self.trace_headers = trace_headers - self.priority = priority - - self.cached_request_output = None - - @property - def prompt(self) -> Optional[str]: - return self.first_seq.prompt - - @property - def prompt_token_ids(self) -> list[int]: - return self.first_seq.prompt_token_ids - - @property - def encoder_prompt(self) -> Optional[str]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt is distinct - # from the decoder's. - return (self.encoder_seq.prompt - if self.encoder_seq is not None else None) - - @property - def encoder_prompt_token_ids(self) -> Optional[list[int]]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt token ids are - # distinct from the decoder's. - return (self.encoder_seq.prompt_token_ids - if self.encoder_seq is not None else None) - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_data - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_data - return MultiModalKwargs() - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_placeholders - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_placeholders - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def set_last_token_time(self, now: float) -> None: - """Sets the last token time for Request level timings.""" - # If still in prefill phase, assertion fails. 
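# A worked example with hypothetical timestamps: if
# metrics.last_token_time is 10.00s and now is 10.25s, the code below
# records a 0.25s inter-token latency (TPOT) and then advances
# last_token_time to 10.25s.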
- assert not self.is_prefill(), ( - "seq_group.set_last_token_time() should not be called " - "if the seq_group is in prefill phase.") - self.last_token_latency = now - self.metrics.last_token_time - self.metrics.last_token_time = now - - def get_last_token_latency(self) -> float: - """Returns the latency of the last token.""" - assert not self.is_prefill(), ( - "seq_group.get_last_token_latency() should not be called " - "if the seq_group is in prefill phase.") - return self.last_token_latency - - def maybe_set_first_token_time(self, time: float) -> None: - """Sets the first token time for Request level timings.""" - # Note: in a case where a sequence_group is swapped and - # recomputed, the time between iterations is counted - # in TPOT, rather than recalculating TTFT (since, from the - # POV of the user, there is simply a long generation delay). - if (self.metrics.first_token_time is None - and self.first_seq.get_output_len() == 1): - self.metrics.first_token_time = time - - def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request - level timings.""" - if self.metrics.first_scheduled_time is None: - self.metrics.first_scheduled_time = time - self.metrics.time_in_queue = time - self.metrics.arrival_time - - def set_finished_time(self, time: Optional[float]) -> None: - """Sets the finished time for Request level timings.""" - self.metrics.finished_time = time - - def get_max_num_running_seqs(self) -> int: - """The maximum number of sequences running in parallel in the remaining - lifetime of the request.""" - if self.is_single_seq: - return 0 if self.first_seq.is_finished() else 1 - return self.num_seqs() - self.num_finished_seqs() - - def get_seqs( - self, - status: Optional[SequenceStatus] = None, - ) -> list[Sequence]: - if status is None: - return self.seqs - - if self.is_single_seq: - return self.seqs if self.first_seq.status == status else [] - - return [seq for seq in self.seqs if seq.status == status] - - def is_encoder_decoder(self) -> bool: - return self.encoder_seq is not None - - def get_encoder_seq(self) -> Optional[Sequence]: - return self.encoder_seq - - def get_finished_seqs(self) -> list[Sequence]: - if self.is_single_seq: - return self.seqs if self.first_seq.is_finished() else [] - - return [seq for seq in self.seqs if seq.is_finished()] - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - for seq in self.seqs: - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) - - def get_num_uncomputed_tokens(self) -> int: - num_uncomputed_tokens = 0 - for seq in self.seqs: - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() - return num_uncomputed_tokens - - def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: - # Optimization. We don't need to call get_seqs if we don't need to - # filter by states.
- if status is None: - return len(self.seqs) - - if self.is_single_seq: - return 1 if self.seqs[0].status == status else 0 - - return len(self.get_seqs(status)) - - def num_finished_seqs(self) -> int: - if self.is_single_seq: - return 1 if self.seqs[0].is_finished() else 0 - return len(self.get_finished_seqs()) - - def is_finished(self) -> bool: - if self.is_single_seq: - return self.first_seq.is_finished() - return all(seq.is_finished() for seq in self.seqs) - - def is_prefill(self) -> bool: - return self.first_seq.is_prefill() - - def __repr__(self) -> str: - return (f"SequenceGroup(request_id={self.request_id}, " - f"sampling_params={self.sampling_params}, " - f"num_seqs={len(self.seqs)})") - - def uses_prompt_embeds(self) -> bool: - """Returns True if the sequence group uses input embeds.""" - return any(seq.data.prompt_embeds is not None for seq in self.seqs) - - -class SequenceGroupMetadataDelta( - msgspec.Struct, - tag=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta of SequenceGroupMetadata. - - After sending the first SequenceGroupMetadata, vLLM scheduler - only sends delta to reduce the data payload size. - """ - seq_data_delta: dict[int, SequenceDataDelta] - request_id: str - block_tables: dict[int, list[int]] - is_prompt: bool - do_sample: bool = True - token_chunk_size: Optional[int] = None - computed_block_nums: Optional[list[int]] = None - state: Optional[SequenceGroupState] = msgspec.field( - default_factory=lambda: SequenceGroupState()) - - -class SequenceGroupMetadata( - msgspec.Struct, - tag=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Metadata for a sequence group. Used to create `AttentionMetadata`. - - Attributes: - request_id: The ID of the request. - is_prompt: Whether the request is at prompt stage. - seq_data: The sequence data. (Seq id -> sequence data) - sampling_params: The sampling parameters used to generate the outputs. - block_tables: The block tables. (Seq id -> list of physical block - numbers) - do_sample: True if sampling is required. Sampling is not required when - e.g., prefill is chunked, and the current iteration only computes - query tokens for prefill, we don't need sampling. - pooling_params: Pooling parameters. - lora_request: LoRA request. - computed_block_nums: The block numbers that are already computed, - used in prefix caching. - state: Internal state tied to this sequence group. - token_type_ids: Token type IDs. - multi_modal_data: Multi modal data. - multi_modal_placeholders: Multi modal placeholders. - encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None - unless you are working with an encoder/decoder - model. - cross_block_table: Optional cross-attention block table associated - with the encoder prompt - (SequenceGroup.encoder_seq). Should be None - unless you are working with an encoder/decoder - model. 
- """ - - request_id: str - is_prompt: bool - seq_data: dict[int, SequenceData] - sampling_params: Optional[SamplingParams] - block_tables: dict[int, list[int]] - do_sample: bool = True - pooling_params: Optional[PoolingParams] = None - lora_request: Optional[LoRARequest] = None - computed_block_nums: Optional[list[int]] = None - state: Optional[SequenceGroupState] = msgspec.field( - default_factory=lambda: SequenceGroupState()) - multi_modal_data: Optional[MultiModalKwargs] = None - multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - encoder_seq_data: Optional[SequenceData] = None - cross_block_table: Optional[list[int]] = None - token_chunk_size: Optional[int] = None - - ### Stateful fields that are lazily defined. ### - # The number of speculative tokens adopted in this request. - # None means specuative decoding is not used. - # Zero means speculative decoding is disabled for some reasons. - # TODO: We should maintain this states out of the sequence group. - num_speculative_tokens: Optional[int] = None - - def __post_init__(self): - if self.seq_data is not None and self.token_chunk_size is None: - if self.is_prompt: - self.token_chunk_size = next(iter( - self.seq_data.values())).get_len() - else: - self.token_chunk_size = 1 - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - # Multi-Step Chunked-Prefill property - @property - def is_single_step_prompt(self) -> bool: - # do_sample is true, only when the token_chunk_size matches the - # num_uncomputed_tokens of the sequence. This indicates that - # the prompt will finish processing in a single `execute_model` - # step. - return self.is_prompt and self.do_sample - - def get_first_seq_id(self) -> int: - # This is an efficient way of fetching the seq_id when - # we know this SequenceGroup has only one sequence. - return next(iter(self.seq_data)) - - def apply_delta(self, - sequence_group_metadata_delta: SequenceGroupMetadataDelta): - for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): - self.seq_data[id].apply_delta(delta) - assert self.request_id == sequence_group_metadata_delta.request_id - self.block_tables = sequence_group_metadata_delta.block_tables - self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size - self.do_sample = sequence_group_metadata_delta.do_sample - self.is_prompt = sequence_group_metadata_delta.is_prompt - - def finish_step(self) -> None: - assert self.state is not None - assert self.state.current_step < self.state.num_steps, \ - f"current step {self.state.current_step}, num_steps {self.state.num_steps}" # noqa - self.state.current_step += 1 - - -class SequenceOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a sequence. - - Attributes: - parent_seq_id: The ID of the parent sequence (for forking in beam - search). - output_token: The output token ID. - logprobs: The logprobs of the output token. - (Token id -> logP(x_i+1 | x_0, ..., x_i)) - output_embed: Optional output embedding tensor. 
- """ - parent_seq_id: int - output_token: int - logprobs: dict[int, Logprob] - output_embed: Optional[torch.Tensor] = None - - def __repr__(self) -> str: - output_embed_shape = \ - self.output_embed.shape if self.output_embed is not None else None - return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " - f"output_token={self.output_token}, " - f"output_embed.shape={output_embed_shape}, " - f"logprobs={self.logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceOutput): - raise NotImplementedError() - equal = (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token) - log_probs_equal = other.logprobs == self.logprobs - return equal and log_probs_equal - - -class SequenceGroupOutput(ABC): - """The base class for model outputs associated with a sequence group.""" - - @abstractmethod - def __repr__(self) -> str: - pass - - @abstractmethod - def __eq__(self, other: object) -> bool: - pass - - -class CompletionSequenceGroupOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a completion sequence group.""" - __metaclass__ = SequenceGroupOutput - samples: list[SequenceOutput] - # Prompt logprob for each prompt query token. - prompt_logprobs: Optional[PromptLogprobs] - step_index: Optional[int] = 0 - - def __repr__(self) -> str: - return (f"CompletionSequenceGroupOutput(samples={self.samples}, " - f"prompt_logprobs={self.prompt_logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, CompletionSequenceGroupOutput): - raise NotImplementedError() - return (self.samples == other.samples - and self.prompt_logprobs == other.prompt_logprobs) - - class PoolingSequenceGroupOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True, # type: ignore[call-arg] ): """The model output associated with a pooling sequence group.""" - __metaclass__ = SequenceGroupOutput # Annotated as Any to be compatible with msgspec # The actual type is in SequenceGroup.pooled_data data: Any @@ -1161,305 +143,9 @@ class PoolerOutput( self.__class__) and self.outputs == other.outputs -def get_all_seq_ids( - seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. - """ - return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data] - - -def get_all_seq_ids_and_request_ids( - seq_group_metadata_list: list[SequenceGroupMetadata] -) -> tuple[list[int], dict[str, set[int]]]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. - """ - seq_ids: list[int] = [] - request_id_seq_ids_mapping: defaultdict[str, set[int]] = defaultdict(set) - for sg in seq_group_metadata_list: - for seq_id in sg.seq_data: - seq_ids.append(seq_id) - request_id_seq_ids_mapping[sg.request_id].add(seq_id) - return seq_ids, request_id_seq_ids_mapping - - -class HiddenStates(msgspec.Struct, array_like=True, - omit_defaults=True): # type: ignore[call-arg] - """Hidden states corresponding to in-progress sequences. - Used in speculative decoding to pass hidden states from - the target model to the proposer model. - - seq_ids are the sequence ids of each entry of the batch - dimension of the hidden_states tensor""" - # Scorer hidden states. For prefill step, it is used for hidden states of - # all tokens, whereas for decode step, it is used for last accepted tokens. 
- hidden_states: torch.Tensor - # The sequence group metadata list. Only needed for decode step. - seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None - # Scorer hidden states of the 2nd last token proposed by the proposer ( - # irrespective of whether it was accepted or not). Only used for cases when - # last proposed token is accepted (i.e., in case of bonus tokens). For the - # case of no bonus tokens, these are ignored. - second_last_token_hidden_states: Optional[torch.Tensor] = None - - _seq_ids: list[int] = msgspec.field(default_factory=list) - - def __post_init__(self): - if self.seq_group_metadata_list is not None: - assert len(self.seq_group_metadata_list) == len(self.hidden_states) - self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) - - @property - def seq_ids(self) -> list[int]: - return self._seq_ids - - def update(self, - hidden_states: torch.Tensor, - seq_group_metadata_list: list[SequenceGroupMetadata], - second_last_token_hidden_states: Optional[torch.Tensor] = None): - """Update hidden states from target model invocation. Only used for - decode steps""" - assert len(seq_group_metadata_list) == len(hidden_states) - self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) - self.hidden_states = torch.cat([self.hidden_states, hidden_states]) - - if self.second_last_token_hidden_states is not None: - # Adding dummy hidden_states to this to maintain same shape - self.second_last_token_hidden_states = torch.cat([ - self.second_last_token_hidden_states, - torch.zeros_like(hidden_states) - if second_last_token_hidden_states is None else - second_last_token_hidden_states - ]) - - def prune(self, - seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: - """Prune to provided list of sequence ids. Only used for decode steps. - """ - # Currently this prunes all seq_ids not present in - # seq_group_metadata_list which might cause problems where a sequence - # may be "paused" then "resumed" later. This should only prune sequences - # which are confirmed to be aborted. - seq_ids = get_all_seq_ids(seq_group_metadata_list) - # Only keep sequence IDs that exist in self._seq_ids - seq_ids = [seq_id for seq_id in seq_ids if seq_id in self._seq_ids] - if seq_ids != self._seq_ids: - # Batch contents changed - prune removed sequences. - index = [self._seq_ids.index(seq_id) for seq_id in seq_ids] - self.hidden_states = self.hidden_states[index] - if self.second_last_token_hidden_states is not None: - self.second_last_token_hidden_states = self\ - .second_last_token_hidden_states[index] - self._seq_ids = seq_ids - - def expand_with_bonus_tokens( - self, seq_with_bonus_token_in_last_step: set) -> None: - """Expand hidden states for sequences with bonus tokens. This is in - alignment with `MultiStepWorker._expand_execute_model_request`.""" - if self.second_last_token_hidden_states is None \ - or not seq_with_bonus_token_in_last_step: - return - - index = [] - for seq_id in self._seq_ids: - i = self._seq_ids.index(seq_id) - if seq_id in seq_with_bonus_token_in_last_step: - index.append(i + len(self._seq_ids)) - index.append(i) - - self.hidden_states = torch.cat( - [self.hidden_states, self.second_last_token_hidden_states])[index] - - class ExecuteModelRequest( msgspec.Struct, array_like=True, # type: ignore[call-arg] omit_defaults=True): # type: ignore[call-arg] - """The model execution request, containing CPU metadata only. The LLM - engine should create an instance of this class for each request batch.""" - # The sequence group metadata list. 
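# A minimal sketch (hypothetical block numbers) of the swap/copy fields
# declared just below; each entry is a (source, destination) block-number
# pair.
blocks_to_swap_in = [(3, 12)]     # CPU block 3 -> GPU block 12
blocks_to_swap_out = [(7, 0)]     # GPU block 7 -> CPU block 0
blocks_to_copy = [(5, 9)]         # GPU block 5 -> GPU block 9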
- seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to copy. Source to dest block. - blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list) - # Virtual engine ID for pipeline parallel. - virtual_engine: int = 0 - # The number of slots for lookahead decoding. - num_lookahead_slots: int = 0 - # The number of requests in the running queue. - running_queue_size: int = 0 - # Optional hidden states from prior step. - previous_hidden_states: Optional[HiddenStates] = None - # The number of forward steps to run. - num_steps: int = 1 - # Finished request ids since last step. - finished_requests_ids: list[str] = msgspec.field(default_factory=list) - # The last sampled token ids for multi step decoding. - last_sampled_token_ids: Optional[torch.Tensor] = None - # Async callback - async_callback: Optional[Callable] = None - - @property - def is_last_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.remaining_steps == 1 - - @property - def current_step(self) -> int: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - state = self.seq_group_metadata_list[0].state - assert state is not None - return state.current_step - - def clone( - self, seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - ) -> "ExecuteModelRequest": - """Clone the request with a new sequence group metadata list.""" - return ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=self.blocks_to_swap_in.copy(), - blocks_to_swap_out=self.blocks_to_swap_out.copy(), - blocks_to_copy=self.blocks_to_copy.copy(), - virtual_engine=self.virtual_engine, - num_lookahead_slots=self.num_lookahead_slots, - running_queue_size=self.running_queue_size, - previous_hidden_states=self.previous_hidden_states, - num_steps=self.num_steps, - finished_requests_ids=self.finished_requests_ids, - last_sampled_token_ids=self.last_sampled_token_ids.clone() - if self.last_sampled_token_ids is not None else None, - async_callback=self.async_callback) - - -@dataclass -class SequenceGroupBase: - group_id: str # the original request id before splitting - - assembled_seq_group: Optional[SequenceGroup] = None - - # seq id to a unique index inside this group - seq_id_to_index: dict[str, int] = field(default_factory=dict) - - # seq ids to be finished - to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict) - - # seq id to finished sequences - finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict) - - streaming: bool = False - - output_produced: bool = False - - @staticmethod - def add_request(request_id: str, engine, params, *args, **kwargs): - """When we are ready to add a request with request_id and params - into the engine, we can split the request into multiple requests. - """ - raise NotImplementedError - - def finish_seq(self, seq: SequenceGroup): - """The sequence `seq` finishes, we should record the information. 
- """ - del self.to_be_finished[seq.request_id] - self.finished_reqs[seq.request_id] = seq - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - """Assemble the sequence group, for producing the final - output, or adding request in the engine again. - """ - raise NotImplementedError - - -class ParallelSampleSequenceGroup(SequenceGroupBase): - - @staticmethod - def add_request(request_id: str, engine, params, **kwargs): - original_params = params - group = ParallelSampleSequenceGroup(request_id) - seqs = [] - for i in range(original_params.n): - request_id_i = f"{request_id}_parallel_sample_{i}" - group.seq_id_to_index[request_id_i] = i - params = original_params.clone() - params.n = 1 - if params.seed is not None: - params.seed += i - seq_group = engine._add_processed_request( - request_id_i, - params=params, - **kwargs, - ) # type: ignore - assert seq_group is not None - engine.seq_id_to_seq_group[request_id_i] = group - group.to_be_finished[request_id_i] = seq_group - seqs.append(seq_group.seqs[0]) - - # for parallel sampling, the `assembled_seq_group` is always - # available, since we have all the sequences ready, and they - # will not change. - group.assembled_seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - arrival_time=seq_group.arrival_time, - sampling_params=original_params, - lora_request=seq_group.lora_request, - pooling_params=seq_group.pooling_params, - pooled_data=seq_group.pooled_data, - encoder_seq=seq_group.encoder_seq, - trace_headers=seq_group.trace_headers, - priority=seq_group.priority, - ) - - group.streaming = params.output_kind == RequestOutputKind.DELTA - group.output_produced = False - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - - # in the streaming mode, we will return the assembled sequence - # for the first remaining sequence, and then return None for the - # rest of sequences - if self.streaming: - first_remaining_id = next(iter(self.to_be_finished)) - if seq_group.request_id == first_remaining_id: - return self.assembled_seq_group - return None - - # in the non-streaming mode, we will return the assembled sequence - # when the last sequences finishes, and then return None for the - # rest of the time - if (len(self.to_be_finished) == 1 - and seq_group.request_id in self.to_be_finished - and seq_group.is_finished()): - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. - n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None - return None + # Placeholder. Remove. 
+ pass diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py deleted file mode 100644 index e2d2846a28073..0000000000000 --- a/vllm/transformers_utils/detokenizer.py +++ /dev/null @@ -1,162 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -from vllm.logprobs import Logprob -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence, - SequenceGroup) - -from .detokenizer_utils import (convert_prompt_ids_to_tokens, - detokenize_incrementally) - from .tokenizer import AnyTokenizer - - -class Detokenizer: - """Provides methods to decode the output of a model into text.""" - - def __init__(self, tokenizer: AnyTokenizer): - self.tokenizer = tokenizer - - def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, - prompt_logprobs: list[Optional[dict[ - int, Logprob]]], - position_offset: int) -> None: - """Decodes the logprobs for the prompt of a sequence group. - - Args: - seq_group: The sequence group to decode. - prompt_logprobs: The logprobs to decode. - position_offset: Offset of the first index of the logprobs - relative to the start of the sequence (for chunked prefill). - - Returns: - None; the prompt logprobs are updated in place with the decoded tokens. - """ - prms = seq_group.sampling_params - assert prms is not None - - # We can pick any sequence for the prompt. - seq = seq_group.get_seqs()[0] - # Only prompt, without the generated token. - all_token_ids = seq.get_token_ids() - prompt_token_ids = all_token_ids[:-1] - prefix_offset = 0 - read_offset = 0 - next_iter_prefix_offset = 0 - next_iter_read_offset = 0 - next_iter_tokens: list[str] = [] - prev_tokens = None - - for token_position_in_logprob, prompt_logprobs_for_token in enumerate( - prompt_logprobs): - - # Absolute token position equals the index in the logprobs - # list plus the offset of the entire logprobs list relative - # to the start of the sequence. - token_position = token_position_in_logprob + position_offset - if not prompt_logprobs_for_token: - continue - for token_id, sample_logprob in prompt_logprobs_for_token.items(): - if (sample_logprob.decoded_token is None - and token_id != VLLM_INVALID_TOKEN_ID): - prompt_token_ids_with_token = ( - prompt_token_ids[:token_position] + [token_id]) - (new_tokens, new_text, new_prefix_offset, - new_read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=prompt_token_ids_with_token, - prev_tokens=prev_tokens, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) - - sample_logprob.decoded_token = new_text - - # Use the offsets & prev tokens corresponding to - # real tokens to ensure detokenization is consistent - # with the actual prompt. - if token_id == all_token_ids[token_position]: - next_iter_prefix_offset = new_prefix_offset - next_iter_read_offset = new_read_offset - next_iter_tokens = new_tokens - - # Advance to the next token position. - prefix_offset = next_iter_prefix_offset - read_offset = next_iter_read_offset - if prev_tokens is None: - prev_tokens = next_iter_tokens.copy() - else: - prev_tokens.extend(next_iter_tokens) - - def decode_sequence_inplace(self, seq: Sequence, - prms: SamplingParams) -> int: - """Decodes the new token for a sequence. In-place operation. - - Args: - seq: The sequence to decode. - prms: The sampling parameters used to generate the sequence.
- - Returns: - The number of characters added to the output text. - """ - all_input_ids = seq.get_token_ids() - token_id_generated_this_iteration = all_input_ids[-1] - - # Convert prompt token IDs to tokens if necessary. - # Do it here so that we don't have to repeat this - # computation for each logprob. - if seq.tokens is None: - (seq.tokens, seq.prefix_offset, - seq.read_offset) = convert_prompt_ids_to_tokens( - tokenizer=self.tokenizer, - prompt_ids=all_input_ids[:-1], - skip_special_tokens=prms.skip_special_tokens, - ) - - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=all_input_ids, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms.spaces_between_special_tokens, - ) - - # Decode logprobs - logprobs = seq.output_logprobs[-1] - if logprobs: - previous_tokens = all_input_ids[:-1] - for token_id, sample_logprob in logprobs.items(): - # If the token was generated this iteration, - # use the provided text. - if token_id == token_id_generated_this_iteration: - sample_logprob.decoded_token = new_decoded_token_text - continue - - if (sample_logprob.decoded_token is None - and token_id != VLLM_INVALID_TOKEN_ID): - all_input_ids_with_logprob = previous_tokens + [token_id] - (_, new_text, _, _) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) - sample_logprob.decoded_token = new_text - - seq.tokens.extend(new_tokens) - seq.prefix_offset = prefix_offset - seq.read_offset = read_offset - seq.output_text += new_decoded_token_text - - return len(new_decoded_token_text) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index eaab976bf7f75..20fabef4f19b9 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -11,12 +11,12 @@ import torch.nn as nn from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, run_method, update_environment_variables, warn_for_unimplemented_methods) +from vllm.v1.outputs import SamplerOutput logger = init_logger(__name__) From 0ff8ebb2d700b2e39457a661ef979b0da2ad73b3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sun, 21 Sep 2025 08:52:32 -0700 Subject: [PATCH 529/584] [V0 Deprecation] Remove async_output_proc, preemption mode, delay factor (#25334) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- tests/detokenizer/test_stop_strings.py | 46 ++---------------- .../test_processor_multi_modal_uuids.py | 10 ---- tests/v1/test_oracle.py | 18 ------- vllm/config/__init__.py | 4 -- vllm/config/model.py | 48 +++---------------- vllm/config/scheduler.py | 15 +----- vllm/engine/arg_utils.py | 34 ------------- vllm/entrypoints/llm.py | 4 -- vllm/executor/uniproc_executor.py | 4 -- vllm/platforms/cpu.py | 4 -- vllm/platforms/cuda.py | 10 ---- vllm/platforms/interface.py | 7 --- vllm/platforms/rocm.py | 10 ---- vllm/platforms/tpu.py | 4 -- vllm/platforms/xpu.py | 4 
-- 15 files changed, 12 insertions(+), 210 deletions(-) diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index cb87c44cc3999..46f7d58c438cb 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -32,10 +32,6 @@ def _test_stopping(llm: LLM, assert output.stop_reason == expected_reason -def _set_async_mode(llm, is_async): - llm.llm_engine.scheduler[0].use_async_output_proc = is_async - - def _stop_basic(llm): _test_stopping(llm, stop=["."], @@ -103,40 +99,8 @@ def test_stop_strings(): # async output processing below. llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) - if envs.VLLM_USE_V1: - _stop_basic(llm) - else: - _set_async_mode(llm, True) - _stop_basic(llm) - - _set_async_mode(llm, False) - _stop_basic(llm) - - if envs.VLLM_USE_V1: - _stop_multi_tokens(llm) - else: - _set_async_mode(llm, True) - _stop_multi_tokens(llm) - - _set_async_mode(llm, False) - _stop_multi_tokens(llm) - - if envs.VLLM_USE_V1: - _stop_partial_token(llm) - else: - _set_async_mode(llm, True) - _stop_partial_token(llm) - - _set_async_mode(llm, False) - _stop_partial_token(llm) - - if envs.VLLM_USE_V1: - # FIXME: this does not respect include_in_output=False - # _stop_token_id(llm) - pass - else: - _set_async_mode(llm, True) - _stop_token_id(llm) - - _set_async_mode(llm, False) - _stop_token_id(llm) + _stop_basic(llm) + _stop_multi_tokens(llm) + _stop_partial_token(llm) + # FIXME: this does not respect include_in_output=False + # _stop_token_id(llm) diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index bdd41eece2317..3a7bcb9571825 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -6,7 +6,6 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig -from vllm.platforms.interface import UnspecifiedPlatform from vllm.sampling_params import SamplingParams from vllm.v1.engine import processor as processor_mod from vllm.v1.engine.processor import Processor @@ -33,15 +32,6 @@ def _mk_processor(monkeypatch, "__post_init__", lambda self, *args: None, raising=True) - monkeypatch.setattr(UnspecifiedPlatform, - "is_async_output_supported", - classmethod(lambda cls, enforce_eager: True), - raising=True) - monkeypatch.setattr( - ModelConfig, - "verify_async_output_proc", - lambda self, parallel_config, speculative_config, device_config: None, - raising=True) monkeypatch.setattr(ModelConfig, "verify_with_parallel_config", lambda self, parallel_config: None, diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 28c24f62895ab..f6b8a18dd7c2c 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch): }, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - preemption_mode="swap", - ).create_engine_config() - - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - disable_async_output_proc=True, - ).create_engine_config() - - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - scheduler_delay_factor=1.2, - ).create_engine_config() - def test_enable_by_default_fallback(monkeypatch): with monkeypatch.context() as m: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ddd8de4324f6f..e31a78ba33baa 
100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -454,9 +454,6 @@ class VllmConfig: self.try_verify_and_update_config() if self.model_config is not None: - self.model_config.verify_async_output_proc(self.parallel_config, - self.speculative_config, - self.device_config) self.model_config.verify_with_parallel_config(self.parallel_config) self.model_config.verify_dual_chunk_attention_config( self.load_config) @@ -877,7 +874,6 @@ class VllmConfig: f"served_model_name={self.model_config.served_model_name}, " f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa - f"use_async_output_proc={self.model_config.use_async_output_proc}, " f"pooler_config={self.model_config.pooler_config!r}, " f"compilation_config={self.compilation_config!r}") diff --git a/vllm/config/model.py b/vllm/config/model.py index 921322bb475c5..b53029dc8c3ec 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -223,8 +223,6 @@ class ModelConfig: that this name(s) will also be used in `model_name` tag content of prometheus metrics, if multiple names provided, metrics tag will take the first one.""" - use_async_output_proc: bool = True - """Whether to use async output processor.""" config_format: Union[str, ConfigFormat] = "auto" """The format of the model config to load:\n - "auto" will try to load the config in hf format if available else it @@ -1119,37 +1117,6 @@ class ModelConfig: raise ValueError("please set VLLM_ATTENTION_BACKEND to " f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - def verify_async_output_proc(self, parallel_config, speculative_config, - device_config) -> None: - if not self.use_async_output_proc: - # Nothing to check - return - - if parallel_config.pipeline_parallel_size > 1: - self.use_async_output_proc = False - return - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - from vllm.platforms import current_platform - if not current_platform.is_async_output_supported(self.enforce_eager): - self.use_async_output_proc = False - return - - if envs.VLLM_USE_RAY_SPMD_WORKER: - self.use_async_output_proc = False - return - - # Async postprocessor is not necessary for pooling models - # since there is no token generation - if self.runner_type == "pooling": - self.use_async_output_proc = False - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if speculative_config: - self.use_async_output_proc = False - def verify_with_parallel_config( self, parallel_config: ParallelConfig, @@ -1173,15 +1140,12 @@ class ModelConfig: self._verify_with_expert_parallelism() pipeline_parallel_size = parallel_config.pipeline_parallel_size - if pipeline_parallel_size > 1: - if not self.registry.is_pp_supported_model(self.architectures, - self): - raise NotImplementedError( - "Pipeline parallelism is not supported for this model. " - "Supported models implement the `SupportsPP` interface.") - - if self.use_async_output_proc: - self.use_async_output_proc = False + if (pipeline_parallel_size > 1 + and not self.registry.is_pp_supported_model( + self.architectures, self)): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. 
" + "Supported models implement the `SupportsPP` interface.") def get_sliding_window(self) -> Optional[int]: """Get the sliding window size from the HF text config if present.""" diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index f0f67bab9d6ff..daf094d2df5c8 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -3,7 +3,7 @@ import hashlib from dataclasses import field -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Union from pydantic import SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -18,7 +18,6 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, logger = init_logger(__name__) RunnerType = Literal["generate", "pooling", "draft"] -PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] @@ -78,10 +77,6 @@ class SchedulerConfig: 3. more than one value (e.g. 1 2 128) is provided, then the capture list will follow the provided list.""" - delay_factor: float = 0.0 - """Apply a delay (of delay factor multiplied by previous - prompt latency) before scheduling next prompt.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -103,14 +98,6 @@ class SchedulerConfig: NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.""" - preemption_mode: Optional[PreemptionMode] = None - """Whether to perform preemption by swapping or - recomputation. If not specified, we determine the mode as follows: - We use recomputation by default since it incurs lower overhead than - swapping. However, when the sequence group has multiple sequences - (e.g., beam search), recomputation is not currently supported. In - such a case, we use swapping instead.""" - send_delta_data: bool = False """Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 242fcf501bfc4..fef4177b3a333 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -409,9 +409,7 @@ class EngineArgs: get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: Optional[Union[str, List[str]]] = LoadConfig.ignore_patterns - preemption_mode: Optional[str] = SchedulerConfig.preemption_mode - scheduler_delay_factor: float = SchedulerConfig.delay_factor enable_chunked_prefill: Optional[ bool] = SchedulerConfig.enable_chunked_prefill disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input @@ -439,7 +437,6 @@ class EngineArgs: ObservabilityConfig.otlp_traces_endpoint collect_detailed_traces: Optional[list[DetailedTraceModules]] = \ ObservabilityConfig.collect_detailed_traces - disable_async_output_proc: bool = not ModelConfig.use_async_output_proc scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls @@ -561,14 +558,6 @@ class EngineArgs: **model_kwargs["enable_prompt_embeds"]) model_group.add_argument("--served-model-name", **model_kwargs["served_model_name"]) - # This one is a special case because it is the - # opposite of ModelConfig.use_async_output_proc - model_group.add_argument( - "--disable-async-output-proc", - action="store_true", - default=EngineArgs.disable_async_output_proc, - help="Disable async output processing. 
This may result in " - "lower performance.") model_group.add_argument("--config-format", **model_kwargs["config_format"]) # This one is a special case because it can bool @@ -897,10 +886,6 @@ class EngineArgs: **scheduler_kwargs["long_prefill_token_threshold"]) scheduler_group.add_argument("--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]) - scheduler_group.add_argument("--scheduler-delay-factor", - **scheduler_kwargs["delay_factor"]) - scheduler_group.add_argument("--preemption-mode", - **scheduler_kwargs["preemption_mode"]) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. scheduler_group.add_argument("--scheduling-policy", @@ -1029,7 +1014,6 @@ class EngineArgs: interleave_mm_strings=self.interleave_mm_strings, media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, - use_async_output_proc=not self.disable_async_output_proc, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, @@ -1395,11 +1379,9 @@ class EngineArgs: max_model_len=model_config.max_model_len, cuda_graph_sizes=self.cuda_graph_sizes, num_lookahead_slots=num_lookahead_slots, - delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, - preemption_mode=self.preemption_mode, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, @@ -1492,22 +1474,6 @@ class EngineArgs: recommend_to_remove=False) return False - if self.preemption_mode != SchedulerConfig.preemption_mode: - _raise_or_fallback(feature_name="--preemption-mode", - recommend_to_remove=True) - return False - - if (self.disable_async_output_proc - != EngineArgs.disable_async_output_proc): - _raise_or_fallback(feature_name="--disable-async-output-proc", - recommend_to_remove=True) - return False - - if self.scheduler_delay_factor != SchedulerConfig.delay_factor: - _raise_or_fallback(feature_name="--scheduler-delay-factor", - recommend_to_remove=True) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0ab806fcb8b5c..092d3f276d1c5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -137,8 +137,6 @@ class LLM: back to the eager mode. disable_custom_all_reduce: See [ParallelConfig][vllm.config.ParallelConfig]. - disable_async_output_proc: Disable async output processing. - This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files . If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
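Both spellings of this switch disappear in this commit: the `--disable-async-output-proc` CLI flag dropped from `arg_utils.py` above, and the `disable_async_output_proc` keyword removed from the `LLM` docstring here and from its constructor in the next hunk. A minimal migration sketch for callers that still pass the keyword; the model name is only a placeholder, not part of this patch:

    from vllm import LLM

    # Before this commit (V0), async output processing could be opted out of:
    #   llm = LLM(model="facebook/opt-125m", disable_async_output_proc=True)
    # The keyword no longer exists, so passing it fails at construction time;
    # callers simply drop the argument and rely on the V1 behavior.
    llm = LLM(model="facebook/opt-125m")

The CLI spelling goes the same way: `--disable-async-output-proc` becomes an unrecognized argument rather than a deprecated no-op, matching the `_raise_or_fallback` check deleted from `arg_utils.py` above.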
@@ -188,7 +186,6 @@ class LLM: enforce_eager: bool = False, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = False, hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, @@ -286,7 +283,6 @@ class LLM: enforce_eager=enforce_eager, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, - disable_async_output_proc=disable_async_output_proc, hf_token=hf_token, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 3b566e88a9ec2..7a753d608a43a 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor): def _init_executor(self) -> None: """Initialize the worker and load the model. """ - assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ - ("ExecutorWithExternalLauncher needs deterministic " - "execution, so it" - "does not support delay_factor in scheduling") if envs.VLLM_USE_V1: assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \ ("To get deterministic execution in V1, " diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 544e091491bf5..cd41832bc2ea4 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -126,10 +126,6 @@ class CpuPlatform(Platform): """ torch.cpu.set_device(device) - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - @classmethod def inference_mode(cls): return torch.no_grad() diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 87d8f2b7481bb..c263e2afe83b0 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -96,16 +96,6 @@ class CudaPlatformBase(Platform): def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - if enforce_eager and not envs.VLLM_USE_V1: - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. Since, enforce-eager is enabled, async output " - "processor cannot be used") - return False - return True - @classmethod def is_fully_connected(cls, device_ids: list[int]) -> bool: raise NotImplementedError diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 53fc762dce540..c43580ac5da13 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -275,13 +275,6 @@ class Platform: """Get the total memory of a device in bytes.""" raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - """ - Check if the current platform supports async output. - """ - raise NotImplementedError - @classmethod def inference_mode(cls): """A device-specific wrapper of `torch.inference_mode`. diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 4f540fe965e22..dce2924ac7a9d 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -310,16 +310,6 @@ class RocmPlatform(Platform): device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - if enforce_eager and not envs.VLLM_USE_V1: - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. 
Since, enforce-eager is enabled, async output " - "processor cannot be used") - return False - return True - @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: from vllm.config.compilation import CUDAGraphMode diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4e4db116abca0..9852d948bc4b8 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -75,10 +75,6 @@ class TpuPlatform(Platform): def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - @classmethod def get_punica_wrapper(cls) -> str: return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU" diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 67ef058df10f1..4d3bef4b42947 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -98,10 +98,6 @@ class XPUPlatform(Platform): device_props = torch.xpu.get_device_properties(device_id) return device_props.total_memory - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return True - @classmethod def inference_mode(cls): return torch.no_grad() From c438b2951c5adafd4db831605696d097ab3e513a Mon Sep 17 00:00:00 2001 From: Rahul Tuli <rtuli@redhat.com> Date: Sun, 21 Sep 2025 22:34:45 +0530 Subject: [PATCH 530/584] feat: Enable engine-level arguments with speculators models (#25250) Signed-off-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Claude <noreply@anthropic.com> --- .../speculators/test_eagle3.py | 62 ++++++++++++------- vllm/config/model.py | 12 +--- vllm/engine/arg_utils.py | 35 ++++------- vllm/transformers_utils/config.py | 46 +++++++++++--- .../configs/speculators/base.py | 58 +++++++++++------ 5 files changed, 128 insertions(+), 85 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index 45ddb2178722a..368238b3a7200 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -3,38 +3,52 @@ import pytest import torch +from vllm.config import SpeculativeConfig from vllm.model_executor.models.interfaces import supports_eagle3 -@pytest.mark.parametrize( - "model_path", - [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) -def test_llama(vllm_runner, example_prompts, model_path, monkeypatch): +@pytest.mark.parametrize("model_path", [ + pytest.param( + "nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized", + id="llama3-eagle3-speculator"), + pytest.param( + "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized", + id="qwen3-eagle3-speculator"), +]) +def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path, + monkeypatch): + """ + Test Eagle3 speculators models properly initialize speculative decoding. + + This test verifies: + 1. Eagle3 support is detected for the model + 2. Speculative config is automatically initialized from embedded config + 3. The draft model path is correctly set to the speculators model + 4. Speculative tokens count is valid + 5. 
Text generation works with speculative decoding enabled + """ # Set environment variable for V1 engine serialization monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + # Verify Eagle3 support is detected eagle3_supported = vllm_model.apply_model(supports_eagle3) - assert eagle3_supported + assert eagle3_supported, f"Eagle3 should be supported for {model_path}" + + vllm_config = vllm_model.llm.llm_engine.vllm_config + + assert isinstance(vllm_config.speculative_config, SpeculativeConfig), \ + "Speculative config should be initialized for speculators model" + + spec_config = vllm_config.speculative_config + assert spec_config.num_speculative_tokens > 0, \ + (f"Expected positive speculative tokens, " + f"got {spec_config.num_speculative_tokens}") + + assert spec_config.model == model_path, \ + f"Draft model should be {model_path}, got {spec_config.model}" vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) - print(vllm_outputs) - assert vllm_outputs - - -@pytest.mark.parametrize( - "model_path", - [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) -def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch): - # Set environment variable for V1 engine serialization - monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - - with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: - eagle3_supported = vllm_model.apply_model(supports_eagle3) - assert eagle3_supported - - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens=20) - print(vllm_outputs) - assert vllm_outputs + assert vllm_outputs, \ + f"No outputs generated for speculators model {model_path}" diff --git a/vllm/config/model.py b/vllm/config/model.py index b53029dc8c3ec..95fe52883db0d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -27,8 +27,7 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, maybe_override_with_speculators_target_model, - try_get_generation_config, try_get_safetensors_metadata, + is_interleaved, try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.runai_utils import (ObjectStorageModel, is_runai_obj_uri) @@ -416,15 +415,6 @@ class ModelConfig: self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - if self.runner != "draft": - # If we're not running the draft model, check for speculators config - # If speculators config, set model / tokenizer to be target model - self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 - model=self.model, - tokenizer=self.tokenizer, - revision=self.revision, - trust_remote_code=self.trust_remote_code) - if (backend := envs.VLLM_ATTENTION_BACKEND ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: raise ValueError( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fef4177b3a333..7e00260caa39a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -41,7 +41,8 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import get_model_path, is_interleaved +from 
vllm.transformers_utils.config import (get_model_path, is_interleaved, + maybe_override_with_speculators) from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -1082,29 +1083,8 @@ class EngineArgs: provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. """ - - from vllm.transformers_utils.config import get_config - from vllm.transformers_utils.configs.speculators.base import ( - SpeculatorsConfig) - if self.speculative_config is None: - hf_config = get_config( - self.hf_config_path or target_model_config.model, - self.trust_remote_code, self.revision, self.code_revision, - self.config_format) - - # if loading a SpeculatorsConfig, load the speculative_config - # details from the config directly - # no user input required / expected - if isinstance(hf_config, SpeculatorsConfig): - # We create one since we don't create one - self.speculative_config = {} - self.speculative_config[ - "num_speculative_tokens"] = hf_config.num_lookahead_tokens - self.speculative_config["model"] = target_model_config.model - self.speculative_config["method"] = hf_config.method - else: - return None + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine @@ -1139,6 +1119,15 @@ class EngineArgs: device_config = DeviceConfig( device=cast(Device, current_platform.device_type)) + + (self.model, self.tokenizer, + self.speculative_config) = maybe_override_with_speculators( + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + vllm_speculative_config=self.speculative_config, + ) model_config = self.create_model_config() # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 52e2c18a77847..9eed466788661 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -463,15 +463,29 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config -def maybe_override_with_speculators_target_model( +def maybe_override_with_speculators( model: str, tokenizer: str, trust_remote_code: bool, revision: Optional[str] = None, + vllm_speculative_config: Optional[dict[str, Any]] = None, **kwargs, -) -> tuple[str, str]: +) -> tuple[str, str, Optional[dict[str, Any]]]: """ - If running a speculators config, override running model with target model + Resolve model configuration when speculators are detected. + + Checks if the provided model is a speculators model and if so, extracts + the target model configuration and builds the speculative config. 
+ + Args: + model: Model name or path + tokenizer: Tokenizer name or path + trust_remote_code: Whether to trust remote code + revision: Model revision + vllm_speculative_config: Existing vLLM speculative config + + Returns: + Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ is_gguf = check_gguf_file(model) if is_gguf: @@ -487,11 +501,27 @@ def maybe_override_with_speculators_target_model( token=_get_hf_token(), **kwargs, ) - spec_config = config_dict.get("speculators_config", None) - # Return the target model - if spec_config is not None: - model = tokenizer = spec_config["verifier"]["name_or_path"] - return model, tokenizer + speculators_config = config_dict.get("speculators_config") + + if speculators_config is None: + # No speculators config found, return original values + return model, tokenizer, vllm_speculative_config + + # Speculators format detected - process overrides + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) + + vllm_speculative_config = SpeculatorsConfig.extract_vllm_speculative_config( + config_dict=config_dict) + + # Set the draft model to the speculators model + vllm_speculative_config["model"] = model + + # Override model and tokenizer with the verifier model from config + verifier_model = speculators_config["verifier"]["name_or_path"] + model = tokenizer = verifier_model + + return model, tokenizer, vllm_speculative_config def get_config( diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index d7c16e180c709..53128b4eecb03 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -24,6 +24,12 @@ class SpeculatorsConfig(PretrainedConfig): config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + vllm_config = cls.extract_vllm_speculative_config(config_dict) + return cls(**vllm_config) + + @classmethod + def extract_vllm_speculative_config( + cls, config_dict: dict[str, Any]) -> dict[str, Any]: speculators_model_type = config_dict.get("speculators_model_type") if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: raise ValueError( @@ -34,11 +40,12 @@ class SpeculatorsConfig(PretrainedConfig): # TODO: @dsikka - use speculators pydantic model to validate cls.validate_speculators_config(config_dict=config_dict) # Convert from speculators config -> format that can be ingested by vLLM - vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) + vllm_config = cls.build_vllm_speculative_config( + config_dict=config_dict) # Apply anything specific to the supported algorithm algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type] algo_updater(config_dict=config_dict, vllm_config=vllm_config) - return cls(**vllm_config) + return vllm_config @classmethod def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: @@ -60,32 +67,45 @@ class SpeculatorsConfig(PretrainedConfig): "'transformer_layer_config' must be a dictionary if provided") @classmethod - def convert_speculators_to_vllm( + def build_vllm_speculative_config( cls, config_dict: dict[str, Any]) -> dict[str, Any]: """ - Convert speculators config format to vLLM format. - - This method handles the translation of field names and structure - between speculators and vLLM formats. 
- - Returns: - Dictionary with vLLM-compatible configuration - """ - # Currently we only support one proposal method - spec_config = config_dict["speculators_config"] - first_method = spec_config.get("proposal_methods")[0] - num_lookahead_tokens = first_method.get("speculative_tokens") + Build vLLM-compatible speculative configuration from speculators format. - if num_lookahead_tokens is None: + This method extracts and transforms speculative configuration from the + speculators format into the structure expected by vLLM. + + Args: + config_dict: Configuration dictionary in speculators format + + Returns: + Dictionary with vLLM-compatible speculative configuration + """ + # Extract speculators configuration + spec_config = config_dict["speculators_config"] + + # Currently we only support one proposal method + proposal_methods = spec_config.get("proposal_methods") + if not proposal_methods: + raise ValueError("No proposal methods found in speculators config") + + first_method = proposal_methods[0] + num_speculative_tokens = first_method.get("speculative_tokens") + + if num_speculative_tokens is None: raise ValueError( "Missing 'speculative_tokens' in proposal method. " f"Got: {first_method}") - # Build base vLLM config + # Build base vLLM speculative configuration vllm_config = { "method": config_dict.get("speculators_model_type"), - "num_lookahead_tokens": num_lookahead_tokens, + "num_speculative_tokens": num_speculative_tokens, "target_model": spec_config.get("verifier")["name_or_path"] } - vllm_config.update(config_dict["transformer_layer_config"]) + + # Merge transformer layer configuration if present + transformer_config = config_dict.get("transformer_layer_config", {}) + vllm_config.update(transformer_config) + return vllm_config From 1c3ffdbeccaf75f7b885b1700a4011fb4f63a7ab Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sun, 21 Sep 2025 10:37:11 -0700 Subject: [PATCH 531/584] [V0 Deprecation] Remove V0 sampling metadata (#25345) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> --- .../vllm_add_dummy_model/my_llava.py | 8 +++----- .../vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py | 8 +++----- vllm/model_executor/__init__.py | 2 -- vllm/model_executor/layers/logits_processor.py | 2 -- vllm/model_executor/models/apertus.py | 5 +---- vllm/model_executor/models/arcee.py | 7 +++---- vllm/model_executor/models/arctic.py | 5 +---- vllm/model_executor/models/aria.py | 7 ++----- vllm/model_executor/models/aya_vision.py | 5 +---- vllm/model_executor/models/baichuan.py | 5 +---- vllm/model_executor/models/bailing_moe.py | 5 +---- vllm/model_executor/models/bamba.py | 5 +---- vllm/model_executor/models/blip2.py | 5 +---- vllm/model_executor/models/bloom.py | 5 +---- vllm/model_executor/models/chameleon.py | 5 +---- vllm/model_executor/models/chatglm.py | 5 +---- vllm/model_executor/models/cohere2_vision.py | 5 +---- vllm/model_executor/models/commandr.py | 6 ++---- vllm/model_executor/models/dbrx.py | 5 +---- vllm/model_executor/models/deepseek.py | 5 +---- vllm/model_executor/models/deepseek_eagle.py | 5 +---- vllm/model_executor/models/deepseek_mtp.py | 9 ++------- vllm/model_executor/models/deepseek_v2.py | 5 +---- vllm/model_executor/models/deepseek_vl2.py | 5 +---- vllm/model_executor/models/dots1.py | 5 +---- vllm/model_executor/models/ernie45_moe.py | 5 +---- vllm/model_executor/models/ernie45_vl.py | 5 +---- vllm/model_executor/models/ernie45_vl_moe.py | 5 +---- vllm/model_executor/models/ernie_mtp.py | 8 ++------ 
vllm/model_executor/models/exaone.py | 5 +---- vllm/model_executor/models/exaone4.py | 5 +---- vllm/model_executor/models/falcon.py | 5 +---- vllm/model_executor/models/falcon_h1.py | 5 +---- vllm/model_executor/models/fuyu.py | 4 +--- vllm/model_executor/models/gemma.py | 5 +---- vllm/model_executor/models/gemma2.py | 5 +---- vllm/model_executor/models/gemma3.py | 5 +---- vllm/model_executor/models/gemma3_mm.py | 5 +---- vllm/model_executor/models/gemma3n.py | 5 +---- vllm/model_executor/models/gemma3n_mm.py | 5 +---- vllm/model_executor/models/glm4.py | 5 +---- vllm/model_executor/models/glm4_1v.py | 5 +---- vllm/model_executor/models/glm4_moe.py | 5 +---- vllm/model_executor/models/glm4_moe_mtp.py | 9 ++------- vllm/model_executor/models/gpt2.py | 5 +---- vllm/model_executor/models/gpt_bigcode.py | 5 +---- vllm/model_executor/models/gpt_j.py | 4 +--- vllm/model_executor/models/gpt_neox.py | 5 +---- vllm/model_executor/models/gpt_oss.py | 7 ++----- vllm/model_executor/models/granite.py | 9 +++------ vllm/model_executor/models/granite_speech.py | 7 +------ vllm/model_executor/models/granitemoe.py | 9 +++------ vllm/model_executor/models/granitemoehybrid.py | 5 +---- vllm/model_executor/models/granitemoeshared.py | 9 +++------ vllm/model_executor/models/grok1.py | 5 +---- vllm/model_executor/models/hunyuan_v1.py | 5 +---- vllm/model_executor/models/hyperclovax_vision.py | 5 +---- vllm/model_executor/models/idefics3.py | 7 ++----- vllm/model_executor/models/interfaces_base.py | 3 --- vllm/model_executor/models/internlm2.py | 5 +---- vllm/model_executor/models/interns1.py | 5 +---- vllm/model_executor/models/internvl.py | 5 +---- vllm/model_executor/models/jais.py | 5 +---- vllm/model_executor/models/jamba.py | 5 +---- vllm/model_executor/models/keye.py | 5 +---- vllm/model_executor/models/kimi_vl.py | 5 +---- vllm/model_executor/models/lfm2.py | 7 ++----- vllm/model_executor/models/llama.py | 5 +---- vllm/model_executor/models/llama_eagle3.py | 5 +---- vllm/model_executor/models/llava.py | 5 +---- vllm/model_executor/models/llava_next.py | 5 +---- vllm/model_executor/models/llava_next_video.py | 5 +---- vllm/model_executor/models/llava_onevision.py | 5 +---- vllm/model_executor/models/mamba.py | 7 ++----- vllm/model_executor/models/mamba2.py | 7 ++----- vllm/model_executor/models/medusa.py | 3 +-- vllm/model_executor/models/midashenglm.py | 4 +--- vllm/model_executor/models/mimo.py | 5 +---- vllm/model_executor/models/mimo_mtp.py | 8 ++------ vllm/model_executor/models/minicpm.py | 5 +---- vllm/model_executor/models/minicpm_eagle.py | 5 +---- vllm/model_executor/models/minicpmv.py | 4 +--- vllm/model_executor/models/minimax_text_01.py | 7 ++----- vllm/model_executor/models/minimax_vl_01.py | 5 +---- vllm/model_executor/models/mistral3.py | 5 +---- vllm/model_executor/models/mixtral.py | 5 +---- vllm/model_executor/models/mllama4.py | 5 +---- vllm/model_executor/models/molmo.py | 7 ++----- vllm/model_executor/models/mpt.py | 5 +---- vllm/model_executor/models/nano_nemotron_vl.py | 5 +---- vllm/model_executor/models/nemotron.py | 5 +---- vllm/model_executor/models/nemotron_h.py | 5 +---- vllm/model_executor/models/nemotron_nas.py | 5 +---- vllm/model_executor/models/nemotron_vl.py | 5 +---- vllm/model_executor/models/olmo.py | 5 +---- vllm/model_executor/models/olmo2.py | 5 +---- vllm/model_executor/models/olmoe.py | 7 ++----- vllm/model_executor/models/opt.py | 5 +---- vllm/model_executor/models/orion.py | 5 +---- vllm/model_executor/models/ovis.py | 4 +--- 
vllm/model_executor/models/ovis2_5.py | 4 +--- vllm/model_executor/models/paligemma.py | 5 +---- vllm/model_executor/models/persimmon.py | 5 +---- vllm/model_executor/models/phi.py | 4 +--- vllm/model_executor/models/phi3v.py | 5 +---- vllm/model_executor/models/phi4_multimodal.py | 5 +---- vllm/model_executor/models/phi4flash.py | 3 --- vllm/model_executor/models/phi4mm.py | 5 +---- vllm/model_executor/models/phimoe.py | 7 ++----- vllm/model_executor/models/pixtral.py | 5 +---- vllm/model_executor/models/plamo2.py | 5 +---- vllm/model_executor/models/qwen.py | 5 +---- vllm/model_executor/models/qwen2.py | 5 +---- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +---- vllm/model_executor/models/qwen2_5_vl.py | 5 +---- vllm/model_executor/models/qwen2_audio.py | 5 +---- vllm/model_executor/models/qwen2_moe.py | 5 +---- vllm/model_executor/models/qwen2_vl.py | 5 +---- vllm/model_executor/models/qwen3.py | 5 +---- vllm/model_executor/models/qwen3_moe.py | 5 +---- vllm/model_executor/models/qwen3_next.py | 5 +---- vllm/model_executor/models/qwen3_next_mtp.py | 5 +---- vllm/model_executor/models/qwen3_vl.py | 5 +---- vllm/model_executor/models/seed_oss.py | 5 +---- vllm/model_executor/models/skyworkr1v.py | 5 +---- vllm/model_executor/models/solar.py | 7 ++----- vllm/model_executor/models/stablelm.py | 5 +---- vllm/model_executor/models/starcoder2.py | 5 +---- vllm/model_executor/models/step3_text.py | 7 ++----- vllm/model_executor/models/step3_vl.py | 5 +---- vllm/model_executor/models/tarsier.py | 5 +---- vllm/model_executor/models/transformers.py | 5 +---- vllm/model_executor/models/ultravox.py | 7 ++----- vllm/model_executor/models/voxtral.py | 5 +---- vllm/model_executor/models/whisper.py | 7 ++----- vllm/model_executor/models/zamba2.py | 5 +---- vllm/model_executor/sampling_metadata.py | 7 ------- vllm/v1/spec_decode/eagle.py | 9 +++------ vllm/v1/spec_decode/medusa.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 9 ++++----- vllm/v1/worker/tpu_model_runner.py | 2 +- 141 files changed, 172 insertions(+), 583 deletions(-) delete mode 100644 vllm/model_executor/sampling_metadata.py diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index da97cf7e2b40b..b431ad1ed0928 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -9,7 +9,6 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder, LlavaForConditionalGeneration, LlavaMultiModalProcessor, LlavaProcessingInfo) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -18,11 +17,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY dummy_inputs=LlavaDummyInputsBuilder) class MyLlava(LlavaForConditionalGeneration): - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) + logits = super().compute_logits(hidden_states) if logits is not None: logits.zero_() logits[:, 0] += 1.0 diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index 8c34407e3e071..a6fafff98e9c5 100644 --- 
a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -6,16 +6,14 @@ from typing import Optional import torch from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata class MyOPTForCausalLM(OPTForCausalLM): - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) + logits = super().compute_logits(hidden_states) if logits is not None: logits.zero_() logits[:, 0] += 1.0 diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index a59aebfac4ff9..3c094cfdb553f 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -3,11 +3,9 @@ from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ - "SamplingMetadata", "set_random_seed", "BasevLLMParameter", "PackedvLLMParameter", diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 8226437cb1898..2110aa2769b93 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -10,7 +10,6 @@ from vllm.distributed import (tensor_model_parallel_all_gather, from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform @@ -50,7 +49,6 @@ class LogitsProcessor(CustomOp): self, lm_head: VocabParallelEmbedding, hidden_states: torch.Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: if self.logits_as_input: diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index f6400b05e110a..6dab4ed14345f 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -566,10 +565,8 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index be82c2fd59644..1ee378af76c9f 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -399,11 +399,10 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): 
inputs_embeds=inputs_embeds) return model_output - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata) -> Optional[torch.Tensor]: + def compute_logits(self, + hidden_states: torch.Tensor) -> Optional[torch.Tensor]: # Compute final logits from hidden states (last pipeline rank only) - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b6dd559968415..55d16fd75cebb 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -456,10 +455,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index a7cb6b35a4ab4..35c1adbdd00b6 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -644,10 +643,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 687c82ded9d0a..0f05f9b4efcd6 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,7 +16,6 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( get_optimal_tiled_canvas) from vllm.config import VllmConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, @@ -464,7 +463,5 @@ class 
AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index ae25033410407..db8d0a8710471 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant @@ -421,10 +420,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 5f6025abf315c..82cd4a26a1baa 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -623,10 +622,8 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 397089f31cdf6..584981ef3ebfd 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -571,10 +570,8 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git 
a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index a3131aa3812ef..b7455fba62c02 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -12,7 +12,6 @@ from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig, from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -704,10 +703,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4c37622b049c8..30816f72a2678 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP, SupportsQuant @@ -355,10 +354,8 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7a56236483749..79d648d749c6a 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -1046,10 +1045,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) # Disallow image tokens which does not include special # begin-image and end-image tokens diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 1fc2da3e4d7ca..879508400222f 100644 --- a/vllm/model_executor/models/chatglm.py +++ 
b/vllm/model_executor/models/chatglm.py @@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig @@ -437,10 +436,8 @@ class ChatGLMBaseModel(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 179cc2af8eb3f..6d67eb68d51a8 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, @@ -478,7 +477,5 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 7f87e31abdcd3..f3929ef3b5938 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, row_parallel_weight_loader) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -448,15 +447,14 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: is_not_lora = hasattr(self.model.embed_tokens, 'weight') if is_not_lora: logits = self.logits_processor(self.model.embed_tokens, - hidden_states, sampling_metadata) + hidden_states) else: logits = self.logits_processor(self.model.embed_tokens.base_layer, - hidden_states, sampling_metadata) + hidden_states) return logits diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 003cf4563a22f..f863b1da5505c 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, 
VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -462,10 +461,8 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 59c9921881497..ffc843fe033cc 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -488,10 +487,8 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 2770ddebc48ab..ed7e7614800fc 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -19,7 +19,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, DeepseekV3ForCausalLM) -from vllm.model_executor.sampling_metadata import SamplingMetadata from .utils import AutoWeightsLoader, maybe_prefix @@ -222,10 +221,8 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 8fbf16d206a86..92f311ab465b5 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -15,7 +15,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .deepseek_v2 import (DeepseekV2DecoderLayer, @@ -124,15 +123,13 @@ class DeepSeekMultiTokenPredictor(nn.Module): def compute_logits( 
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 8fbf16d206a86..92f311ab465b5 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -15,7 +15,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .deepseek_v2 import (DeepseekV2DecoderLayer,
@@ -124,15 +123,13 @@ class DeepSeekMultiTokenPredictor(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         current_step_idx = (spec_step_idx % self.num_mtp_layers)
         mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
         logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states),
-                                       sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states))
         return logits
@@ -161,11 +158,9 @@ class DeepSeekMTP(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> Optional[torch.Tensor]:
-        return self.model.compute_logits(hidden_states, sampling_metadata,
-                                         spec_step_idx)
+        return self.model.compute_logits(hidden_states, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 636554bd648f2..a99a6679a5696 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -56,7 +56,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils import cdiv, direct_register_custom_op
@@ -914,10 +913,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index d7ae8206baca5..c8ed759d2e972 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -15,7 +15,6 @@ from transformers import BatchFeature
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.models.transformers import replace_linear_class
@@ -647,10 +646,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 20555e48b73d4..2a09234b59ed1 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -52,7 +52,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -534,10 +533,8 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index ebab018ed67e7..d262e9e9da50e 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -591,10 +590,8 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 0d4aced93ca1c..74b358034ef3d 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -39,7 +39,6 @@ from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
-from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -1292,11 +1291,9 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         """compute logits"""
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def _vision_forward(
         self,
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 7f791852ceb91..f55016f7ccb36 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .ernie45_moe import Ernie4_5_MoeMLP
@@ -587,10 +586,8 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py
index c446265230313..288fbe736c32f 100644
--- a/vllm/model_executor/models/ernie_mtp.py
+++ b/vllm/model_executor/models/ernie_mtp.py
@@ -36,7 +36,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsPP
@@ -138,12 +137,10 @@ class ErnieMultiTokenPredictor(nn.Module):
         self,
         hidden_states: torch.Tensor,
         lm_head: ParallelLMHead,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]
-        logits = self.logits_processor(lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(lm_head, hidden_states)
         return logits
@@ -180,11 +177,10 @@ class ErnieMTP(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> Optional[torch.Tensor]:
         return self.model.compute_logits(hidden_states, self.lm_head,
-                                         sampling_metadata, spec_step_idx)
+                                         spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
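The multi-token-prediction (MTP) variants above keep their spec_step_idx argument; only sampling_metadata disappears from the signature. A runnable toy of the layer-selection logic those predictors share; all names here are stand-ins, and the real vLLM predictors route through shared heads and a LogitsProcessor:

import torch
import torch.nn as nn

class TinyMultiTokenPredictor(nn.Module):
    """Toy analogue of the MTP predictors: one head per speculative
    step, selected by spec_step_idx modulo the layer count."""

    def __init__(self, hidden: int = 8, vocab: int = 32,
                 num_mtp_layers: int = 2, mtp_start_layer_idx: int = 30):
        super().__init__()
        self.num_mtp_layers = num_mtp_layers
        self.mtp_start_layer_idx = mtp_start_layer_idx
        self.layers = nn.ModuleDict({
            str(mtp_start_layer_idx + i): nn.Linear(hidden, vocab)
            for i in range(num_mtp_layers)
        })

    def compute_logits(self, hidden_states: torch.Tensor,
                       spec_step_idx: int = 0) -> torch.Tensor:
        # spec_step_idx survives the signature change; only the
        # sampling_metadata parameter was dropped.
        current_step_idx = spec_step_idx % self.num_mtp_layers
        layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
        return layer(hidden_states)

predictor = TinyMultiTokenPredictor()
print(predictor.compute_logits(torch.randn(2, 8), spec_step_idx=3).shape)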
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index f503fb0f9364a..5dafcd595e4a5 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -534,10 +533,8 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 9f7d57d938140..c78eedff66700 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -517,10 +516,8 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 42c378e5c389a..0c50056d1c527 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -46,7 +46,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import RWConfig
@@ -496,10 +495,8 @@ class FalconForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 757051b3b1447..83efdd2e433fc 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -33,7 +33,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
@@ -675,10 +674,8 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 90af859ab92ec..53e9e6fe6e460 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -29,7 +29,6 @@ from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -389,10 +388,9 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         logits = self.language_model.logits_processor(
-            self.language_model.lm_head, hidden_states, sampling_metadata)
+            self.language_model.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 12eb27503870c..c19425b6cb6d6 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -412,10 +411,8 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 0bdb6c6bf7ae9..3f76e1e7d42a2 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -409,10 +408,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 7246308d59028..77c0ef8cb91d2 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
@@ -542,10 +541,8 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index bee9fbd2c084a..0630ee07c347e 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -14,7 +14,6 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -704,10 +703,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index ffec3408702c9..f4d288fd887e9 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -43,7 +43,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsQuant
@@ -814,10 +813,8 @@ class Gemma3nForCausalLM(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: Optional[SamplingMetadata],
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 8d3079aee0dfb..2acdba54a257d 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -685,10 +684,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
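The Gemma-family models above have no separate lm_head; the logits processor is handed the (tied) input embedding instead. A minimal sketch of that weight-tying idea, assuming a plain matmul against the embedding matrix in place of vLLM's parallel layers:

import torch
import torch.nn as nn

class TinyTiedLM(nn.Module):
    """Toy analogue of the Gemma-style models: no separate lm_head,
    logits are computed against the input embedding weight."""

    def __init__(self, vocab: int = 32, hidden: int = 8):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab, hidden)

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Same two-argument call shape as the patch: the "head" passed
        # to the logits processor is the tied embedding matrix.
        return hidden_states @ self.embed_tokens.weight.t()

lm = TinyTiedLM()
print(lm.compute_logits(torch.randn(2, 8)).shape)  # torch.Size([2, 32])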
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 5e2908a82c418..b9d5e24e9f6fa 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -40,7 +40,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -289,10 +288,8 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 308b0cb602bc9..56ec634386909 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -52,7 +52,6 @@ from vllm.distributed import (get_tensor_model_parallel_world_size,
                               parallel_state)
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
-from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -1654,10 +1653,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 1acbd18091fb3..947c6ce62f551 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -703,10 +702,8 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
index 322c5619c1783..c572978e62206 100644
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -38,7 +38,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name
@@ -155,15 +154,13 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         current_step_idx = (spec_step_idx % self.num_mtp_layers)
         mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
         logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       mtp_layer.shared_head(hidden_states),
-                                       sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states))
         return logits
@@ -192,11 +189,9 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> Optional[torch.Tensor]:
-        return self.model.compute_logits(hidden_states, sampling_metadata,
-                                         spec_step_idx)
+        return self.model.compute_logits(hidden_states, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 0f6521e44e6be..24274db148bd7 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from ..layers.pooler import DispatchPooler, Pooler
@@ -307,10 +306,8 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 745d0b7759991..162018450e7c0 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -329,10 +328,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 77df6ae6f30c8..698387fab946c 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsPP
@@ -329,10 +328,9 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata, self.lm_head.bias)
+                                       self.lm_head.bias)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
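GPT-J above illustrates that extra positional arguments to the logits processor (here the lm_head bias) simply shift left once sampling_metadata is gone. A hypothetical, dependency-free stand-in for that call shape; tiny_logits_processor is not vLLM's API, just a sketch:

import torch
import torch.nn as nn
from typing import Optional

def tiny_logits_processor(lm_head: nn.Linear, hidden_states: torch.Tensor,
                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Stand-in for the post-patch call shape: (head, hidden_states,
    optional extras) with no metadata argument in between."""
    logits = hidden_states @ lm_head.weight.t()
    return logits if bias is None else logits + bias

head = nn.Linear(8, 32, bias=True)
print(tiny_logits_processor(head, torch.randn(2, 8), head.bias).shape)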
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index e97db188e27eb..7570aefb6e96e 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -40,7 +40,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsPP
@@ -321,10 +320,8 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.embed_out, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.embed_out, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index b49fd0d8f88af..4fe59f91124dd 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -24,7 +24,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.utils import cdiv
@@ -670,10 +669,8 @@ class GptOssForCausalLM(nn.Module, SupportsPP):
         return self.model(input_ids, positions, intermediate_tensors,
                           inputs_embeds)
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 4f9cc2532bd8c..795b38e724eab 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -463,11 +462,9 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                   inputs_embeds)
         return model_output
 
-    def compute_logits(
-        self, hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 221023f1fb657..a5849184339b1 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -776,12 +775,8 @@ class GraniteSpeechForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(
-            hidden_states,
-            sampling_metadata,
-        )
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
         self,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index da16c72000c0e..07200fef4799d 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -511,10 +510,9 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                   inputs_embeds)
         return hidden_states
 
-    def compute_logits(
-        self, hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 79c6d8146ba9c..e89a1a4a0f7d3 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -32,7 +32,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
@@ -672,10 +671,8 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index 0b568a4b22685..a5d118f084e6c 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .granitemoe import GraniteMoeAttention, GraniteMoeModel, GraniteMoeMoE
@@ -311,10 +310,9 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                   inputs_embeds)
         return hidden_states
 
-    def compute_logits(
-        self, hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index a591134383371..996e41fe84ff3 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -528,10 +527,8 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 4110c8a1fd08d..8a23a6b45bc70 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
@@ -1004,10 +1003,8 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def make_empty_intermediate_tensors(
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 870addd0dcbca..54167f9f10995 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -31,7 +31,6 @@ from transformers.modeling_utils import no_init_weights
 from vllm.config import VllmConfig
 from vllm.inputs import InputProcessingContext
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -962,10 +961,8 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
         self,
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 9153a0e2c1e5a..18446d126b51a 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
@@ -738,10 +737,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 19a3ef1a3b800..8fdf70e35a2b8 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -13,11 +13,9 @@ from vllm.utils import supports_kw
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.pooler import Pooler
-    from vllm.model_executor.sampling_metadata import SamplingMetadata
 else:
     VllmConfig = Any
     Pooler = Any
-    SamplingMetadata = Any
 
 logger = init_logger(__name__)
@@ -100,7 +98,6 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
     def compute_logits(
         self,
         hidden_states: T,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[T]:
         """Return `None` if TP rank > 0."""
         ...
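The interfaces_base.py hunk above narrows the VllmModelForTextGeneration protocol itself. A sketch of how a structural check against such a slimmed protocol behaves, using a local stand-in rather than vLLM's actual generic Protocol:

import torch
import torch.nn as nn
from typing import Optional, Protocol, runtime_checkable

@runtime_checkable
class TextGenerationLike(Protocol):
    """Local stand-in for the slimmed-down protocol: compute_logits
    now takes hidden states only."""

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> Optional[torch.Tensor]:
        ...

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lm_head = nn.Linear(8, 32, bias=False)

    def compute_logits(
            self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        return self.lm_head(hidden_states)

# runtime_checkable protocols verify method presence structurally.
print(isinstance(TinyModel(), TextGenerationLike))  # True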
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index ce94328797ed6..221ff08b43843 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
@@ -358,10 +357,8 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.output, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.output, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index b59d1b88cf5ce..ba72c288b2b12 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -21,7 +21,6 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.interns1_vit import InternS1VisionModel
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems, NestedTensors)
@@ -812,10 +811,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 6a5c565b52e85..f4004e518e3ba 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -1399,10 +1398,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 4fee8c32fd581..0eb1578b43610 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import JAISConfig
@@ -332,10 +331,8 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 5b8fbc7226866..12a49029195ff 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -32,7 +32,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaMLP as JambaMLP
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
@@ -581,10 +580,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index afe33b4d4ad26..2e5e276cc1c7d 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -21,7 +21,6 @@ from vllm.attention.layer import check_upstream_fa_availability
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
@@ -1556,10 +1555,8 @@ class BaseKeyeModule(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 94a5933a61416..f554077935bf3 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -67,7 +67,6 @@ from vllm.model_executor.models.interfaces import (SupportsMultiModal,
                                                    SupportsPP)
 from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems, NestedTensors)
@@ -484,10 +483,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata,
                        **kwargs) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata, **kwargs)
+        logits = self.logits_processor(self.lm_head, hidden_states, **kwargs)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 927f78c4e4b45..dd97afbeb668a 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
@@ -542,10 +541,8 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                                   inputs_embeds)
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f8ea2111fed57..1b03cbef501b3 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
@@ -601,10 +600,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 7027138dfcb17..fb10af6c53c90 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import (LlamaDecoderLayer,
                                               LlamaForCausalLM)
-from vllm.v1.sample.metadata import SamplingMetadata
 from .utils import AutoWeightsLoader, maybe_prefix
@@ -244,10 +243,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         if self.draft_id_to_target_id is None:
             assert logits.shape[1] == self.config.vocab_size, \
                 "Expected logits to have shape " \
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 4f15e1b5762ea..e2d7b9f23b28a 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -760,10 +759,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        return self.language_model.compute_logits(hidden_states,
-                                                  sampling_metadata)
+        return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
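The multimodal wrappers in this patch (LLaVA and its relatives above and below) implement compute_logits as pure delegation to the wrapped language model. Sketched with stand-in classes rather than vLLM's real ones:

import torch
import torch.nn as nn
from typing import Optional

class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.lm_head = nn.Linear(8, 32, bias=False)

    def compute_logits(
            self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        return self.lm_head(hidden_states)

class TinyVLM(nn.Module):
    """Toy analogue of the multimodal wrappers: compute_logits is a
    pure delegation to the inner language model."""

    def __init__(self):
        super().__init__()
        self.language_model = TinyLM()

    def compute_logits(
            self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states)

print(TinyVLM().compute_logits(torch.randn(2, 8)).shape)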
def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index cf9852de633f3..610fb188d57d2 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -13,7 +13,6 @@ from transformers import (BatchFeature, LlavaNextVideoConfig, from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.models.clip import CLIPVisionModel -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -464,10 +463,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 46d54452a52d8..cee9ddaf94cc4 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -14,7 +14,6 @@ from transformers.models.llava_onevision.modeling_llava_onevision import ( from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -934,10 +933,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 9d1017dac8aa1..36141a5d50641 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -26,7 +26,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -299,10 +298,8 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, 
hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index b1a4138cb8f6c..9c3108146d2e5 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -30,7 +30,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType @@ -335,10 +334,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index b0a96fca2ff8a..0ae59dc8dfc23 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -104,12 +104,11 @@ class Medusa(nn.Module): def compute_logits( self, hidden_states: list[torch.Tensor], - sampling_metadata, ) -> list[torch.Tensor]: logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): - _logits = self.logits_processor(lm_head, hs, sampling_metadata) + _logits = self.logits_processor(lm_head, hs) if _logits is None: # _logits should only be None on rank > 0, in which case diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 140800dd41c76..82648ba668ca5 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -784,9 +783,8 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.decoder.compute_logits(hidden_states, sampling_metadata) + return self.decoder.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index ea5292d0df202..d256c1f3eed7b 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( 
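# --- annotation (not part of the patch) --------------------------------------
# Medusa, in the hunk just above, is the list-shaped variant of this series'
# migration: it zips a list of hidden states with its speculative lm_heads and
# invokes the logits processor once per head, now without sampling_metadata.
# A minimal runnable sketch follows; compute_logits_per_head and _Proc are
# hypothetical illustrations, not vLLM symbols.
import torch
import torch.nn as nn


class _Proc(nn.Module):

    def forward(self, lm_head: nn.Linear,
                hidden_states: torch.Tensor) -> torch.Tensor:
        return lm_head(hidden_states)


def compute_logits_per_head(
        logits_processor: nn.Module,
        hidden_states: list[torch.Tensor],
        lm_heads: nn.ModuleList) -> list[torch.Tensor]:
    # One logits tensor per speculative head; on pipeline-parallel ranks > 0
    # a real processor may return None, which callers must tolerate.
    return [
        logits_processor(head, hs)
        for hs, head in zip(hidden_states, lm_heads)
    ]


if __name__ == "__main__":
    heads = nn.ModuleList([nn.Linear(4, 9, bias=False) for _ in range(3)])
    states = [torch.randn(2, 4) for _ in range(3)]
    assert all(o.shape == (2, 9)
               for o in compute_logits_per_head(_Proc(), states, heads))
# ------------------------------------------------------------------------------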
default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix @@ -183,9 +182,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: hidden_states = self.model.norm(hidden_states) - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 09194e9f95d0e..b4abe458e4771 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2DecoderLayer -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import maybe_prefix @@ -140,12 +139,10 @@ class MiMoMultiTokenPredictor(nn.Module): self, hidden_states: torch.Tensor, lm_head: ParallelLMHead, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> torch.Tensor: self.mtp_layers[str(self.mtp_start_layer_idx + spec_step_idx)] - logits = self.logits_processor(lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(lm_head, hidden_states) return logits @@ -178,11 +175,10 @@ class MiMoMTP(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: return self.model.compute_logits(hidden_states, self.lm_head, - sampling_metadata, spec_step_idx) + spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 240c23ea2b25d..0986ea07406a9 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -583,10 +582,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 848a97b8bb2a0..2af0d546ce63d 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py 
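# --- annotation (not part of the patch) --------------------------------------
# Every file in this stretch of the series gets the same mechanical change:
# compute_logits() drops its SamplingMetadata parameter, and the logits
# processor (or the wrapped language model) is invoked with the hidden states
# alone. Speculative heads such as MiMoMTP above keep their extra
# spec_step_idx argument; only sampling_metadata disappears. Below is a
# self-contained sketch of the post-patch calling convention, assuming
# hypothetical DummyLogitsProcessor and DummyModel stand-ins rather than the
# real vLLM classes.
from typing import Optional

import torch
import torch.nn as nn


class DummyLogitsProcessor(nn.Module):
    """Stand-in for a post-patch logits processor: no metadata argument."""

    def forward(self, lm_head: nn.Linear,
                hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        return lm_head(hidden_states)


class DummyModel(nn.Module):

    def __init__(self, hidden_size: int = 8, vocab_size: int = 16) -> None:
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.logits_processor = DummyLogitsProcessor()

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> Optional[torch.Tensor]:
        # Post-patch signature: hidden states only, no sampling_metadata.
        return self.logits_processor(self.lm_head, hidden_states)


if __name__ == "__main__":
    assert DummyModel().compute_logits(torch.randn(2, 8)).shape == (2, 16)
# ------------------------------------------------------------------------------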
@@ -39,7 +39,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -376,10 +375,8 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 9b2d84e32151a..a17c4f004d75c 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -50,7 +50,6 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1194,9 +1193,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.llm.compute_logits(hidden_states, sampling_metadata) + return self.llm.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 6ce883be0a83c..1d2c7dea811e0 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid @@ -742,10 +741,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states.float(), - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states.float()) return logits diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index cc7db849a28bf..b2f020f3323e8 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.activation import get_act_fn from 
vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors @@ -420,10 +419,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index d15776a39362d..94e3d7234b6f4 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -606,10 +605,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 8b3474d809532..bebf0b5adac52 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP @@ -594,10 +593,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 2f0e8a2a5e575..131a66b713235 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model from vllm.model_executor.model_loader.weight_utils import 
default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -856,10 +855,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def separate_weights( self, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2475fe1316097..201bf83cac581 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -26,7 +26,6 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm @@ -1527,10 +1526,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 48ac91fa6dde0..64d669e8ac3e1 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -320,10 +319,8 @@ class MPTForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 4f8652c006941..ae50f1aefc6f7 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -37,7 +37,6 @@ from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, MultiModalKwargsItems, @@ -1192,10 +1191,8 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid, def compute_logits( self, hidden_states: 
torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): adapter_dict = dict(self.mlp1.named_parameters()) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 21f785e4b91af..6bb2f7392cb49 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronConfig @@ -498,10 +497,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 1e1f0524bd063..ff571541a60a5 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -54,7 +54,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager, from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronHConfig from vllm.utils import LayerBlockType @@ -622,10 +621,8 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f8e38dcd80b5a..d474c8db41b2f 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasNoOps, SupportsLoRA, SupportsPP @@ -468,10 +467,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = 
self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index acda2027401d9..3abbff8c717d4 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -26,7 +26,6 @@ from vllm.model_executor.models.internvl import ( BaseInternVLProcessingInfo, InternVLImageEmbeddingInputs, InternVLImageInputs, InternVLImagePixelInputs, InternVLProcessor) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import NestedTensors @@ -632,10 +631,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 7be3c16528b52..9fa8760073c15 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -391,10 +390,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 3e4c580a11211..2e0b1fb2a13f7 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -54,7 +54,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Olmo3Config @@ -427,10 +426,8 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 892e967e4a21f..77ece544d4900 100644 --- 
a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -471,10 +470,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 365aab205b211..4c3ce9f61efb3 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -399,10 +398,8 @@ class OPTForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 944a9151d75d3..586fea343d6f9 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -339,10 +338,8 @@ class OrionForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index f1bb18716b40d..052e143b27f6e 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -39,7 +39,6 @@ from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from 
vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -558,9 +557,8 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.llm.compute_logits(hidden_states, sampling_metadata) + logits = self.llm.compute_logits(hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 5e4758ef8ea5d..f18e38ce154d2 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -19,7 +19,6 @@ from vllm.model_executor.models.siglip2navit import Siglip2NavitModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -630,9 +629,8 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.llm.compute_logits(hidden_states, sampling_metadata) + logits = self.llm.compute_logits(hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d6eec77ebcee5..aef5102304614 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -9,7 +9,6 @@ from transformers import BatchFeature, PaliGemmaConfig from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems, @@ -403,10 +402,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3e854e4d561ff..23fb7bb85215c 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -334,10 +333,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = 
self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 6f39afbecf35b..9cf288e850057 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -346,10 +345,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata, self.lm_head.bias) + self.lm_head.bias) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4522c7043d01a..a2b201fe4228d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -29,7 +29,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -681,10 +680,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 25df9e9261d91..d2a3a8cc04969 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1451,10 +1450,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py 
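# --- annotation (not part of the patch) --------------------------------------
# The Phi hunks around this point differ slightly from the common pattern:
# PhiForCausalLM forwards self.lm_head.bias and Phi4FlashForCausalLM forwards
# self.embedding_bias to the logits processor, so removing sampling_metadata
# shifts that bias argument up one positional slot instead of leaving a plain
# two-argument call. A hedged sketch of that call shape follows; the
# BiasedLogitsProcessor name is illustrative, not a vLLM symbol.
from typing import Optional

import torch
import torch.nn as nn


class BiasedLogitsProcessor(nn.Module):

    def forward(self,
                lm_head: nn.Linear,
                hidden_states: torch.Tensor,
                bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        logits = lm_head(hidden_states)
        if bias is not None:
            # Mirrors post-patch call sites that pass a separate head or
            # embedding bias immediately after hidden_states.
            logits = logits + bias
        return logits


if __name__ == "__main__":
    head = nn.Linear(4, 10, bias=False)
    extra_bias = torch.zeros(10)
    out = BiasedLogitsProcessor()(head, torch.randn(3, 4), extra_bias)
    assert out.shape == (3, 10)
# ------------------------------------------------------------------------------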
index aa7c434a44aeb..ae153558e37aa 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -29,7 +29,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, SupportsV0Only) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .utils import make_layers, maybe_prefix @@ -695,12 +694,10 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: processed_logits = self.logits_processor( self.lm_head, hidden_states, - sampling_metadata, self.embedding_bias, ) return processed_logits diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index b3fc55dab6eca..47b5ad55ab2d0 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -18,7 +18,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1257,10 +1256,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 01d16f1f2c387..3ce67ce37a7ab 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -667,10 +666,8 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 142d3251bc67a..7b197844c8b63 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalUUIDDict, NestedTensors) @@ -480,10 +479,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 9f1ee36366fdd..33ee1cf44afd1 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -52,7 +52,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager, from vllm.model_executor.models.utils import ( is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -1022,10 +1021,8 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 7470948499005..e0c08a6a88271 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -282,10 +281,8 @@ class QWenBaseModel(nn.Module): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e13e87b93429d..c536b0f60c30d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import is_interleaved @@ -510,10 +509,8 @@ class 
Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a7e71309b6074..5f27230c913b4 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -50,7 +50,6 @@ from vllm.model_executor.models.qwen2_5_vl import ( from vllm.model_executor.models.qwen2_audio import ( Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths) from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -955,10 +954,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index dbf486374bcf3..73b27572a8ebd 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,7 +43,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm # yapf: disable @@ -1256,10 +1255,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index c797b71b5d2e1..762ab42e5929e 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -34,7 +34,6 @@ from transformers.models.qwen2_audio import (Qwen2AudioConfig, from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (AudioItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, @@ -481,10 +480,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + 
return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 6c6276a930453..6a9acaf2c3fe0 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -546,10 +545,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index dd4e7731e0b08..b3c42c2572566 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -46,7 +46,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -1527,10 +1526,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index dddb47048a1fc..ae72fd30c3993 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP @@ -328,10 +327,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 029309c49efd4..0661b3707ff44 
100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP @@ -690,10 +689,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ce917f92bd2e5..24cebc5bfdd82 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -53,7 +53,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -1208,10 +1207,8 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + return self.logits_processor(self.lm_head, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index c755eeb9b4eaa..c054339842e64 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer, Qwen3NextRMSNorm) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Qwen3NextConfig @@ -266,11 +265,9 @@ class Qwen3NextMTP(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, spec_step_idx: int = 0, ) -> Optional[torch.Tensor]: - return self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + return self.logits_processor(self.lm_head, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index ca232e03767b1..aa28c07ddcebe 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -45,7 +45,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from 
vllm.distributed import get_pp_group from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -1493,10 +1492,8 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index e3c7c700f8fa1..a217c820fedf0 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -472,10 +471,8 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 9857ccdcbe2d4..893ce4497c319 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -897,10 +896,8 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 94c862258b7ad..c774171b9dcd2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, 
SupportsPP @@ -495,10 +494,8 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds) return model_output - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 9e880ebd50813..e4dfe8d5a9a3b 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -332,10 +331,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 62ff9b6182755..7f379ab95a03d 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -339,10 +338,8 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 6a5b540fc8174..0cce0c78f8dc6 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -29,7 +29,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -405,10 +404,8 @@ class Step3TextForCausalLM(nn.Module, SupportsPP): inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = 
self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index c2940f8e44450..f667266b77bfa 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -1055,10 +1054,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index c66867315e553..67cf3ccf315d1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.llava import LlavaDummyInputsBuilder -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems @@ -638,10 +637,8 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 3bd4d10316ec6..475a68bc642b9 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalUUIDDict, @@ -798,10 +797,8 @@ class TransformersForCausalLM(TransformersBase): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + 
logits = self.logits_processor(self.lm_head, hidden_states) return logits diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f1f11c5fe8f00..12ae9487ad9dc 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -18,7 +18,6 @@ from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.model_loader import DefaultModelLoader from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors) @@ -616,10 +615,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): inputs_embeds=inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 16a97389cd21b..b33e8d09c4be1 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -30,7 +30,6 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys # yapf: disable from vllm.model_executor.models.whisper import WhisperEncoder # yapf: enable -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, MultiModalUUIDDict, @@ -454,10 +453,8 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) + return self.language_model.compute_logits(hidden_states) @classmethod def get_speech_to_text_config(cls, model_config: ModelConfig, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 41ae7b129782d..de3e4f0592a62 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) @@ -936,10 +935,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, return WhisperAudioInputs(input_features=input_features) - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.proj_out, hidden_states, - sampling_metadata) + 
def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.proj_out, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index e601bc3adb6e9..4350e38e02f96 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import HasInnerState, IsHybrid @@ -1036,7 +1035,6 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): def compute_logits( self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: """Compute logits for next token prediction. @@ -1047,8 +1045,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): Returns: Logits for next token prediction """ - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) + logits = self.logits_processor(self.lm_head, hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py deleted file mode 100644 index 8c4548ff7f7dc..0000000000000 --- a/vllm/model_executor/sampling_metadata.py +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - - -class SamplingMetadata: - # Placeholder until it can be safely removed. - pass diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 2a178ddf48777..5dacf60886966 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -239,7 +239,7 @@ class EagleProposer: else: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] - logits = self.model.compute_logits(sample_hidden_states, None) + logits = self.model.compute_logits(sample_hidden_states) # Early exit if there is only one draft token to be generated. if self.num_speculative_tokens == 1: @@ -367,8 +367,7 @@ class EagleProposer: else: last_hidden_states, hidden_states = ret_hidden_states hidden_states = hidden_states[:batch_size] - logits = self.model.compute_logits(last_hidden_states[:batch_size], - None) + logits = self.model.compute_logits(last_hidden_states[:batch_size]) draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -678,9 +677,7 @@ class EagleProposer: # Get the output logits for the draft tokens. logits = self.model.compute_logits( draft_last_hidden_states.reshape(batch_size * level_num_drafts, - -1), - None, - ) + -1)) # Sample a draft token for each child at the next tree level. 
num_children = self.child_drafts_per_level[level + 1] diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 3e90179e78d99..70b29c05c2a50 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -41,7 +41,7 @@ class MedusaProposer: ) -> list[list[int]]: # Generate blocks and compute logits blocks = self.model(target_hidden_states) - logits = self.model.compute_logits(blocks, None) + logits = self.model.compute_logits(blocks) # Get draft tokens and transpose the result # TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0946e8c5d7d8..b0cd0f4133079 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2240,7 +2240,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return output sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, None) + logits = self.model.compute_logits(sample_hidden_states) else: # Rare case. assert not self.is_pooling_model @@ -2258,8 +2258,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = None else: sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, - None) + logits = self.model.compute_logits(sample_hidden_states) model_output_broadcast_data = {} if logits is not None: @@ -2706,7 +2705,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_idx = self.input_batch.req_id_to_index[req_id] offset = self.query_start_loc.np[req_idx].item() prompt_hidden_states = hidden_states[offset:offset + num_logits] - logits = self.model.compute_logits(prompt_hidden_states, None) + logits = self.model.compute_logits(prompt_hidden_states) # Get the "target" tokens for each index. For prompt at index i, # the token at prompt index i+1 is the "sampled" token we want @@ -3105,7 +3104,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # To avoid breaking the sampler, we use a random tensor here instead. hidden_states = torch.rand_like(hidden_states) - logits = self.model.compute_logits(hidden_states, None) + logits = self.model.compute_logits(hidden_states) num_reqs = logits.size(0) dummy_tensors = lambda v: torch.full( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 48070c1e3e7cb..dd11b1dcbe94c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1692,7 +1692,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def compute_logits(self, sample_hidden_states: torch.Tensor) -> torch.Tensor: - return self.model.compute_logits(sample_hidden_states, None) + return self.model.compute_logits(sample_hidden_states) # TODO: Under SPMD mode, sample_from_logits has correctness issue. # Re-enable the torch.compile once the issue is fixed in torchxla. 
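
Editor's note on the patch above: every call site that previously passed a `SamplingMetadata` placeholder (usually `None`) into `compute_logits` now passes only the hidden states. A minimal sketch of the resulting contract follows; `ToyLM` and `ToyLogitsProcessor` are hypothetical stand-ins for illustration, not vLLM classes — only the signature shape mirrors the diff.

# Illustrative sketch of the simplified compute_logits contract after this
# patch. ToyLM and ToyLogitsProcessor are hypothetical stand-ins, not vLLM
# implementations; the point is the signature: hidden states in, logits out,
# with no SamplingMetadata argument threaded through.
from typing import Optional

import torch
import torch.nn as nn


class ToyLogitsProcessor(nn.Module):
    """Projects hidden states to vocab logits, mirroring the two-argument
    logits_processor(lm_head, hidden_states) call sites in the diff."""

    def forward(self, lm_head: nn.Linear,
                hidden_states: torch.Tensor) -> torch.Tensor:
        return lm_head(hidden_states)


class ToyLM(nn.Module):

    def __init__(self, hidden_size: int = 32, vocab_size: int = 128):
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.logits_processor = ToyLogitsProcessor()

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> Optional[torch.Tensor]:
        # After the patch: callers such as the spec-decode proposers simply
        # pass the sampled hidden states, nothing else.
        return self.logits_processor(self.lm_head, hidden_states)


model = ToyLM()
sample_hidden_states = torch.randn(4, 32)  # e.g. hidden_states[logits_indices]
logits = model.compute_logits(sample_hidden_states)
assert logits.shape == (4, 128)
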
From af7dfb0d1a95fb097a93de156ddd6c1eb5c72796 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Mon, 22 Sep 2025 04:12:45 +0800 Subject: [PATCH 532/584] [Perf] Further optimization for Qwen3-VL `fast_pos_embed_interpolate` (#25347) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/qwen3_vl.py | 46 +++++++++++++++++--------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index aa28c07ddcebe..98d65dea27393 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -405,25 +405,39 @@ class Qwen3_VisionTransformer(nn.Module): dh = h_idxs - h_floor dw = w_idxs - w_floor - w00 = ((1 - dh)[:, None] * (1 - dw)[None, :]).reshape(-1) - w01 = ((1 - dh)[:, None] * dw[None, :]).reshape(-1) - w10 = (dh[:, None] * (1 - dw)[None, :]).reshape(-1) - w11 = (dh[:, None] * dw[None, :]).reshape(-1) + # Create meshgrid view for all h, w vars + dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing='ij') + h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, + w_floor, + indexing='ij') + h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, + w_ceil, + indexing='ij') + h_floor_grid_idx = h_floor_grid * num_grid_per_side + h_ceil_grid_idx = h_ceil_grid * num_grid_per_side - idx00 = (h_floor[:, None] * num_grid_per_side + - w_floor[None, :]).reshape(-1) - idx01 = (h_floor[:, None] * num_grid_per_side + - w_ceil[None, :]).reshape(-1) - idx10 = (h_ceil[:, None] * num_grid_per_side + - w_floor[None, :]).reshape(-1) - idx11 = (h_ceil[:, None] * num_grid_per_side + - w_ceil[None, :]).reshape(-1) + # original computation of weights + # w00 = (1 - dh_grid) * (1 - dw_grid) + # w01 = (1 - dh_grid) * dw_grid + # w10 = dh_grid * (1 - dw_grid) + # w11 = dh_grid * dw_grid + # we reuse w11 here to avoid duplicate + # dh_grid * dw_grid computation + w11 = dh_grid * dw_grid + w10 = dh_grid - w11 + w01 = dw_grid - w11 + w00 = 1 - dh_grid - dw_grid + w11 - indices = torch.stack([idx00, idx01, idx10, idx11], dim=0) + idx00 = h_floor_grid_idx + w_floor_grid + idx01 = h_floor_grid_idx + w_ceil_grid + idx10 = h_ceil_grid_idx + w_floor_grid + idx11 = h_ceil_grid_idx + w_ceil_grid + + indices = torch.stack([idx00, idx01, idx10, idx11], + dim=0).reshape(4, -1) weights = torch.stack([w00, w01, w10, w11], - dim=0).to(dtype=self.dtype, - device=self.device) - weights = weights.unsqueeze(-1) + dim=0).reshape(4, -1, 1) + weights = weights.to(dtype=self.dtype, device=self.device) embeds = self.pos_embed(indices) weighted_embeds = embeds * weights From bc6e542d9fc400d7002853b3878c81b14fbdc998 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sun, 21 Sep 2025 16:03:28 -0700 Subject: [PATCH 533/584] Remove V0 attention backends (#25351) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- examples/offline_inference/qwen_1m.py | 1 - tests/compile/test_fusion_attn.py | 5 +- tests/kernels/attention/test_attention.py | 6 +- .../attention/test_attention_selector.py | 1 + .../kernels/attention/test_prefix_prefill.py | 6 +- .../attention/test_rocm_attention_selector.py | 1 + tests/kernels/utils.py | 66 +- tests/models/test_initialization.py | 5 +- .../backends/differential_flash_attn.py | 931 ---------- .../backends/dual_chunk_flash_attn.py | 1495 ----------------- vllm/attention/backends/flash_attn.py | 929 ---------- vllm/attention/backends/flashmla.py | 227 --- vllm/attention/backends/mla/__init__.py | 0 vllm/attention/backends/mla/common.py | 
1305 -------------- vllm/attention/backends/rocm_aiter_mla.py | 407 ----- vllm/attention/backends/rocm_flash_attn.py | 953 ----------- vllm/attention/backends/triton_mla.py | 111 -- vllm/attention/backends/utils.py | 14 +- vllm/attention/backends/xformers.py | 805 --------- vllm/config/model.py | 7 +- .../kv_transfer/kv_connector/utils.py | 2 +- vllm/engine/arg_utils.py | 15 +- vllm/envs.py | 1 - .../layers/mamba/mamba2_metadata.py | 19 +- vllm/model_executor/models/deepseek_v2.py | 3 +- vllm/platforms/cuda.py | 139 +- vllm/platforms/rocm.py | 63 +- vllm/utils/__init__.py | 2 - 28 files changed, 143 insertions(+), 7376 deletions(-) delete mode 100644 vllm/attention/backends/differential_flash_attn.py delete mode 100644 vllm/attention/backends/dual_chunk_flash_attn.py delete mode 100755 vllm/attention/backends/flash_attn.py delete mode 100644 vllm/attention/backends/flashmla.py delete mode 100644 vllm/attention/backends/mla/__init__.py delete mode 100644 vllm/attention/backends/mla/common.py delete mode 100644 vllm/attention/backends/rocm_aiter_mla.py delete mode 100644 vllm/attention/backends/rocm_flash_attn.py delete mode 100644 vllm/attention/backends/triton_mla.py delete mode 100644 vllm/attention/backends/xformers.py diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index d8d61667f688b..c8d0d91ce7b5c 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -5,7 +5,6 @@ from urllib.request import urlopen from vllm import LLM, SamplingParams -os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN" os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1" diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index b6bebbba915be..c3f1c7481d1b3 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -334,8 +334,9 @@ else: [7, 256, 533] if current_platform.is_cuda() else [8]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("model_name, model_class", MODELS) -@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if - current_platform.is_cuda() else [_Backend.ROCM_FLASH]) +@pytest.mark.parametrize("backend", + [_Backend.FLASHINFER] if current_platform.is_cuda() + else [_Backend.TRITON_ATTN_VLLM_V1]) @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 7083661575ef2..c7abf652f111b 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -18,7 +18,7 @@ if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask - from vllm.attention.backends.xformers import _make_alibi_bias + from tests.kernels.utils import make_alibi_bias FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -429,8 +429,8 @@ def test_multi_query_kv_attention( alibi_bias = None if use_alibi: alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) - attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, - seq_lens) + attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, + seq_lens) output = torch.empty_like(query) start = 0 # Dynamic sequence length not supported with custom attn_bias. 
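
For context on what the new test-local `make_alibi_bias` helper computes (its full definition lands in tests/kernels/utils.py later in this patch): ALiBi adds a per-head additive bias of slope * (j - i) to attention scores. Below is a dependency-free sketch of just that bias construction, under the stated simplifications that it omits the xFormers LowerTriangularMaskWithTensorBias wrapper and the pad-to-multiple-of-8 step the real helper applies.

# Minimal sketch of the ALiBi bias math used by the make_alibi_bias test
# helper added later in this patch. The real helper additionally pads the
# last dim to a multiple of 8 and wraps the result in xFormers'
# LowerTriangularMaskWithTensorBias; both are omitted here.
import torch


def alibi_bias_for_seq(alibi_slopes: torch.Tensor,
                       seq_len: int,
                       dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Return a (1, num_heads, seq_len, seq_len) additive bias where entry
    [h, i, j] is slopes[h] * (j - i)."""
    pos = torch.arange(seq_len, dtype=dtype)
    # bias[i, j] = j - i, so keys further in the past receive a more
    # negative bias once scaled by the positive per-head slope.
    bias = pos[None, :] - pos[:, None]
    bias = bias[None, :, :].expand(alibi_slopes.numel(), -1, -1).clone()
    bias.mul_(alibi_slopes[:, None, None])
    return bias.unsqueeze(0)


slopes = torch.tensor([1.0, 0.5])  # toy per-head slopes
bias = alibi_bias_for_seq(slopes, seq_len=4)
assert bias.shape == (1, 2, 4, 4)
assert bias[0, 1, 0, 3].item() == 0.5 * 3  # head 1, query 0, key 3
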
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index f8454ad0a4c4d..38ab40f88ae0b 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -67,6 +67,7 @@ def generate_params(): return params +@pytest.mark.skip(reason="Skipped for now. Should be revisited.") @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) def test_env( diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 8544eab3acccd..0695f84aea1af 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -11,7 +11,7 @@ import torch from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask -from vllm.attention.backends.xformers import _make_alibi_bias +from tests.kernels.utils import make_alibi_bias from vllm.attention.ops.chunked_prefill_paged_decode import ( chunked_prefill_paged_decode) from vllm.attention.ops.prefix_prefill import context_attention_fwd @@ -470,7 +470,7 @@ def test_contexted_kv_attention_alibi( key = key.unsqueeze(0) value = value.unsqueeze(0) - attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) + attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) output_ref = torch.empty_like(output) seq_start = 0 query_start = 0 @@ -479,7 +479,7 @@ def test_contexted_kv_attention_alibi( # FIXME(DefTruth): Because xformers does not support dynamic sequence # lengths with custom attention bias, we process each prompt one by # one. This is inefficient, especially when we have many short prompts. - # modified from: vllm/attention/backends/xformers.py#L343 + # modified from: vllm/v1/attention/backends/xformers.py#L343 for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): seq_end = seq_start + seq_len query_end = query_start + query_len diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index d56d3f4638f1c..af301d9de4350 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -16,6 +16,7 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() +@pytest.mark.skip(reason="Skipped for now. Should be revisited.") def test_selector(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c9bf85f6e2a5c..8d6ce381976b5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -513,10 +513,6 @@ def make_backend(backend_name: str) -> AttentionBackend: Construct the backend instance determined by the backend_name string argument. - "XFORMERS" -> construct xformers backend - - TODO: other backends - Note: at time of writing the Attention wrapper automatically selects its own backend for Attention.forward(); so the backend instance which you generate with this function is not meant to be used for *running* @@ -528,18 +524,68 @@ def make_backend(backend_name: str) -> AttentionBackend: * Backend instance ''' - if backend_name == STR_XFORMERS_ATTN_VAL: - # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. 
- from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() - elif backend_name == STR_FLASH_ATTN_VAL: - from vllm.attention.backends.flash_attn import FlashAttentionBackend + if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"): + from vllm.v1.attention.backends.xformers import ( + XFormersAttentionBackend) + return XFormersAttentionBackend() + if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"): + from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend return FlashAttentionBackend() + if backend_name == "TRITON_ATTN_VLLM_V1": + from vllm.v1.attention.backends.triton_attn import ( + TritonAttentionBackend) + return TritonAttentionBackend() + if backend_name == "FLEX_ATTENTION": + from vllm.v1.attention.backends.flex_attention import ( + FlexAttentionBackend) + return FlexAttentionBackend() + if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"): + from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend + return TorchSDPABackend() + if backend_name == "FLASHINFER": + from vllm.v1.attention.backends.flashinfer import FlashInferBackend + return FlashInferBackend() raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") +def make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_lens: list[int], +) -> list[Any]: + """Create ALiBi biases compatible with xFormers attention tests.""" + from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias + + if alibi_slopes is None: + return [None for _ in seq_lens] + + attn_biases: list[Any] = [] + num_heads = alibi_slopes.shape[0] + assert num_heads >= num_kv_heads, ( + "ALiBi slopes expect at least as many heads as KV heads") + + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + bias_tensor = torch.empty( + 1, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias_tensor.mul_(alibi_slopes[:, None, None]) + attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor)) + + return attn_biases + + def _make_metadata_tensors( seq_lens: Optional[list[int]], context_lens: Optional[list[int]], diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b9601114a3183..bfde6e20a3b17 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -78,9 +78,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, return if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"): - # Phi4FlashForCausalLM and MotifForCausalLM - # only supports DIFFERENTIAL_FLASH_ATTN backend - m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN") + pytest.skip( + "Differential Flash Attention backend has been removed.") if model_arch == "GptOssForCausalLM": # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU # has cc==8.9 which hasn't supported FA3 yet. 
Remove this hack when diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py deleted file mode 100644 index 87a4558e377d0..0000000000000 --- a/vllm/attention/backends/differential_flash_attn.py +++ /dev/null @@ -1,931 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""" An implementation of https://arxiv.org/pdf/2410.05258 """ -from collections import defaultdict -from dataclasses import dataclass -from itertools import accumulate -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from einops import rearrange - -from vllm import _custom_ops as ops -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.flash_attn import FlashAttentionBackend -# yapf: enable -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, - is_block_tables_empty) -from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, - get_flash_attn_version) -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) - -logger = init_logger(__name__) - - -class DifferentialFlashAttentionBackend(AttentionBackend): - accept_output_buffer = False - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - assert num_kv_heads % 2 == 0, "num_kv_heads must be divisible by 2" - return (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size) - - @staticmethod - def get_name() -> str: - return "DIFFERENTIAL_FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["DifferentialFlashAttentionImpl"]: - return DifferentialFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["DifferentialFlashAttentionMetadata"]: - return DifferentialFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["DifferentialFlashAttentionMetadataBuilder"]: - return DifferentialFlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -@dataclass -class 
DifferentialFlashAttentionMetadata(AttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - - use_cuda_graph: bool - - # Maximum query length in the batch. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional[ - "DifferentialFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional[ - "DifferentialFlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- encoder_seq_start_loc: Optional[torch.Tensor] = None - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - # Cross-layer shared attention block tables - cross_layer_shared_block_tables: Optional[torch.Tensor] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata( - self) -> Optional["DifferentialFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - cross_layer_shared_block_tables = ( - None if self.cross_layer_shared_block_tables is None else - self.cross_layer_shared_block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = DifferentialFlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata( - self) -> Optional["DifferentialFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - cross_layer_shared_block_tables = ( - None if self.cross_layer_shared_block_tables is None else - self.cross_layer_shared_block_tables[self.num_prefills:]) - self._cached_decode_metadata = DifferentialFlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_decode_metadata - - -class DifferentialFlashAttentionMetadataBuilder( - AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]): - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.cross_layer_shared_block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - # TODO: add support for chunked prefill and prefix caching. - assert not chunked_prefill_enabled, \ - "chunked prefill is not supported for now" - assert not prefix_cache_hit, "prefix caching is not supported for now" - - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. 
- block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - cross_layer_shared_block_table = [] - if prefix_cache_hit: - cross_layer_shared_block_table = block_tables[seq_id] - elif block_tables is not None: - if curr_sliding_window_block == 0: - cross_layer_shared_block_table = block_tables[seq_id] - else: - cross_layer_shared_block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.cross_layer_shared_block_tables.append( - cross_layer_shared_block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables(self, num_seqs: int, - block_tables: List[List[int]], - graph_block_tables) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - # max_batch_size, max_blocks = self.runner.graph_block_tables.shape - max_batch_size, max_blocks = graph_block_tables.shape - assert max_batch_size >= num_seqs - - # graph_block_tables = self.runner.graph_block_tables[:num_seqs] - graph_block_tables = graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
- """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - - self.cross_layer_shared_block_tables.extend([] * - cuda_graph_pad_size) - - num_decode_tokens = batch_size - self.num_prefill_tokens - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables, self.runner.graph_block_tables) - cross_layer_shared_block_tables = \ - self._get_graph_runner_block_tables( - num_seqs, self.cross_layer_shared_block_tables, - self.runner.cross_layer_shared_graph_block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - cross_layer_shared_block_tables = make_tensor_with_pad( - self.cross_layer_shared_block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return DifferentialFlashAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - cross_layer_shared_block_tables=cross_layer_shared_block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class DifferentialFlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - 
|<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - differential_flash_attention_config: Optional[Dict[str, Any]] = None, - ) -> None: - if differential_flash_attention_config is None: - differential_flash_attention_config = {} - self.differential_flash_attention_config = \ - differential_flash_attention_config - self.used_shared_kv_cache = kv_sharing_target_layer_name is not None - self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version( - requires_alibi=self.alibi_slopes is not None) - if is_quantized_kv_cache(self.kv_cache_dtype) and ( - not self.kv_cache_dtype.startswith("fp8") - or not flash_attn_supports_fp8()): - raise NotImplementedError( - f"FlashAttention does not support {self.kv_cache_dtype} " - "kv-cache on this device " - f"(FA supports fp8 = {flash_attn_supports_fp8()}).") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - self.attn_type = attn_type - - self.lambda_full = None - self.subln = self.differential_flash_attention_config["subln"] - - def split_heads(self, x): - # split by num_heads, the stripe pattern is friendly to tensor parallel. - x = rearrange(x, "... (H two) D -> ... H two D", two=2) - x1 = x[..., 0, :] - x2 = x[..., 1, :] - return x1.contiguous(), x2.contiguous() - - def split_kv_cache(self, x): - # split by num_heads, the stripe pattern is friendly to tensor parallel. 
- if x.numel() == 0: - return torch.empty(0), torch.empty(0) - - x1, x2 = x[0], x[1] - return x1, x2 - - def populate_kv_cache(self, layer: AttentionLayer, key: torch.Tensor, - value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata): - if kv_cache.numel() > 0 and key is not None and value is not None: - updated_slot_mapping = attn_metadata.slot_mapping - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - def forward_generate_kv_cache( - self, query: torch.Tensor, key: Optional[torch.Tensor], - value: Optional[torch.Tensor], k_cache: torch.Tensor, - v_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata) -> torch.Tensor: - - head_size = self.head_size - num_heads = self.num_heads // 2 - num_kv_heads = self.num_kv_heads // 2 - - query = query.view(-1, num_heads, head_size) - if key is not None: - assert value is not None - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - else: - assert value is None - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[ - 0] == num_prefill_tokens + num_decode_tokens, "key shape mismatch" - assert value.shape[ - 0] == num_prefill_tokens + num_decode_tokens, "value shape mismatch" - - output = torch.empty_like(query) - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - # QKV for prefill. - query = query[:num_prefill_tokens] - if key is not None and value is not None: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens, "query shape mismatch" - assert decode_query.shape[ - 0] == num_decode_tokens, "decode query shape mismatch" - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if k_cache.numel() == 0 \ - or prefill_meta.block_tables is None \ - or prefill_meta.block_tables.numel() == 0: - # normal attention - prefill_output = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ) - assert prefill_output.shape == output[: - num_prefill_tokens].shape - output[:num_prefill_tokens] = prefill_output - else: - raise Exception("prefix caching not supported") - - if decode_meta := attn_metadata.decode_metadata: - block_tables_arg = decode_meta.block_tables - try: - output[num_prefill_tokens:] = flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=k_cache, - v_cache=v_cache, - block_table=block_tables_arg, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ).squeeze(1) - except Exception as e: - logger.error("Error in PagedAttention.forward_decode: %s", - str(e)) - raise e - - # Reshape the output tensor. 
- return output.view(-1, num_heads, head_size) - - def forward_with_kv_cache_only( - self, - query: torch.Tensor, - k_cache: torch.Tensor, - v_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata, - ): - if not attn_metadata.decode_metadata: - block_tables_arg = attn_metadata.cross_layer_shared_block_tables - else: - block_tables_arg = attn_metadata.block_tables - - output = flash_attn_with_kvcache( - q=query.unsqueeze(1), - k_cache=k_cache, - v_cache=v_cache, - block_table=block_tables_arg, - cache_seqlens=attn_metadata.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - ).squeeze(1) - return output - - def forward( - self, - layer: AttentionLayer, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: DifferentialFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - layer: Attention layer instance. - q: Query tensor with shape = [num_tokens, num_heads, head_size] - k: Key tensor with shape = [num_tokens, num_kv_heads, head_size] - v: Value tensor with shape = [num_tokens, num_kv_heads, head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size, num_kv_heads, head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Output tensor with shape [num_tokens, num_heads, head_size] - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. - NOTE: It in-place updates the output tensor. - NOTE: FP8 quantization, flash-attn expect the size of - {q,k,v}_descale to be (num_sequences, num_kv_heads). - We use torch's .expand() to avoid duplicating values - """ - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for DifferentialFlashAttentionImpl") - - if self.lambda_full is None: - self.lambda_init = self.differential_flash_attention_config[ - "lambda_init"] - lambda_q1 = self.differential_flash_attention_config["lambda_q1"] - lambda_k1 = self.differential_flash_attention_config["lambda_k1"] - lambda_q2 = self.differential_flash_attention_config["lambda_q2"] - lambda_k2 = self.differential_flash_attention_config["lambda_k2"] - lambda_1 = torch.exp( - torch.sum(lambda_q1 * lambda_k1, dim=-1).float()).type_as(q) - lambda_2 = torch.exp( - torch.sum(lambda_q2 * lambda_k2, dim=-1).float()).type_as(q) - self.lambda_full = lambda_1 - lambda_2 + self.lambda_init - - if not self.used_shared_kv_cache: # need to generate kv-cache - q = q.view(-1, self.num_heads, self.head_size) - k = k.view(-1, self.num_kv_heads, self.head_size) - v = v.view(-1, self.num_kv_heads, self.head_size) - - q1, q2 = self.split_heads(q) - k1, k2 = self.split_heads(k) - v1, v2 = self.split_heads(v) - - # kv_cache shape is (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size) # noqa: E501 - # Split by half along the first dimension. 
- kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache) - assert kv_cache1.is_contiguous(), "kv_cache1 is not contiguous" - assert kv_cache2.is_contiguous(), "kv_cache2 is not contiguous" - - if kv_cache1.numel() != 0: - self.populate_kv_cache(layer, k1, v1, kv_cache1, attn_metadata) - self.populate_kv_cache(layer, k2, v2, kv_cache2, attn_metadata) - - key_cache1, value_cache1 = self.split_kv_cache(kv_cache1) - key_cache2, value_cache2 = self.split_kv_cache(kv_cache2) - else: - key_cache1, value_cache1 = torch.empty(0), torch.empty(0) - key_cache2, value_cache2 = torch.empty(0), torch.empty(0) - attn11 = self.forward_generate_kv_cache(q1, k1, v1, key_cache1, - value_cache1, - attn_metadata) - attn12 = self.forward_generate_kv_cache(q1, k1, v2, key_cache1, - value_cache2, - attn_metadata) - attn11 = attn11.view(q1.shape) - attn12 = attn12.view(q1.shape) - attn1 = torch.cat([attn11, attn12], dim=-1) - - attn21 = self.forward_generate_kv_cache(q2, k2, v1, key_cache2, - value_cache1, - attn_metadata) - attn22 = self.forward_generate_kv_cache(q2, k2, v2, key_cache2, - value_cache2, - attn_metadata) - attn21 = attn21.view(q2.shape) - attn22 = attn22.view(q2.shape) - attn2 = torch.cat([attn21, attn22], dim=-1) - - attn = attn1 - self.lambda_full * attn2 - # attn shape (-1, self.num_heads // 2, 2 * self.head_dim) - attn = self.subln(attn) - attn = attn * (1 - self.lambda_init) - # reshape back to 2 * num_head - attn_output = rearrange(attn, - "... H (two D) -> ... (H two) D", - two=2) - - else: # reuse the kv cache, full attention - q = q.view(-1, self.num_heads, self.head_size) - q1, q2 = self.split_heads(q) - # kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501 - kv_cache1, kv_cache2 = self.split_kv_cache(kv_cache) - key_cache1, value_cache1 = kv_cache1[0], kv_cache1[1] - key_cache2, value_cache2 = kv_cache2[0], kv_cache2[1] - - attn11 = self.forward_with_kv_cache_only(q1, key_cache1, - value_cache1, - attn_metadata) - attn12 = self.forward_with_kv_cache_only(q1, key_cache1, - value_cache2, - attn_metadata) - attn11 = attn11.view(q1.shape) - attn12 = attn12.view(q1.shape) - attn1 = torch.cat([attn11, attn12], dim=-1) - - attn21 = self.forward_with_kv_cache_only(q2, key_cache2, - value_cache1, - attn_metadata) - attn22 = self.forward_with_kv_cache_only(q2, key_cache2, - value_cache2, - attn_metadata) - attn21 = attn21.view(q2.shape) - attn22 = attn22.view(q2.shape) - attn2 = torch.cat([attn21, attn22], dim=-1) - - attn = attn1 - self.lambda_full * attn2 - attn = self.subln(attn) - attn = attn * (1 - self.lambda_init) - # reshape back to 2 * num_head - attn_output = rearrange(attn, - "... H (two D) -> ... (H two) D", - two=2) - attn_output = attn_output.view(-1, self.num_heads * self.head_size) - return attn_output diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py deleted file mode 100644 index de47bb8ebd8fb..0000000000000 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ /dev/null @@ -1,1495 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with Dual chunk flash attention and sparse attention. 
-""" -import math -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import torch.distributed -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import AttentionLayer, AttentionType -from vllm.attention.backends.flash_attn import (FlashAttentionBackend, - FlashAttentionImpl, - FlashAttentionMetadata, - FlashAttentionMetadataBuilder) -from vllm.distributed.parallel_state import get_tensor_model_parallel_rank -from vllm.logger import init_logger -from vllm.utils import async_tensor_h2d -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache, sparse_attn_func) - -logger = init_logger(__name__) - - -class DualChunkFlashAttentionBackend(FlashAttentionBackend): - - accept_output_buffer: bool = False - - @staticmethod - def get_name() -> str: - return "DUAL_CHUNK_FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["DualChunkFlashAttentionImpl"]: - return DualChunkFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["DualChunkFlashAttentionMetadata"]: - return DualChunkFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["DualChunkFlashAttentionMetadataBuilder"]: - return DualChunkFlashAttentionMetadataBuilder - - -@dataclass -class DualChunkFlashAttentionMetadata(FlashAttentionMetadata): - # Block size of the paged kv cache. - block_size: int = 16 - - # Original max position embeddings. - original_max_position_embeddings: int = 0 - - # Chunk size - chunk_size: int = 8192 - - # Local size - local_size: int = 1024 - - # (batch_size,). The orig sequence length per sequence. - orig_seq_lens: Optional[List[int]] = None - - # orig_seq_lens stored as a tensor. - orig_seq_lens_tensor: Optional[torch.Tensor] = None - - # Length scaling factor - scaling_factor: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for intra attention. - seq_lens_intra: Optional[torch.Tensor] = None - - # Max sequence length for intra attention. - max_seq_len_intra: Optional[int] = None - - # (batch_size, num_blocks). Block table for intra attention. - block_tables_intra: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for succ attention. - seq_lens_succ: Optional[torch.Tensor] = None - - # Max sequence length for succ attention. - max_seq_len_succ: Optional[int] = None - - # (batch_size, num_blocks). Block table for succ attention. - block_tables_succ: Optional[torch.Tensor] = None - - # (batch_size,). Sequence lengths for inter attention. - seq_lens_inter: Optional[torch.Tensor] = None - - # Max sequence length for inter attention. 
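# A worked example (illustrative numbers, not from this file) of how a cached
# sequence is partitioned into the intra/succ/inter spans described by the
# fields above and below; decode_metadata computes exactly these quantities.
import math
chunk_size, local_size = 8192, 1024
chunk_len = chunk_size - local_size                        # 7168
cache_seq_len = 20000
chunk_num_curr = (cache_seq_len - 1) // chunk_len          # 2
seq_len_intra = cache_seq_len - chunk_num_curr * chunk_len               # 5664
seq_len_succ = (chunk_num_curr - max(chunk_num_curr - 1, 0)) * chunk_len # 7168
seq_len_inter = max(chunk_num_curr - 1, 0) * chunk_len                   # 7168
assert seq_len_intra + seq_len_succ + seq_len_inter == cache_seq_len
# Length scaling for long context (assuming original_max_position_embeddings
# of 8192 purely for illustration): 0.1 * ln(L / L0) + 1.0, clipped to >= 1.
scaling_factor = max(0.1 * math.log(cache_seq_len / 8192) + 1.0, 1.0)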
- max_seq_len_inter: Optional[int] = None - - _cached_prefill_metadata: Optional[ - "DualChunkFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["DualChunkFlashAttentionMetadata"] = None - - @property - def prefill_metadata(self) -> Optional["DualChunkFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - prefill_metadata = super().prefill_metadata - if prefill_metadata is None: - return None - - prefill_metadata = DualChunkFlashAttentionMetadata( - **prefill_metadata.asdict_zerocopy()) - - prefill_metadata.orig_seq_lens = ( - None if self.orig_seq_lens is None else - self.orig_seq_lens[:self.num_prefills]) - prefill_metadata.orig_seq_lens_tensor = ( - None if self.orig_seq_lens_tensor is None else - self.orig_seq_lens_tensor[:self.num_prefills]) - - if self.original_max_position_embeddings > 0: - assert prefill_metadata.orig_seq_lens_tensor is not None - prefill_metadata.scaling_factor = ( - 0.1 * torch.log(prefill_metadata.orig_seq_lens_tensor / - self.original_max_position_embeddings) + - 1.0).clip(min=1) - - self._cached_prefill_metadata = prefill_metadata - return prefill_metadata - - @property - def decode_metadata(self) -> Optional["DualChunkFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - - decode_metadata = super().decode_metadata - if decode_metadata is None: - return None - - decode_metadata = DualChunkFlashAttentionMetadata( - **decode_metadata.asdict_zerocopy()) - - decode_metadata.orig_seq_lens_tensor = ( - None if self.orig_seq_lens_tensor is None else - self.orig_seq_lens_tensor[self.num_prefills:]) - - assert decode_metadata.orig_seq_lens_tensor is not None - assert decode_metadata.block_tables is not None - - cache_seq_lens = decode_metadata.orig_seq_lens_tensor - chunk_len = self.chunk_size - self.local_size - chunk_num_curr = (cache_seq_lens - 1) // chunk_len - batch_size = decode_metadata.num_decode_tokens - - if self.original_max_position_embeddings > 0: - decode_metadata.scaling_factor = (0.1 * torch.log( - cache_seq_lens / self.original_max_position_embeddings) + - 1.0).clip(min=1) - - seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len - max_seq_len_intra = seq_lens_intra.max().item() - decode_metadata.seq_lens_intra = seq_lens_intra - decode_metadata.max_seq_len_intra = max_seq_len_intra - - block_tables_intra = torch.zeros( - batch_size, - (max_seq_len_intra - 1) // self.block_size + 1, - dtype=decode_metadata.block_tables.dtype, - device=decode_metadata.block_tables.device, - ) - for i in range(batch_size): - st = chunk_num_curr[i] * chunk_len // self.block_size - ed = min( - st + (max_seq_len_intra - 1) // self.block_size + 1, - (cache_seq_lens[i] - 1) // self.block_size + 1, - ) - block_tables_intra[i, :ed - - st] = decode_metadata.block_tables[i, st:ed] - decode_metadata.block_tables_intra = block_tables_intra - - seq_lens_succ = (chunk_num_curr - - (chunk_num_curr - 1).clip(min=0)) * chunk_len - max_seq_len_succ = seq_lens_succ.max().item() - decode_metadata.seq_lens_succ = seq_lens_succ - decode_metadata.max_seq_len_succ = max_seq_len_succ - if max_seq_len_succ: - block_tables_succ = torch.zeros( - batch_size, - (max_seq_len_succ - 1) // self.block_size + 1, - dtype=decode_metadata.block_tables.dtype, - device=decode_metadata.block_tables.device, - ) - for i in range(batch_size): - start = ((chunk_num_curr[i] - 
1).clip(min=0) * chunk_len // - self.block_size) - end = min( - start + (max_seq_len_succ - 1) // self.block_size + 1, - (cache_seq_lens[i] - 1) // self.block_size + 1, - ) - block_tables_succ[ - i, :end - start] = decode_metadata.block_tables[i, - start:end] - decode_metadata.block_tables_succ = block_tables_succ - - seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len - max_seq_len_inter = seq_lens_inter.max().item() - decode_metadata.seq_lens_inter = seq_lens_inter - decode_metadata.max_seq_len_inter = max_seq_len_inter - - self._cached_decode_metadata = decode_metadata - return decode_metadata - - -class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder): - - def prepare(self): - super().prepare() - self.orig_seq_lens: List[int] = [] - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - super()._add_seq_group(inter_data, chunked_prefill_enabled, - prefix_cache_hit) - for prompt_len, seq_len in zip(inter_data.prompt_lens, - inter_data.seq_lens): - self.orig_seq_lens.append(max(prompt_len, seq_len)) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - attn_metadata = super().build(seq_lens, query_lens, - cuda_graph_pad_size, batch_size) - attn_metadata = DualChunkFlashAttentionMetadata( - **attn_metadata.asdict_zerocopy()) - - device = self.runner.device - attn_metadata.orig_seq_lens = self.orig_seq_lens - attn_metadata.orig_seq_lens_tensor = async_tensor_h2d( - self.orig_seq_lens, torch.int, device, self.runner.pin_memory) - - attn_metadata.block_size = self.runner.block_size - dual_chunk_attn_config = getattr(self.runner.model_config.hf_config, - "dual_chunk_attention_config", {}) - attn_metadata.original_max_position_embeddings = \ - dual_chunk_attn_config.get("original_max_position_embeddings", 0) - attn_metadata.chunk_size = dual_chunk_attn_config.get( - "chunk_size", 8192) - attn_metadata.local_size = dual_chunk_attn_config.get( - "local_size", 1024) - - return attn_metadata - - -class DualChunkFlashAttentionImpl(FlashAttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - The prompts might have different lengths, while the generation tokens - always have length 1. - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. 
- """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - layer_idx: int = -1, - dual_chunk_attention_config: Optional[Dict[str, Any]] = None, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "DUAL_CHUNK_FLASH_ATTN backend.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window, sliding_window) - if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if sliding_window is not None: - # NOTE(woosuk): flash-attn's sliding window does not work with - # paged KV cache. - raise ValueError( - "Sliding window is not supported in FlashAttention.") - - support_head_sizes = ( - DualChunkFlashAttentionBackend.get_supported_head_sizes()) - - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - - assert dual_chunk_attention_config is not None - self.chunk_size = dual_chunk_attention_config.get("chunk_size", 8192) - self.local_size = dual_chunk_attention_config.get("local_size", 1024) - self.original_max_position_embeddings = dual_chunk_attention_config.get( - "original_max_position_embeddings", 0) - self.sparse_attention_config = dual_chunk_attention_config.get( - "sparse_attention_config", None) - if not self.sparse_attention_config: - logger.warning_once("Sparse attention will not be enabled as " - "sparse attention config is not provided.") - self.sparse_attention_enabled = dual_chunk_attention_config.get( - "sparse_attention_enabled", self.sparse_attention_config - is not None) - self.sparse_attention_threshold = dual_chunk_attention_config.get( - "sparse_attention_threshold", 32768) - self.sparse_attention_last_q = dual_chunk_attention_config.get( - "sparse_attention_last_q", 64) - self.layer_idx = layer_idx - self.dual_chunk_attention_config = dual_chunk_attention_config - - if self.sparse_attention_config: - self.sparse_attention_config = { - int(i): j - for i, j in self.sparse_attention_config[ - self.layer_idx].items() - } - start_head = self.num_heads * get_tensor_model_parallel_rank() - end_head = start_head + self.num_heads - self.sparse_attention_config = [ - self.sparse_attention_config[i] - for i in range(start_head, end_head) - ] - - if self.sparse_attention_enabled: - self.arange = torch.arange(self.sparse_attention_last_q, - device="cuda") - self.last_q_mask = (self.arange[None, None, :, None] - >= self.arange[None, None, None, :]) - - def forward( # type: ignore - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: DualChunkFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with DualChunkFlashAttention. 
- Args: - query: shape = [num_tokens, num_heads * head_size] - query_succ: shape = [num_tokens, num_heads * head_size] - query_inter: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads * head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is None, "Output tensor not supported for DualChunk" - - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashAttentionImpl") - - ( - query, - query_succ, - query_inter, - query_succ_critical, - query_inter_critical, - ) = torch.split(query, query.shape[-1] // 5, dim=-1) - - assert ( - query_succ is not None and query_inter is not None - ), "query_succ and query_inter are required in Dual Chunk Attention." - - num_tokens, hidden_size = query.shape - - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - query_succ = query_succ.view(-1, self.num_heads, self.head_size) - query_inter = query_inter.view(-1, self.num_heads, self.head_size) - query_succ_critical = query_succ_critical.view(-1, self.num_heads, - self.head_size) - query_inter_critical = query_inter_critical.view( - -1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if self.original_max_position_embeddings > 0: - if prefill_meta := attn_metadata.prefill_metadata: - assert prefill_meta.scaling_factor is not None - assert prefill_meta.query_start_loc is not None - assert prefill_meta.orig_seq_lens is not None - current_start = 0 - query_start_loc_cpu = prefill_meta.query_start_loc.cpu() - for i in range(len(prefill_meta.orig_seq_lens)): - current_end = (current_start + - (query_start_loc_cpu[i + 1] - - query_start_loc_cpu[i]).item()) - key[current_start:current_end].mul_( - prefill_meta.scaling_factor[i]) - current_start = current_end - assert current_end <= attn_metadata.num_prefill_tokens - if decode_meta := attn_metadata.decode_metadata: - assert decode_meta.scaling_factor is not None - scaling_factor = decode_meta.scaling_factor - key[attn_metadata.num_prefill_tokens:].mul_( - scaling_factor.unsqueeze(-1).unsqueeze(-1)) - - if kv_cache is not None and kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - output = torch.empty_like(query) - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - decode_query_succ = query_succ[num_prefill_tokens:] - decode_query_inter = query_inter[num_prefill_tokens:] - - # QKV for prefill. 
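# A small standalone sketch (illustrative only) of the flattened
# [prefill | decode] token layout that the slicing above and below relies on:
# prefill tokens come first, decode tokens (one per sequence) follow.
import torch
num_prefill_tokens, num_decode_tokens = 7, 3
q = torch.randn(num_prefill_tokens + num_decode_tokens, 2, 8)
prefill_q = q[:num_prefill_tokens]   # attends over in-batch K/V
decode_q = q[num_prefill_tokens:]    # reads K/V from the paged cache
assert prefill_q.shape[0] + decode_q.shape[0] == q.shape[0]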
- query = query[:num_prefill_tokens] - query_succ = query_succ[:num_prefill_tokens] - query_inter = query_inter[:num_prefill_tokens] - query_succ_critical = query_succ_critical[:num_prefill_tokens] - query_inter_critical = query_inter_critical[:num_prefill_tokens] - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if (kv_cache is None or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention, called during the profiling run. - out = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - ) - assert output[:num_prefill_tokens].shape == out.shape - output[:num_prefill_tokens] = out - else: - # prefix-enabled attention - assert prefill_meta.seq_lens is not None - assert prefill_meta.orig_seq_lens is not None - output[:num_prefill_tokens] = ( - self._dual_chunk_flash_attn_prefill( - q=query, - q_succ=query_succ, - q_inter=query_inter, - q_succ_critical=query_succ_critical, - q_inter_critical=query_inter_critical, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - orig_seq_lens=prefill_meta.orig_seq_lens, - scaling_factor=prefill_meta.scaling_factor, - softmax_scale=self.scale, - causal=True, - window_size=(-1, -1), - alibi_slopes=self.alibi_slopes, - block_table=prefill_meta.block_tables, - chunk_size=self.chunk_size, - local_size=self.local_size, - )) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - output[num_prefill_tokens:] = ( - self._dual_chunk_flash_attn_decoding( - decode_query.unsqueeze(1), - decode_query_succ.unsqueeze(1), - decode_query_inter.unsqueeze(1), - key_cache, - value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - chunk_size=self.chunk_size, - local_size=self.local_size, - original_max_position_embeddings=self. - original_max_position_embeddings, - decode_meta=decode_meta, - ).squeeze(1)) - # Reshape the output tensor. 
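# For intuition (standalone sketch, not from this file): the decoding call
# above merges the intra/succ/inter partial results by log-sum-exp weighting,
# which is conceptually a softmax over the stacked branches:
import torch
outs = torch.randn(3, 2, 5, 4, 8)   # (branch, batch, seqlen_q, heads, dim)
lses = torch.randn(3, 2, 4, 5)      # (branch, batch, heads, seqlen_q)
w = torch.softmax(lses, dim=0)      # numerically stable branch weights
merged = (outs * w.unsqueeze(-1).transpose(2, 3)).sum(0)
assert merged.shape == (2, 5, 4, 8)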
- return output.view(num_tokens, hidden_size) - - def _dual_chunk_flash_attn_prefill( - self, - q, - q_succ, - q_inter, - q_succ_critical, - q_inter_critical, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - orig_seq_lens: List[int], - scaling_factor: torch.Tensor, - softmax_scale: float, - causal: Optional[bool] = True, - window_size: Tuple[int, int] = (-1, -1), - alibi_slopes: Optional[torch.Tensor] = None, - block_table: Optional[torch.Tensor] = None, - chunk_size: int = 8192, - local_size: int = 1024, - ): - if alibi_slopes is not None: - raise ValueError( - "Dual Chunk Attention does not support alibi_slopes") - if not causal: - raise ValueError( - "Dual Chunk Attention does not support causal=False") - if window_size != (-1, -1): - raise ValueError( - "Dual Chunk Attention does not support window_size") - - cu_seqlens_q_cpu = cu_seqlens_q.cpu().tolist() - cu_seqlens_k_cpu = cu_seqlens_k.cpu().tolist() - all_outputs = [] - - for i in range(0, len(cu_seqlens_q_cpu) - 1): - qs = cu_seqlens_q_cpu[i] - qe = cu_seqlens_q_cpu[i:i + 2][-1] - ks = cu_seqlens_k_cpu[i] - ke = cu_seqlens_k_cpu[i:i + 2][-1] - - current_q = q[qs:qe] - current_q_succ = q_succ[qs:qe] - current_q_inter = q_inter[qs:qe] - current_q_succ_critical = q_succ_critical[qs:qe] - current_q_inter_critical = q_inter_critical[qs:qe] - - if block_table is None: - current_k = k[ks:ke] - current_v = v[ks:ke] - current_block_table = None - current_orig_seq_len = orig_seq_lens[i] - else: - current_block_table = block_table[i] - current_orig_seq_len = orig_seq_lens[i] - current_k = k - current_v = v - sparse_attn_enabled = (self.sparse_attention_enabled - and current_orig_seq_len - > self.sparse_attention_threshold) - - if current_q.shape[0] == 0: - continue - - if current_k.shape[0] == 0: - all_outputs.append( - torch.zeros( - (current_q.shape[0], current_q.shape[1], v.shape[2]), - device=q.device, - dtype=q.dtype, - )) - continue - - current_output = torch.empty_like(current_q) - group_size = int(current_q.size(-2) / current_k.size(-2)) - - if sparse_attn_enabled: - num_device_q_heads = current_q.size(-2) - heads_vertical_size = torch.empty(size=(num_device_q_heads, ), - dtype=torch.int32) - heads_slash_size = torch.empty(size=(num_device_q_heads, ), - dtype=torch.int32) - for head_id in range(current_q.size(-2)): - ( - ty, - vertical_size, - slash_size, - _, - ) = self.sparse_attention_config[head_id] - assert ty == "vertical_and_slash", "only support slash mode" - - if vertical_size == 30: - vertical_size += 100 - heads_vertical_size[head_id] = vertical_size - heads_slash_size[head_id] = slash_size - - current_output = self._dual_chunk_flash_attn_prefill_func( - current_q, # allheads - current_q_succ, - current_q_inter, - current_q_succ_critical, - current_q_inter_critical, - current_k, - current_v, - current_block_table, - softmax_scale, - chunk_size, - local_size, - scaling_factor[i].item(), - ke - ks, - sparse_attn_enabled=sparse_attn_enabled, - heads_vertical_size=heads_vertical_size, - heads_slash_size=heads_slash_size, - group_size=group_size) - else: - for head_id in range(current_q.size(-2)): - # (seq_len, num_heads, head_size) - current_q_head = current_q[:, head_id, :].unsqueeze(1) - current_q_succ_head = \ - current_q_succ[:, head_id, :].unsqueeze(1) - current_q_inter_head = \ - current_q_inter[:, head_id, :].unsqueeze(1) - current_q_succ_head_critical = \ - current_q_succ_critical[:, head_id, :].unsqueeze(1) - current_q_inter_head_critical = \ - current_q_inter_critical[:, head_id, :].unsqueeze(1) - if block_table is not 
None: - current_k_head = current_k[..., head_id // - group_size, :].unsqueeze(2) - current_v_head = current_v[..., head_id // - group_size, :].unsqueeze(2) - - else: - current_k_head = current_k[:, head_id, :].unsqueeze(1) - current_v_head = current_v[:, head_id, :].unsqueeze(1) - - current_out = self._dual_chunk_flash_attn_prefill_func( - current_q_head, - current_q_succ_head, - current_q_inter_head, - current_q_succ_head_critical, - current_q_inter_head_critical, - current_k_head, - current_v_head, - current_block_table, - softmax_scale, - chunk_size, - local_size, - scaling_factor[i].item(), - ke - ks, - sparse_attn_enabled=sparse_attn_enabled, - ) - current_output[:, head_id:head_id + 1, :] = current_out - all_outputs.append(current_output) - return torch.cat(all_outputs, dim=0) - - def _dual_chunk_flash_attn_prefill_func( - self, - q, - q_succ, - q_inter, - q_succ_critical, - q_inter_critical, - k, - v, - block_table, - softmax_scale: float, - chunk_size: int, - local_size: int, - scaling_factor: float, - k_length: int, - sparse_attn_enabled: Optional[bool] = True, - heads_vertical_size=None, - heads_slash_size=None, - group_size=None, - ): - flash_results = [] - chunk_len = chunk_size - local_size - - if block_table is not None: - block_size = v.shape[1] - if chunk_len % block_size != 0: - raise ValueError("chunk_len must be divisible by block_size.") - else: - block_size = 1 - - if self.original_max_position_embeddings > 0: - softmax_scale = softmax_scale * scaling_factor - - begin = k_length - q.shape[0] - while begin < k_length: - flash_per_chunk = [] - - prev_chunk_end_pos = (begin // chunk_len) * chunk_len - next_chunk_end_pos = prev_chunk_end_pos + chunk_len - end = min(next_chunk_end_pos, k_length) - qbegin = begin - (k_length - q.shape[0]) - qend = end - (k_length - q.shape[0]) - - qk_chunks = [] - q_states_intra = q[qbegin:qend] - # choose critical token - if block_table is not None: - block_tables_intra = _get_block(block_table, block_size, - prev_chunk_end_pos, end) - k_states_intra = k[block_tables_intra].view( - -1, *k.shape[-2:])[:(end - prev_chunk_end_pos)] - v_states_intra = v[block_tables_intra].view( - -1, *v.shape[-2:])[:(end - prev_chunk_end_pos)] - else: - block_tables_intra = None - k_states_intra = k[prev_chunk_end_pos:end] - v_states_intra = v[prev_chunk_end_pos:end] - - if sparse_attn_enabled: - last_q_size = min(qend - qbegin, self.sparse_attention_last_q) - _, num_device_k_heads, head_dim = k_states_intra.shape - k_states_intra = (k_states_intra.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, head_dim)) - v_states_intra = (v_states_intra.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, head_dim)) - qk_chunks.append( - (q_states_intra.transpose(0, 1)[:, -last_q_size:] * - softmax_scale) @ k_states_intra.permute(1, 2, 0)) - - if prev_chunk_end_pos - chunk_len >= 0: - q_states_succ = q_succ[qbegin:qend] - q_states_succ_critical = q_succ_critical[qbegin:qend] - if block_table is not None: - block_tables_succ = _get_block( - block_table, block_size, - prev_chunk_end_pos - chunk_len, prev_chunk_end_pos) - k_states_succ = k[block_tables_succ].view( - -1, *k.shape[-2:])[:chunk_len] - v_states_succ = v[block_tables_succ].view( - -1, *v.shape[-2:])[:chunk_len] - else: - k_states_succ = k[prev_chunk_end_pos - - chunk_len:prev_chunk_end_pos] - v_states_succ = v[prev_chunk_end_pos - - chunk_len:prev_chunk_end_pos] - - if sparse_attn_enabled: - k_states_succ = 
(k_states_succ.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - v_states_succ = (v_states_succ.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - qk_chunks.append((q_states_succ_critical.transpose( - 0, 1)[:, -last_q_size:] * softmax_scale) - @ k_states_succ.permute(1, 2, 0)) - - if prev_chunk_end_pos - chunk_len * 2 >= 0: - q_states_inter = q_inter[qbegin:qend] - q_states_inter_critical = q_inter_critical[qbegin:qend] - if block_table is not None: - block_tables_inter = _get_block( - block_table, block_size, 0, - prev_chunk_end_pos - chunk_len) - k_states_inter = k[block_tables_inter].view( - -1, *k.shape[-2:])[:(prev_chunk_end_pos - chunk_len)] - v_states_inter = v[block_tables_inter].view( - -1, *v.shape[-2:])[:(prev_chunk_end_pos - chunk_len)] - else: - k_states_inter = k[:prev_chunk_end_pos - chunk_len] - v_states_inter = v[:prev_chunk_end_pos - chunk_len] - - if sparse_attn_enabled: - k_states_inter = (k_states_inter.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - v_states_inter = (v_states_inter.unsqueeze(2).repeat( - 1, 1, group_size, - 1).reshape(-1, num_device_k_heads * group_size, - head_dim)) - qk_chunks.append((q_states_inter_critical.transpose( - 0, 1)[:, -last_q_size:] * softmax_scale) - @ k_states_inter.permute(1, 2, 0)) - - if sparse_attn_enabled: - reversed_qk = qk_chunks[::-1] - qk = torch.cat(reversed_qk, dim=-1) - - qk[:, :, -last_q_size:] = torch.where( - self.last_q_mask[..., -last_q_size:, - -last_q_size:].to(qk.device), - qk[:, :, -last_q_size:], -torch.inf) - qk = F.softmax(qk, dim=-1, dtype=torch.float32) - - vertical = qk.sum(-2, keepdim=True) - vertical[..., :30] = torch.inf - - # Avoid sorting by using the min/max ints to fill the indexer - # buffers. 
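# Standalone sketch of the sentinel pattern referenced above (values are
# illustrative): index buffers are pre-filled with int32 extremes so the
# unused tail never needs sorting or masking; a per-head count records how
# many leading entries are real, and consumers read only those.
import torch
int32_max = torch.iinfo(torch.int32).max   #  2147483647
int32_min = torch.iinfo(torch.int32).min   # -2147483648
buf = torch.full((8,), int32_max, dtype=torch.int64)
buf[:3] = torch.tensor([42, 7, 19])        # three real vertical indices
count = 3                                  # consumers read only buf[:count]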
- int32_max = torch.iinfo(torch.int32).max - int32_min = torch.iinfo(torch.int32).min - n_heads = qk.size()[0] - max_slash_topk = torch.max(heads_slash_size).item() - max_vertical_topk = torch.max(heads_vertical_size).item() - # store each head's slash topk, vertical topk - vertical = vertical.reshape((n_heads, -1)) - # prevent out of range when prompt size < max_vertical_topk - max_vertical_topk = min(vertical.shape[-1], max_vertical_topk) - vertical_topk_buffer = torch.topk(vertical, max_vertical_topk, - -1).indices - slash_topk_buffer = torch.empty(size=(n_heads, max_slash_topk), - dtype=torch.int64, - device=qk.device) - for head_i in range(n_heads): - # (nqheads=1, lastq, k_len) - head_score = qk[head_i:head_i + 1, :, :] - slash_scores = _sum_all_diagonal_matrix(head_score) - if head_score.size(1) != 1: - # drop right up corner - slash_scores = slash_scores[..., :-last_q_size + 1] - slash_scores[..., -100:] = torch.inf - - head_slash_size = heads_slash_size[head_i] - head_slash_size = min(head_slash_size, vertical.size(-1)) - slash_topk = torch.topk(slash_scores, head_slash_size, - -1).indices - #(nheads, max_topk) - slash_topk_buffer[head_i, :head_slash_size] = slash_topk - - # reset heads topk - heads_slash_size[head_i] = head_slash_size - heads_vertical_size[head_i] = min( - heads_vertical_size[head_i], max_vertical_topk) - - # store - vertical_buffer = torch.full((n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - succ_vertical_buffer = torch.full((n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - succ_slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - inter_vertical_buffer = torch.full( - (n_heads, max_vertical_topk), - int32_max, - dtype=torch.int64, - device=q.device) - inter_slash_buffer = torch.full((n_heads, max_slash_topk), - int32_min, - dtype=torch.int64, - device=q.device) - - vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - succ_vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - succ_slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - inter_vertical_size_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - inter_slash_sizes_buffer = torch.empty(size=(n_heads, ), - dtype=torch.int32, - device=q.device) - - for head_i in range(n_heads): - vertical_topk = vertical_topk_buffer[ - head_i, :heads_vertical_size[head_i]] - # intra - intra_vertical_indices = vertical_topk[ - vertical_topk >= - prev_chunk_end_pos] - prev_chunk_end_pos - if intra_vertical_indices.nelement() == 0: - intra_vertical_indices = torch.cat([ - intra_vertical_indices, - torch.arange(0, - k_states_intra.size(0), - max(1, - k_states_intra.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - slash_topk = slash_topk_buffer[ - head_i, :heads_slash_size[head_i]] - intra_slash_indices = ( - (qk.size(-1) - 1) - - slash_topk[slash_topk >= prev_chunk_end_pos]) - # fill buffer - v_count = intra_vertical_indices.nelement() - s_count = intra_slash_indices.nelement() - vertical_size_buffer[head_i] = v_count - slash_sizes_buffer[head_i] = s_count - vertical_buffer[head_i, :v_count].copy_( - intra_vertical_indices) - 
slash_buffer[head_i, :s_count].copy_(intra_slash_indices) - # succ - if prev_chunk_end_pos - chunk_len >= 0: - succ_vertical_indices = vertical_topk[ - (vertical_topk < prev_chunk_end_pos) - & (vertical_topk >= prev_chunk_end_pos - - chunk_len)] - (prev_chunk_end_pos - chunk_len) - # TODO: support no vertical - if succ_vertical_indices.nelement() == 0: - succ_vertical_indices = torch.cat([ - succ_vertical_indices, - torch.arange( - 0, - k_states_succ.size(0), - max(1, - k_states_succ.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - succ_slash_indices = ( - (prev_chunk_end_pos + (qend - qbegin) - 1) - - slash_topk[((slash_topk >= - (prev_chunk_end_pos - chunk_len)) & - (slash_topk < (prev_chunk_end_pos + - (qend - qbegin))))]) - if succ_slash_indices.nelement() == 0: - succ_slash_indices = torch.cat([ - succ_slash_indices, - torch.arange( - 0, - k_states_succ.size(0), - max(1, - k_states_succ.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - # fill buffer - v_count = succ_vertical_indices.nelement() - s_count = succ_slash_indices.nelement() - succ_vertical_size_buffer[head_i] = v_count - succ_slash_sizes_buffer[head_i] = s_count - succ_vertical_buffer[head_i, :v_count].copy_( - succ_vertical_indices) - succ_slash_buffer[head_i, :s_count].copy_( - succ_slash_indices) - - if prev_chunk_end_pos - 2 * chunk_len >= 0: - inter_vertical_indices = vertical_topk[ - vertical_topk < prev_chunk_end_pos - chunk_len] - - if inter_vertical_indices.nelement() == 0: - inter_vertical_indices = torch.cat([ - inter_vertical_indices, - torch.arange( - 0, - k_states_inter.size(0), - max(1, - k_states_inter.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - inter_slash_indices = ( - (prev_chunk_end_pos - chunk_len + - (qend - qbegin) - 1) - - slash_topk[slash_topk < (prev_chunk_end_pos - - chunk_len + - (qend - qbegin))]) - if inter_slash_indices.nelement() == 0: - inter_slash_indices = torch.cat([ - inter_slash_indices, - torch.arange( - 0, - k_states_inter.size(0), - max(1, - k_states_inter.size(0) / 5), - dtype=torch.int32, - device=intra_vertical_indices.device) - ]) - # fill buffer - v_count = inter_vertical_indices.nelement() - s_count = inter_slash_indices.nelement() - inter_vertical_size_buffer[head_i] = v_count - inter_slash_sizes_buffer[head_i] = s_count - inter_vertical_buffer[head_i, :v_count].copy_( - inter_vertical_indices) - inter_slash_buffer[head_i, :s_count].copy_( - inter_slash_indices) - else: - intra_vertical_indices, intra_slash_indices = None, None - succ_vertical_indices, succ_slash_indices = None, None - inter_vertical_indices, inter_slash_indices = None, None - - if sparse_attn_enabled: - flash_result = self._do_flash_attn( - q_states_intra, - k_states_intra, - v_states_intra, - softmax_scale=softmax_scale, - causal=True, - stage="intra", - vertical_indices=vertical_buffer, - slash_indices=slash_buffer, - vertical_indices_count=vertical_size_buffer, - slash_indices_count=slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_intra, - k_states_intra, - v_states_intra, - softmax_scale=softmax_scale, - causal=True, - stage="intra", - vertical_indices=intra_vertical_indices, - slash_indices=intra_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - if prev_chunk_end_pos - chunk_len >= 0: - if sparse_attn_enabled: - flash_result = 
self._do_flash_attn( - q_states_succ, - k_states_succ, - v_states_succ, - softmax_scale=softmax_scale, - causal=False, - stage="succ", - vertical_indices=succ_vertical_buffer, - slash_indices=succ_slash_buffer, - vertical_indices_count=succ_vertical_size_buffer, - slash_indices_count=succ_slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_succ, - k_states_succ, - v_states_succ, - softmax_scale=softmax_scale, - causal=False, - stage="succ", - vertical_indices=succ_vertical_indices, - slash_indices=succ_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - if prev_chunk_end_pos - chunk_len * 2 >= 0: - if sparse_attn_enabled: - flash_result = self._do_flash_attn( - q_states_inter, - k_states_inter, - v_states_inter, - softmax_scale=softmax_scale, - causal=False, - stage="inter", - vertical_indices=inter_vertical_buffer, - slash_indices=inter_slash_buffer, - vertical_indices_count=inter_vertical_size_buffer, - slash_indices_count=inter_slash_sizes_buffer, - mergehead_softmax_scale=softmax_scale, - sparse_attn_enabled=sparse_attn_enabled) - else: - flash_result = self._do_flash_attn( - q_states_inter, - k_states_inter, - v_states_inter, - softmax_scale=softmax_scale, - causal=False, - stage="inter", - vertical_indices=inter_vertical_indices, - slash_indices=inter_slash_indices, - sparse_attn_enabled=sparse_attn_enabled) - flash_per_chunk.append(flash_result) - - flash_results.append(flash_per_chunk) - begin = end - - attn_output = self._merge_attn_outputs(flash_results) - del flash_results - return attn_output - - def _do_flash_attn( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - softmax_scale: float, - causal: bool = True, - max_seqlen_k: Optional[int] = None, - stage: str = "intra", - vertical_indices: Optional[torch.Tensor] = None, - slash_indices: Optional[torch.Tensor] = None, - vertical_indices_count: Optional[torch.Tensor] = None, - slash_indices_count: Optional[torch.Tensor] = None, - mergehead_softmax_scale: Optional[float] = None, - sparse_attn_enabled: Optional[bool] = False, - ): - if max_seqlen_k is None: - max_seqlen_k = key_states.shape[0] - - q_len = query_states.shape[0] - q_heads = query_states.shape[1] - h_dim = query_states.shape[-1] - - if sparse_attn_enabled: - assert slash_indices is not None - if stage == "intra": - assert causal - else: - assert not causal - - query_states = query_states.unsqueeze(0).transpose(1, 2) - key_states = key_states.unsqueeze(0).transpose(1, 2) - value_states = value_states.unsqueeze(0).transpose(1, 2) - - q = query_states - k = key_states - v = value_states - - if (vertical_indices_count is not None and \ - slash_indices_count is not None): - assert mergehead_softmax_scale is not None - - res, s_lse = _vertical_slash_sparse_attention( - q, - k, - v, - vertical_indices, - slash_indices, - mergehead_softmax_scale, - causal=causal, - stage=stage, - vertical_indices_count=vertical_indices_count, - slash_indices_count=slash_indices_count) - res = res.view(q_heads, q_len, - h_dim).transpose(0, 1) # (qlen,nhead,h_dim) - s_lse = s_lse.view( - q_heads, q_len, - 1).squeeze(-1).unsqueeze(0).float() # (1, nhead,qlen) - else: - res, s_lse = _vertical_slash_sparse_attention(q, - k, - v, - vertical_indices, - slash_indices, - softmax_scale, - causal=causal, - stage=stage) - res = res.view(q_len, q_heads, h_dim) - s_lse = s_lse.view(q_len, q_heads, 
1).transpose(0, 2).float() - return res, s_lse - - output, softmax_lse = flash_attn_varlen_func( - q=query_states, - k=key_states, - v=value_states, - softmax_scale=softmax_scale, - cu_seqlens_q=torch.tensor([0, query_states.shape[0]], - dtype=torch.int32, - device=query_states.device), - max_seqlen_q=query_states.shape[0], - cu_seqlens_k=torch.tensor([0, max_seqlen_k], - dtype=torch.int32, - device=query_states.device), - max_seqlen_k=max_seqlen_k, - causal=causal, - return_softmax_lse=True, - ) - softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0, - 2).float() - return output, softmax_lse - - def _merge_attn_outputs( - self, - flash_results: List[List[Tuple[torch.Tensor, torch.Tensor]]], - return_lse: Optional[bool] = False, - ) -> torch.Tensor: - attn_outputs_all = [] - logits_all = [] - - for flash_per_chunk in flash_results: - if len(flash_per_chunk) == 1: - attn_outputs_all.append(flash_per_chunk[0][0]) - if return_lse: - logits_all.append(flash_per_chunk[0][1]) - continue - - attn_outputs = torch.stack([ - flash_attn_output[0] for flash_attn_output in flash_per_chunk - ]) - logits = torch.stack([ - flash_attn_output[1] for flash_attn_output in flash_per_chunk - ]) - logits = logits.to(torch.float32) - - if return_lse: - max_val = torch.max(logits, dim=0).values - diff = torch.abs(logits[0] - logits[1]) - log_sum_exp = max_val + torch.log1p(torch.exp(-diff)) - logits_all.append(log_sum_exp) - - max_logits = torch.max(logits, dim=0).values - stable_logits = logits - max_logits.unsqueeze(0) - lse_s = torch.exp(stable_logits).detach() - lse_sum = torch.sum(lse_s, dim=0) - lse_s /= lse_sum - attn_outputs *= lse_s.unsqueeze(-1).transpose(2, 3).squeeze(1) - attn_outputs_all.append(attn_outputs.sum(dim=0)) - - if return_lse: - return (torch.cat(attn_outputs_all, - dim=0), torch.cat(logits_all, dim=-1)) - else: - return torch.cat(attn_outputs_all, dim=0) - - def _dual_chunk_flash_attn_decoding( - self, - query: torch.Tensor, - query_succ: torch.Tensor, - query_inter: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_table: torch.Tensor, - cache_seqlens: torch.Tensor, - softmax_scale: float, - causal: bool, - alibi_slopes: Optional[torch.Tensor], - chunk_size: int, - local_size: int, - original_max_position_embeddings: int, - decode_meta: DualChunkFlashAttentionMetadata, - ): - if not causal: - raise ValueError( - "Dual Chunk Attention does not support causal=False") - - block_size = value_cache.shape[1] - chunk_len = chunk_size - local_size - if chunk_len % block_size != 0: - raise ValueError("chunk_len must be divisible by block_size.") - if original_max_position_embeddings > 0: - assert decode_meta.scaling_factor is not None - scaling_factor = decode_meta.scaling_factor - query = (query * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype - ) # possible for numerical issue, need to fused in the kernel - query_succ = (query_succ * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype) - query_inter = (query_inter * scaling_factor.view(-1, 1, 1, 1)).to( - query.dtype) - outputs_list = [] - softmax_lses_list = [] - - # intra-attention - intra_output, intra_softmax_lse = ( - self._dual_chunk_flash_attn_decoding_with_exp_sums( - query, - key_cache, - value_cache, - decode_meta.block_tables_intra, - decode_meta.seq_lens_intra, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(intra_output) - softmax_lses_list.append(intra_softmax_lse) - - # succ-attention - if decode_meta.max_seq_len_succ: - succ_output, succ_softmax_lse = ( - 
self._dual_chunk_flash_attn_decoding_with_exp_sums( - query_succ, - key_cache, - value_cache, - decode_meta.block_tables_succ, - decode_meta.seq_lens_succ, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(succ_output) - softmax_lses_list.append(succ_softmax_lse) - - # inter-attention - if decode_meta.max_seq_len_inter: - inter_output, inter_softmax_lse = ( - self._dual_chunk_flash_attn_decoding_with_exp_sums( - query_inter, - key_cache, - value_cache, - block_table[:, :decode_meta.max_seq_len_inter], - decode_meta.seq_lens_inter, - softmax_scale, - alibi_slopes, - causal=False, - )) - outputs_list.append(inter_output) - softmax_lses_list.append(inter_softmax_lse) - outputs = torch.stack(outputs_list, dim=0) - del outputs_list - softmax_lses = torch.stack(softmax_lses_list, dim=0).to(torch.float32) - del softmax_lses_list - max_logits = torch.max(softmax_lses, dim=0).values - stable_logits = softmax_lses - max_logits.unsqueeze(0) - lse_s = torch.exp(stable_logits).detach() - lse_sum = torch.sum(lse_s, dim=0) - lse_s /= lse_sum - outputs *= lse_s.unsqueeze(-1).transpose(2, 3) - return outputs.sum(0) - - def _dual_chunk_flash_attn_decoding_with_exp_sums( - self, - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_table: torch.Tensor, - cache_seqlens: torch.Tensor, - softmax_scale: float, - alibi_slopes: Optional[torch.Tensor], - causal: bool, - ): - out, softmax_lse = flash_attn_with_kvcache( - q=query, - k_cache=key_cache, - v_cache=value_cache, - block_table=block_table, - cache_seqlens=cache_seqlens, - softmax_scale=softmax_scale, - alibi_slopes=alibi_slopes, - causal=causal, - return_softmax_lse=True, - ) - mask = (cache_seqlens == 0) - out[mask] = 0 - softmax_lse[mask] = -float("inf") - return out, softmax_lse - - -def _vertical_slash_sparse_attention( - query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD] - key: torch.Tensor, # [BATCH, N_HEADS, N_KV_CTX, D_HEAD] - value: torch.Tensor, # [BATCH, N_HEADS, N_KV_CTX, D_HEAD] - v_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_V] - s_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_S] - softmax_scale: float, - causal: bool = True, - stage: str = "intra", - block_size_M: int = 64, - block_size_N: int = 64, - vertical_indices_count: torch.Tensor = None, # [N_HEADS,] - slash_indices_count: torch.Tensor = None, -): - if stage == "intra": - assert causal - else: - assert not causal - - batch_size, num_heads, context_size, head_dim = query.shape - _, _, kv_seq_len, _ = key.shape - - if head_dim not in [16, 32, 64, 128, 256, 512]: - target_dim = 2**math.ceil(math.log2(head_dim)) - head_dim - query = F.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0]) - key = F.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0]) - value = F.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0]) - - v_idx = v_idx.to(torch.int32).reshape( - (batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] - s_idx = s_idx.to(torch.int32).reshape( - (batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] - q_seqlens = torch.tensor([context_size], - dtype=torch.int32, - device=query.device) - kv_seqlens = torch.tensor([kv_seq_len], - dtype=torch.int32, - device=query.device) - - if vertical_indices_count is not None and slash_indices_count is not None: - ( - block_count, - block_offset, - column_count, - column_index, - ) = ops.convert_vertical_slash_indexes_mergehead( - q_seqlens, kv_seqlens, v_idx, s_idx, vertical_indices_count, - slash_indices_count, context_size, block_size_M, block_size_N, - causal) - else: - ( - block_count, - 
block_offset, - column_count, - column_index, - ) = ops.convert_vertical_slash_indexes(q_seqlens, kv_seqlens, v_idx, - s_idx, context_size, - block_size_M, block_size_N, - causal) - - q = query.transpose(1, 2).contiguous() - k = key.transpose(1, 2).contiguous() - v = value.transpose(1, 2).contiguous() - out, lse = sparse_attn_func( - q, - k, - v, - block_count, - block_offset, - column_count, - column_index, - causal=causal, - softmax_scale=softmax_scale, - return_softmax_lse=True, - ) - out = out.transpose(1, 2).contiguous() - softmax_lse = lse.reshape(*lse.shape, 1) - return (out[..., :context_size, :head_dim], - softmax_lse[..., :context_size, :]) - - -def _sum_all_diagonal_matrix(mat: torch.tensor): - h, n, m = mat.shape - # Zero matrix used for padding - zero_mat = torch.zeros((h, n, n), device=mat.device) - # pads the matrix on left and right - mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) - # Change the strides - mat_strided = mat_padded.as_strided((1, n, n + m), - (n * (2 * n + m), 2 * n + m + 1, 1)) - # Sums the resulting matrix's columns - sum_diags = torch.sum(mat_strided, 1) - return sum_diags[:, 1:] # drop left bottom corner - - -def _get_block(block_table: torch.Tensor, block_size: int, begin: int, - end: int): - begin_block = begin // block_size - end_block = (end - 1) // block_size + 1 - return block_table[begin_block:end_block] diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py deleted file mode 100755 index edb3afb4aa075..0000000000000 --- a/vllm/attention/backends/flash_attn.py +++ /dev/null @@ -1,929 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with FlashAttention.""" -from collections import defaultdict -from dataclasses import dataclass -from itertools import accumulate -from typing import Dict, List, Optional, Tuple, Type - -import torch - -from vllm import _custom_ops as ops -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -# yapf: enable -from vllm.attention.backends.utils import ( - PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, - compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, - get_seq_len_block_table_args, is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, is_block_tables_empty) -from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, - get_flash_attn_version) -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) - -logger = init_logger(__name__) - - -class FlashAttentionBackend(AttentionBackend): - - accept_output_buffer: bool = True - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @staticmethod - def get_name() -> str: - return "FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: - return FlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return FlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: - return FlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> 
Type["CommonAttentionState"]:
-        return CommonAttentionState
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        if block_size % 16 != 0:
-            raise ValueError("Block size must be a multiple of 16.")
-        return (2, num_blocks, block_size, num_kv_heads, head_size)
-
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        src_key_cache = src_kv_cache[0]
-        dst_key_cache = dst_kv_cache[0]
-        ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
-        src_value_cache = src_kv_cache[1]
-        dst_value_cache = dst_kv_cache[1]
-        ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
-
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        key_caches = [kv_cache[0] for kv_cache in kv_caches]
-        value_caches = [kv_cache[1] for kv_cache in kv_caches]
-
-        ops.copy_blocks(key_caches, value_caches, src_to_dists)
-
-
-@dataclass
-class FlashAttentionMetadata(AttentionMetadata):
-    """Metadata for FlashAttentionBackend.
-
-    NOTE: Any python object stored here is not updated when it is
-    cuda-graph replayed. If you have values that need to be changed
-    dynamically, they should be stored in a tensor. The tensor has to be
-    updated from the `CUDAGraphRunner.forward` API.
-    """
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens. None if it is a decoding-only batch.
-    seq_lens: Optional[List[int]]
-    # seq_lens stored as a tensor.
-    seq_lens_tensor: Optional[torch.Tensor]
-
-    # NOTE(sang): Definition of context_len, query_len, and seq_len.
-    #   |---------- N-1 iteration --------|
-    #   |---------------- N iteration ---------------------|
-    #   |- tokenA -|......................|-- newTokens ---|
-    #   |---------- context_len ----------|
-    #   |-------------------- seq_len ---------------------|
-    #                                     |-- query_len ---|
-
-    # Maximum sequence length among prefill batch. 0 if there are decoding
-    # requests only.
-    max_prefill_seq_len: int
-    # Maximum sequence length among decode batch. 0 if there are prefill
-    # requests only.
-    max_decode_seq_len: int
-    # (batch_size,). A tensor of context lengths (tokens that are computed
-    # so far).
-    context_lens_tensor: Optional[torch.Tensor]
-
-    # (batch_size, max_blocks_per_seq).
-    # Block addresses per sequence. (Seq id -> list of physical blocks.)
-    # E.g., [0, 1, 2] means tokens are stored in the 0th, 1st, and 2nd blocks
-    # in the kv cache. Each block can contain up to block_size tokens.
-    # The 2nd dimension is padded up to max_blocks_per_seq if it is
-    # cuda-graph captured.
-    block_tables: Optional[torch.Tensor]
-
-    # Whether cuda graph is enabled.
-    # Cuda-graph is currently enabled for decoding only.
-    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
-    use_cuda_graph: bool
-
-    # Maximum query length in the batch.
-    max_query_len: Optional[int] = None
-
-    # Max number of query tokens among requests in the batch.
-    max_decode_query_len: Optional[int] = None
-
-    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
-    # the batch, used to index into subquery. E.g., if the subquery lengths
-    # are [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor] = None
-    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
-    # the batch, used to index into sequence. E.g., if the sequence lengths
-    # are [4, 6], it is [0, 4, 10].
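# A tiny runnable illustration (not from this file) of the cumulative start
# locations described above, plus the decode rebasing used by decode_metadata
# further down ("query_start_loc=[3,9] => [0,6]"):
from itertools import accumulate
seq_lens = [4, 6]
seq_start_loc = [0, *accumulate(seq_lens)]
assert seq_start_loc == [0, 4, 10]
query_start_loc = [0, 3, 9]          # [3 prefill tokens | 6 decode tokens]
decode_start_loc = [x - query_start_loc[1] for x in query_start_loc[1:]]
assert decode_start_loc == [0, 6]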
- seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - encoder_seq_start_loc: Optional[torch.Tensor] = None - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = FlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - self._cached_decode_metadata = FlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_decode_metadata - - -class FlashAttentionMetadataBuilder( - AttentionMetadataBuilder[FlashAttentionMetadata]): - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
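The compute_slot_mapping call in _add_seq_group above boils down to paged-KV address arithmetic; a rough standalone sketch (this is not the actual vLLM helper, and the block size and tables are made up):

BLOCK_SIZE = 16

def slots_for_range(block_table, start, end):
    """Map token positions [start, end) to flat cache slots: the block
    table turns the logical block (pos // BLOCK_SIZE) into a physical
    block, and the slot is physical_block * BLOCK_SIZE + in-block offset."""
    slots = []
    for pos in range(start, end):
        physical_block = block_table[pos // BLOCK_SIZE]
        slots.append(physical_block * BLOCK_SIZE + pos % BLOCK_SIZE)
    return slots

# Tokens 14..18 of a sequence whose logical blocks map to physical 7 and 3:
print(slots_for_range([7, 3], 14, 18))  # [126, 127, 48, 49]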
- """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return FlashAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class FlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. 
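The CUDA-graph branch of build() above pads everything out to fixed shapes so the captured graph can be replayed; a toy NumPy sketch of the block-table part (buffer sizes are made up):

import numpy as np

# Fixed-shape buffer captured with the graph: [max batch size, max blocks].
graph_block_tables = np.zeros((8, 4), dtype=np.int32)

def fill_graph_block_tables(block_tables):
    """Copy ragged per-sequence block tables into the captured buffer;
    short rows stay zero-padded, excess (unused lookahead) blocks are cut."""
    out = graph_block_tables[:len(block_tables)]
    for i, bt in enumerate(block_tables):
        n = min(len(bt), out.shape[1])
        out[i, :n] = bt[:n]
    return out

print(fill_graph_block_tables([[7, 3], [5]]))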
- - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "FLASH_ATTN backend.") - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version( - requires_alibi=self.alibi_slopes is not None) - if is_quantized_kv_cache(self.kv_cache_dtype) and ( - not self.kv_cache_dtype.startswith("fp8") - or not flash_attn_supports_fp8()): - raise NotImplementedError( - f"FlashAttention does not support {self.kv_cache_dtype} " - "kv-cache on this device " - f"(FA supports fp8 = {flash_attn_supports_fp8()}).") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - output: shape = [num_tokens, num_heads, head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size, num_kv_heads, head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - NOTE: It in-place updates the output tensor. - NOTE: FP8 quantization, flash-attn expect the size of - {q,k,v}_descale to be (num_sequences, num_kv_heads). - We use torch's .expand() to avoid duplicating values - """ - assert output is not None, "Output tensor must be provided." 
- - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashAttentionImpl") - - # NOTE(woosuk): FlashAttention2 does not support FP8 KV cache. - if not flash_attn_supports_fp8() or output.dtype != torch.bfloat16: - assert ( - layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0), ( - "key/v_scale is only supported in FlashAttention 3 with " - "base dtype bfloat16") - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - kv_cache_dtype: str = self.kv_cache_dtype - softmax_scale: float = self.scale - window_size = self.sliding_window - alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes - logits_soft_cap: Optional[float] = self.logits_soft_cap - fp8_attention = kv_cache_dtype.startswith("fp8") - - if fp8_attention and not flash_attn_supports_fp8(): - raise NotImplementedError( - "FlashAttention does not support FP8 kv-cache on this device.") - - if kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - # We skip updating the KV cache under two conditions: - # a. When the Attention Type is ENCODER. In this phase, we compute - # only the encoder attention without updating the cache. - # b. When both Key and Value are None. This occurs during - # cross-attention computation in the decoding phase, where the - # KV cache is already populated with the cross-attention - # tensor. Thus, we skip cache updates during this time. - if (attn_type != AttentionType.ENCODER) and (key is not None) and ( - value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), # type: ignore[union-attr] - kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if fp8_attention: - kv_cache = kv_cache.view(torch.float8_e4m3fn) - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) - - if fp8_attention: - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) - - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - decode_query = query[num_prefill_query_tokens:] - decode_output = output[num_prefill_query_tokens:] - # QKV for prefill. 
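For intuition, the reshape_and_cache_flash call above scatters each new K/V row into the paged cache by its flat slot id; roughly the following pure-Python sketch, which ignores dtype conversion and the kv scales:

import torch

block_size, num_blocks, num_kv_heads, head_size = 16, 4, 2, 8
key_cache = torch.zeros(num_blocks, block_size, num_kv_heads, head_size)
value_cache = torch.zeros_like(key_cache)

def write_kv(key, value, slot_mapping):
    """Scatter new K/V rows into the paged caches by flat slot id."""
    for row, slot in enumerate(slot_mapping.tolist()):
        if slot < 0:  # padded entry (PAD_SLOT_ID), nothing to write
            continue
        block, offset = divmod(slot, block_size)
        key_cache[block, offset] = key[row]
        value_cache[block, offset] = value[row]

write_kv(torch.randn(3, num_kv_heads, head_size),
         torch.randn(3, num_kv_heads, head_size),
         torch.tensor([5, 6, 33]))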
- query = query[:num_prefill_query_tokens] - prefill_output = output[:num_prefill_query_tokens] - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if (kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. - q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ - _get_query_key_seq_metadata(prefill_meta, True, attn_type) - - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - if fp8_attention: - num_kv_tokens, num_kv_heads, head_size = key.shape - - key, _ = ops.scaled_fp8_quant( - key.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._k_scale) - key = key.reshape((num_kv_tokens, num_kv_heads, head_size)) - - value, _ = ops.scaled_fp8_quant( - value.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._v_scale) - value = value.reshape( - (num_kv_tokens, num_kv_heads, head_size)) - - descale_shape = (q_seq_start_loc.shape[0] - 1, key.shape[1]) - flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=q_seq_start_loc, - cu_seqlens_k=k_seq_start_loc, - max_seqlen_q=q_seq_len, - max_seqlen_k=k_seq_len, - softmax_scale=softmax_scale, - causal=_get_causal_option(attn_type), - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # prefix-enabled attention - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support prefix caching") - assert prefill_meta.seq_lens is not None - assert prefill_meta.query_start_loc is not None - max_seq_len = max(prefill_meta.seq_lens) - descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( # noqa - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - seqused_k=prefill_meta.seq_lens_tensor, - max_seqlen_k=max_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - block_table=prefill_meta.block_tables, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - # Use flash_attn_varlen_func kernel for speculative decoding - # because different queries might have different lengths. 
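Semantically, the varlen kernels used here run independent causal attention per sequence of the flattened batch; a reference sketch with torch SDPA (it assumes equal query/key lengths per sequence, as in a plain prompt run):

import torch
import torch.nn.functional as F

def varlen_attention(q, k, v, cu_seqlens_q, cu_seqlens_k):
    """Reference semantics of a varlen kernel on [tokens, heads, dim]."""
    out = torch.empty_like(q)
    for i in range(len(cu_seqlens_q) - 1):
        qs, qe = cu_seqlens_q[i], cu_seqlens_q[i + 1]
        ks, ke = cu_seqlens_k[i], cu_seqlens_k[i + 1]
        o = F.scaled_dot_product_attention(  # wants [batch, heads, tokens, dim]
            q[qs:qe].transpose(0, 1).unsqueeze(0),
            k[ks:ke].transpose(0, 1).unsqueeze(0),
            v[ks:ke].transpose(0, 1).unsqueeze(0),
            is_causal=True,
        )
        out[qs:qe] = o.squeeze(0).transpose(0, 1)
    return out

q = torch.randn(10, 4, 32)  # two sequences (4 and 6 tokens), flattened
out = varlen_attention(q, q, q, [0, 4, 10], [0, 4, 10])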
- - assert decode_meta.max_decode_query_len is not None - # use only for actual varlen decoding - if decode_meta.max_decode_query_len > 1: - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support max_decode_query_len > 1" - ) - assert decode_meta.query_start_loc is not None - descale_shape = (decode_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( - q=decode_query, - k=key_cache, - v=value_cache, - cu_seqlens_q=decode_meta.query_start_loc, - max_seqlen_q=decode_meta.max_decode_query_len, - seqused_k=decode_meta.seq_lens_tensor, - max_seqlen_k=decode_meta.max_decode_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - block_table=decode_meta.block_tables, - out=decode_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # Use flash_attn_with_kvcache for normal decoding. - ( - seq_lens_arg, - _, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - descale_shape = (seq_lens_arg.shape[0], key_cache.shape[-2]) - flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=key_cache, - v_cache=value_cache, - block_table=block_tables_arg, - cache_seqlens=seq_lens_arg, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=decode_output.unsqueeze(1), - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - return output - - -def _get_query_key_seq_metadata( - attn_metadata: FlashAttentionMetadata, - is_prompt: bool, - attn_type: str, -) -> tuple: - """ - Returns sequence metadata for key and query based on the specified - attention type and whether input is a prompt. - - This function computes the starting locations and maximum sequence lengths - for key and query sequences for different attention types. - - Args: - attn_metadata: The attention metadata object - is_prompt (bool): A flag indicating if the input is a prompt - attn_type (AttentionType): The type of attention being used. - - Returns: - tuple: A tuple containing four integers: - - Starting location for the query sequence. - - Maximum sequence length for the query sequence. - - Starting location for the key sequence. - - Maximum sequence length for the key sequence. - - Raises: - AttributeError: If an invalid attention type is provided. - """ - if attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.seq_start_loc, max_seq_len) - - elif attn_type == AttentionType.ENCODER_DECODER: - # This is cross attention between the where the key - # is the precomputed encoder attention and query - # is the input sequence. - # Choose query max length based on whether it is prompt - # or not. 
- if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER: - # For encoder attention both the query and the key are same i.e. the - # encoder sequence. - return (attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER_ONLY: - assert is_prompt, "Should not have decode for encoder only model." - return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len, - attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _get_causal_option(attn_type: str) -> bool: - """ - Determine whether the given attention type is suitable for causal - attention mechanisms. - - Args: - attn_type (AttentionType): The type of attention being evaluated - - Returns: - bool: Returns `True` if the attention type is suitable for causal - attention (i.e., not encoder, encoder-only, or encoder-decoder), - otherwise returns `False`. - """ - return not (attn_type == AttentionType.ENCODER - or attn_type == AttentionType.ENCODER_ONLY - or attn_type == AttentionType.ENCODER_DECODER) diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py deleted file mode 100644 index aeaa0ab631cfb..0000000000000 --- a/vllm/attention/backends/flashmla.py +++ /dev/null @@ -1,227 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from contextlib import contextmanager -from dataclasses import dataclass -from typing import List, Optional, Tuple, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata, - MLACommonMetadataBuilder, - MLACommonState) -from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, - get_mla_metadata, - is_flashmla_supported) - - -class FlashMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "FLASHMLA" - - @staticmethod - def get_impl_cls() -> Type["FlashMLAImpl"]: - return FlashMLAImpl - - @staticmethod - def get_metadata_cls() -> Type["FlashMLAMetadata"]: - return FlashMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: - return FlashMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["FlashMLAState"]: - return FlashMLAState - - -@dataclass -class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, - torch.Tensor]] = None - decode_num_splits: Optional[torch.Tensor] = None - - @property - def decode_metadata(self): - decode_metadata = super().decode_metadata - # TODO: cache assignment? 
- if decode_metadata is not None: - decode_metadata.decode_tile_scheduler_metadata=\ - self.decode_tile_scheduler_metadata - decode_metadata.decode_num_splits=\ - self.decode_num_splits - return decode_metadata - - -class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.num_q_heads = self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - m = super().build(seq_lens, query_lens, cuda_graph_pad_size, - batch_size) - - if m.num_decode_tokens > 0: - m.decode_tile_scheduler_metadata, m.decode_num_splits = \ - get_mla_metadata( - m.seq_lens_tensor[m.num_prefills:], - self.num_q_heads, - 1, # MQA for the decode path - ) - - return m - - -class FlashMLAState(MLACommonState[FlashMLAMetadata]): - - def __init__(self, *args, **kwds): - super().__init__(*args, **kwds) - - self.num_q_heads = self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config) - - @contextmanager - def graph_capture(self, max_batch_size: int): - # Run a dummy `get_mla_metadata` so we can get the right shapes - self._graph_decoder_tile_scheduler_metadata, \ - self._graph_decode_num_splits = get_mla_metadata( - torch.ones( - max_batch_size, dtype=torch.int32, device=self.runner.device), - self.num_q_heads, - 1, # MQA for the decode path - ) - - with super().graph_capture(max_batch_size): - yield - - del self._graph_decoder_tile_scheduler_metadata - del self._graph_decode_num_splits - - def graph_capture_get_metadata_for_batch( - self, batch_size: int, is_encoder_decoder_model: bool = False): - metadata = super().graph_capture_get_metadata_for_batch( - batch_size, is_encoder_decoder_model) - assert metadata.num_decode_tokens > 0 - - decoder_tile_scheduler_metadata, decode_num_splits = get_mla_metadata( - self._graph_seq_lens[:batch_size], - self.num_q_heads, - 1, # MQA for the decode path - ) - - self._graph_decoder_tile_scheduler_metadata.copy_( - decoder_tile_scheduler_metadata) - self._graph_decode_num_splits[:batch_size + 1].copy_(decode_num_splits) - - metadata.decode_tile_scheduler_metadata=\ - self._graph_decoder_tile_scheduler_metadata - metadata.decode_num_splits=\ - self._graph_decode_num_splits[:batch_size + 1] - - return metadata - - def get_graph_input_buffers(self, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers = super().get_graph_input_buffers( - attn_metadata, is_encoder_decoder_model) - input_buffers["decode_tile_scheduler_metadata"] = \ - attn_metadata.decode_metadata.decode_tile_scheduler_metadata - input_buffers["decode_num_splits"] = \ - attn_metadata.decode_metadata.decode_num_splits - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata, - is_encoder_decoder_model: bool = False): - super().prepare_graph_input_buffers(input_buffers, attn_metadata, - is_encoder_decoder_model) - - input_buffers["decode_tile_scheduler_metadata"].copy_( - attn_metadata.decode_metadata.decode_tile_scheduler_metadata) - input_buffers["decode_num_splits"].copy_( - attn_metadata.decode_metadata.decode_num_splits) - - -class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - 
attn_type: str, - kv_sharing_target_layer_name: Optional[str] = None, - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - is_supported, reason = is_flashmla_supported() - assert is_supported, reason - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "FlashMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashMLAImpl") - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "FlashMLA with FP8 KV cache not yet supported") - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: FlashMLAMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - - q = torch.cat([q_nope, q_pe], dim=-1)\ - .unsqueeze(1) # Add seqlen dim of 1 (decode) - - o, _ = flash_mla_with_kvcache( - q=q, - k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1 - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - head_dim_v=self.kv_lora_rank, - tile_scheduler_metadata=decode_meta.decode_tile_scheduler_metadata, - num_splits=decode_meta.decode_num_splits, - softmax_scale=self.scale, - causal=True, - ) - - return self._v_up_proj(o) diff --git a/vllm/attention/backends/mla/__init__.py b/vllm/attention/backends/mla/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py deleted file mode 100644 index 826b63e1ccda8..0000000000000 --- a/vllm/attention/backends/mla/common.py +++ /dev/null @@ -1,1305 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -# MLA Common Components - -This file implements common components for MLA implementations. - -First we define: - -Sq as Q sequence length -Skv as KV sequence length - -MLA has two possible ways of computing, a data-movement friendly approach and a -compute friendly approach, we generally want to use the compute friendly -approach for "prefill" (i.e. the ratio Sq / Skv is "small", is near 1) -and the data-movement friendly approach for "decode" (i.e. the ratio -Sq / Skv is "large"). - -NOTE what we deem small and large is currently determined by if its labelled -prefill or decode by the scheduler, but this is something we should probably -tune. - -Main reference: DeepseekV2 paper, and FlashInfer Implementation -(https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). - -Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. -* For decode (i.e. the memory friendly approach) the attention "simulates" a -multi-head attention, while the compute is similar to multi-query attention. 
-
-Below is an example of both paths, assuming batch size = 1
-
-## Additional Definitions:
-
-C       Context length, `Skv - Sq`
-H       hidden size
-N       number of attention heads
-Lq      latent dimension for Q              1536 in DSV3
-Lkv     latent dimension for K/V            512 in DSV3
-P       nope dimension, no rope.            128 in DSV3
-R       rope dimension, goes through rope.  64 in DSV3
-V       V head dim.                         128 in DSV3
-
-## Vector/Matrix Definitions
-
-h_t         hidden states (input to attention)  shape [Sq, H]
-q_c         latent/compressed Q                 shape [Sq, Lq]
-q_nope      uncompressed Q (no-rope)            shape [Sq, N, P]
-q_pe        uncompressed Q (rope)               shape [Sq, N, R]
-kv_c        latent/compressed KV                shape [Skv, Lkv]
-k_pe        decoupled k position embeddings     shape [Skv, R]
-new_kv_c    new kv_c from current iter          shape [Sq, Lkv]
-new_k_pe    new k_pe from current iter          shape [Sq, R]
-cache_kv_c  cached kv_c from previous iters     shape [C, Lkv]
-cache_k_pe  cached k_pe from previous iters     shape [C, R]
-W_DQ        project h_t to q_c                  shape [H, Lq]
-W_UQ        project q_c to q_nope               shape [Lq, N * P]
-W_QR        project q_c to q_pe                 shape [Lq, N * R]
-W_DKV       project h_t to kv_c                 shape [H, Lkv]
-W_UK        project kv_c to k_nope              shape [Lkv, N, P]
-W_KR        project h_t to k_pe                 shape [H, R]
-W_UV        project kv_c to v                   shape [Lkv, N, V]
-W_O         project v to h_t                    shape [N * V, H]
-
-
-## Compute Friendly Approach (i.e. "_forward_prefill"):
-
-q_c      = h_t @ W_DQ
-q_nope   = (q_c @ W_UQ).view(Sq, N, P)
-q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c = h_t @ W_DKV
-new_k_pe = RoPE(h_t @ W_KR)
-kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
-k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
-k_nope   = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P)
-v        = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V)
-
-// MHA with QK headdim = P + R
-//  V headdim = V
-//  spda_o shape [Sq, N, V]
-spda_o = scaled_dot_product_attention(
-    torch.cat([q_nope, q_pe], dim=-1),
-    torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
-    v
-)
-return spda_o @ W_O
-
-NOTE: in the actual code,
-  `kv_b_proj` is [W_UK; W_UV] concatenated per head
-  `q_b_proj` is [W_UQ; W_QR] concatenated per head
-  `out_proj` is W_O
-
-
-## Data-Movement Friendly Approach (i.e. "_forward_decode"):
-
-Runtime
-q_c      = h_t @ W_DQ
-q_nope   = (q_c @ W_UQ).view(-1, N, P)
-ql_nope  = einsum("snh,lnh->snl", q_nope, W_UK)
-q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c = h_t @ W_DKV
-new_k_pe = RoPE(h_t @ W_KR)
-kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
-k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
-
-// MQA with QK headdim = Lkv + R
-//  V headdim = Lkv
-//  spda_o shape [Sq, N, Lkv]
-//  NOTE: this is less compute-friendly since Lkv > P
-//        but is more data-movement friendly since it's MQA vs MHA
-spda_o = scaled_dot_product_attention(
-    torch.cat([ql_nope, q_pe], dim=-1),
-    torch.cat([kv_c, k_pe], dim=-1),
-    kv_c
-)
-
-o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV)
-return o.view(-1, N * V) @ W_O
-
-
-## Chunked Prefill
-
-For chunked prefill we want to use the compute-friendly algorithm. We assume a
-sufficiently large Sq / Skv ratio; in the future we may want to switch to the
-data-movement friendly approach if the chunk (i.e. `Sq`) is small.
-
-However, the compute-friendly approach can potentially run out of memory if Skv
-is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)`
-
-To mitigate this, we chunk the computation of attention with respect to the
-current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can use a
-fixed workspace size.
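A quick numeric check of the W_UK absorption used in the decode path above: folding W_UK into the query gives exactly the same attention logits as up-projecting the latents first (PyTorch, toy shapes, no cache split):

import torch

S, N, P, Lkv = 3, 4, 16, 32
q_nope = torch.randn(S, N, P)
kv_c = torch.randn(S, Lkv)      # latent KV entries
W_UK = torch.randn(Lkv, N, P)

# Compute-friendly: up-project the latents, then dot with q_nope (MHA).
k_nope = torch.einsum("tl,lnp->tnp", kv_c, W_UK)
scores_mha = torch.einsum("snp,tnp->nst", q_nope, k_nope)

# Data-movement friendly: absorb W_UK into the query instead (MQA).
ql_nope = torch.einsum("snp,lnp->snl", q_nope, W_UK)
scores_mqa = torch.einsum("snl,tl->nst", ql_nope, kv_c)

assert torch.allclose(scores_mha, scores_mqa, atol=1e-4)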
-
-The chunked prefill approach is as follows:
-
-MCC  Max chunk of context to process per iter, computed dynamically,
-     used to bound the memory usage
-
-q_c        = h_t @ W_DQ
-q_nope     = (q_c @ W_UQ).view(Sq, N, P)
-q_pe       = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c   = h_t @ W_DKV
-new_k_pe   = RoPE(h_t @ W_KR)
-new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P)
-new_v      = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V)
-
-// MHA between queries and new KV
-//  with QK headdim = P + R
-//  V headdim = V
-//  curr_o shape [Sq, N, V]
-//  curr_lse shape [N, Sq], this is just the order FA returns
-curr_o, curr_lse = scaled_dot_product_attention(
-    torch.cat([q_nope, q_pe], dim=-1),
-    torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
-    new_v,
-    causal=True,
-    return_softmax_lse=True
-)
-
-// Compute attention with the already existing context
-for chunk_idx in range(cdiv(C, MCC)):
-    chunk_start = chunk_idx * MCC
-    chunk_end = min(chunk_start + MCC, C)
-    Sc = chunk_end - chunk_start
-    cache_kv_c_chunk   = cache_kv_c[chunk_start:chunk_end]
-    cache_k_pe_chunk   = cache_k_pe[chunk_start:chunk_end]
-    cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P)
-    cache_v_chunk      = (cache_kv_c_chunk @ W_UV).view(-1, N, V)
-
-    chunk_o, chunk_lse = scaled_dot_product_attention(
-        torch.cat([q_nope, q_pe], dim=-1),
-        torch.cat([cache_k_nope_chunk,
-                   cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)],
-                  dim=-1),
-        cache_v_chunk,
-        causal=False,
-        return_softmax_lse=True
-    )
-
-    curr_o, curr_lse = merge_attn_states(
-        suffix_output=curr_o,
-        suffix_lse=curr_lse,
-        prefix_output=chunk_o,
-        prefix_lse=chunk_lse,
-    )
-
-return curr_o @ W_O
-"""
-
-import functools
-from abc import abstractmethod
-from collections import defaultdict
-from contextlib import contextmanager
-from dataclasses import dataclass
-from itertools import accumulate
-from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar
-
-import torch
-
-from vllm import _custom_ops as ops
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
-                                              AttentionMetadataBuilder,
-                                              AttentionState, MLAAttentionImpl)
-from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
-                                           compute_slot_mapping_start_idx,
-                                           is_block_tables_empty)
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import get_flash_attn_version
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase,
-                                               UnquantizedLinearMethod)
-from vllm.multimodal import MultiModalPlaceholderMap
-from vllm.platforms import current_platform
-from vllm.triton_utils import HAS_TRITON
-from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
-
-if HAS_TRITON:
-    from vllm.attention.ops.triton_flash_attention import triton_attention
-else:
-    triton_attention = None
-
-try:
-    from vllm.vllm_flash_attn import flash_attn_varlen_func
-    is_vllm_fa = True
-except ImportError:
-    is_vllm_fa = False
-    try:
-        # For rocm use upstream flash attention
-        from flash_attn import flash_attn_varlen_func
-    except ImportError:
-        flash_attn_varlen_func = None
-
-is_hip = current_platform.is_rocm()
-
-
-class MLACommonBackend(AttentionBackend):
-
-    @staticmethod
-    def get_name() -> str:
-        return "TRITON_MLA"
-
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return MLACommonMetadata
-
-    @staticmethod
-    def get_builder_cls() -> Type["MLACommonMetadataBuilder"]:
-        return MLACommonMetadataBuilder
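The merge_attn_states step in the chunked-prefill pseudocode above works because softmax attention over a concatenation of keys can be rebuilt from per-chunk partial outputs and their log-sum-exps; a single-query sketch of that identity:

import torch

logits = torch.randn(10)   # one query's scores over 10 keys
v = torch.randn(10, 4)     # matching value rows

def partial_attn(x, vals):
    """Attention output over one chunk plus its log-sum-exp."""
    lse = torch.logsumexp(x, dim=0)
    return (torch.softmax(x, dim=0).unsqueeze(1) * vals).sum(0), lse

o_pre, lse_pre = partial_attn(logits[:6], v[:6])   # "prefix" (old context)
o_suf, lse_suf = partial_attn(logits[6:], v[6:])   # "suffix" (new tokens)

# Merge: reweight each partial output by its share of the total softmax mass.
lse = torch.logaddexp(lse_pre, lse_suf)
merged = torch.exp(lse_pre - lse) * o_pre + torch.exp(lse_suf - lse) * o_suf

full, _ = partial_attn(logits, v)
assert torch.allclose(merged, full, atol=1e-5)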
-
-    @staticmethod
-    def get_state_cls() -> Type["MLACommonState"]:
-        return MLACommonState
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,  # assumed to be 1 for MLA
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (num_blocks, block_size, head_size)
-
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        ops.copy_blocks_mla(kv_caches, src_to_dists)
-
-    @staticmethod
-    def get_supported_head_sizes() -> List[int]:
-        return [576]
-
-
-T = TypeVar("T", bound="MLACommonMetadata")
-
-
-class MLACommonState(AttentionState, Generic[T]):
-
-    def __init__(self, runner):
-        self.runner = runner
-        self._is_graph_capturing = False
-
-        scheduler_config = runner.scheduler_config
-        self.model_config = runner.model_config
-        cache_config = runner.cache_config
-
-        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
-        self.enable_prefix_caching = cache_config.enable_prefix_caching
-
-        if self.chunked_prefill_enabled or self.enable_prefix_caching:
-            self.context_chunk_workspace_size = min(
-                # Make sure there is enough for 8 full-length requests or at
-                # least 4 pages of cache per request
-                max(
-                    8 * self.model_config.max_model_len, 4 *
-                    scheduler_config.max_num_seqs * cache_config.block_size),
-                # For long-context models, avoid over-allocating kv-cache
-                # space by limiting the workspace to 64k tokens,
-                # which would result in the workspace being:
-                #   2*(576)*(64*1024) = 72mb
-                #   (assuming 576 MLA head dim, and fp16)
-                # which would result in up-projected context being
-                #   2*(192*128)*(64*1024) = 3gb
-                #   (assuming 192 QK head dim, 128 heads, and fp16)
-                128 * 1024)
-            assert self.context_chunk_workspace_size >= \
-                scheduler_config.max_num_seqs * cache_config.block_size
-
-    @contextmanager
-    def graph_capture(self, max_batch_size: int):
-        self._is_graph_capturing = True
-
-        self._graph_slot_mapping = torch.full((max_batch_size, ),
-                                              PAD_SLOT_ID,
-                                              dtype=torch.long,
-                                              device=self.runner.device)
-        self._graph_seq_lens = torch.ones(max_batch_size,
-                                          dtype=torch.int32,
-                                          device=self.runner.device)
-        self._graph_block_tables = torch.from_numpy(
-            self.runner.graph_block_tables).to(device=self.runner.device)
-
-        self._positions = torch.zeros((max_batch_size, ),
-                                      dtype=torch.long,
-                                      device=self.runner.device)
-
-        yield
-
-        self._is_graph_capturing = False
-        del self._graph_slot_mapping
-        del self._graph_seq_lens
-        del self._graph_block_tables
-        del self._positions
-
-    def graph_clone(self, batch_size: int):
-        assert self._is_graph_capturing
-        return self.__class__(self.runner)
-
-    def graph_capture_get_metadata_for_batch(
-            self,
-            batch_size: int,
-            is_encoder_decoder_model: bool = False) -> T:
-        assert self._is_graph_capturing
-
-        attn_metadata = self.runner.attn_backend.make_metadata(
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=False,
-            use_cuda_graph=True,
-            num_prefills=0,
-            num_prefill_tokens=0,
-            num_decode_tokens=batch_size,
-            slot_mapping=self._graph_slot_mapping[:batch_size],
-            seq_lens=None,
-            seq_lens_tensor=self._graph_seq_lens[:batch_size],
-            max_query_len=1,
-            max_decode_query_len=1,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=self.runner.max_seq_len_to_capture,
-            query_start_loc=None,
-            seq_start_loc=None,
-            context_lens_tensor=None,
-
block_tables=self._graph_block_tables[:batch_size], - head_dim=self.runner.model_config.get_head_size()) - - if is_encoder_decoder_model: - raise NotImplementedError( - "MLACommonState does not support encoder/decoder yet") - - return attn_metadata - - def get_graph_input_buffers(self, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers = { - "slot_mapping": attn_metadata.slot_mapping, - "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, - "block_tables": attn_metadata.decode_metadata.block_tables, - } - if is_encoder_decoder_model: - raise NotImplementedError( - "MLACommonState does not support encoder/decoder yet") - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata, - is_encoder_decoder_model: bool = False): - input_buffers["seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) - input_buffers["block_tables"].copy_( - attn_metadata.decode_metadata.block_tables, non_blocking=True) - if is_encoder_decoder_model: - raise NotImplementedError( - "TritonMLAState does not support encoder/decoder yet") - - def begin_forward(self, model_input): - if self.chunked_prefill_enabled or self.enable_prefix_caching: - if not hasattr(self, "context_chunk_workspace"): - # not self.runner.device does not return the correct device - # for this process, (init_device sets the correct device but - # only on the Worker). The only way Ive figured out to get the - # correct device is to allocate the workspace on the first call - # to begin_forward and use the device of the input tokens - assert model_input.input_tokens is not None - self.context_chunk_workspace = torch.empty( - (self.context_chunk_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=model_input.input_tokens.device, - ) - - model_input.attn_metadata.context_chunk_workspace = \ - self.context_chunk_workspace - - -@dataclass -class MLACommonMetadata(AttentionMetadata): - """Metadata for MLACommon. - - NOTE: Please read the comment at the top of the file before trying to - understand this class - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). 
- context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Maximum query length in the batch. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional[Any] = None - _cached_decode_metadata: Optional[Any] = None - - num_prefill_tokens: int - - # The dimension of the attention heads - head_dim: Optional[int] = None - - # Used when chunked prefill is enabled to simulate worst case workspace - # allocations, hopefully to avoid going OOM - is_profile_run: bool = False - - # New for MLA (compared to FlashAttention) - # For chunked prefill - context_chunk_cu_seq_lens: Optional[torch.Tensor] = None - context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[List[int]] = None - context_chunk_max_seq_lens: Optional[List[int]] = None - # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted - context_chunk_workspace: Optional[torch.Tensor] = None - - def __post_init__(self): - supported_head_sizes = MLACommonBackend.get_supported_head_sizes() - if self.head_dim is not None and self.head_dim \ - not in supported_head_sizes: - raise ValueError( - f"Only {supported_head_sizes} are supported for head_dim,", - f" received {self.head_dim}.") - - @property - def prefill_metadata(self): - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = self.__class__( - # Required by ModelRunner - use_cuda_graph=False, # Not Attention Related - # Required by Attention Metadata - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - # Required by Attention Metadata (not used) - 
multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - head_dim=self.head_dim, - is_profile_run=self.is_profile_run, - # MLACommonMetadata Chunk prefill specific - context_chunk_cu_seq_lens=self.context_chunk_cu_seq_lens, - context_chunk_starts=self.context_chunk_starts, - context_chunk_seq_tot=self.context_chunk_seq_tot, - context_chunk_max_seq_lens=self.context_chunk_max_seq_lens, - ) - return self._cached_prefill_metadata - - @property - def decode_metadata(self): - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.seq_lens_tensor is not None - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - self._cached_decode_metadata = self.__class__( - # Required by ModelRunner - use_cuda_graph=self.use_cuda_graph, # Not Attention Related - # Required by Attention Metadata - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - # Required by Attention Metadata (not used) - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. 
- query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - head_dim=self.head_dim, - is_profile_run=self.is_profile_run) - return self._cached_decode_metadata - - -class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): - """ - NOTE: Please read the comment at the top of the file before trying to - understand this class - """ - BLOCK_TABLE_EXTENDER: list[list[int]] = [] - - def __init__(self, input_builder): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - self.chunked_prefill_enabled = \ - self.runner.scheduler_config.chunked_prefill_enabled - self.enable_prefix_caching = \ - self.runner.cache_config.enable_prefix_caching - - if self.chunked_prefill_enabled or self.enable_prefix_caching: - attn_state = self.input_builder.runner.attn_state - self.context_chunk_workspace_size = \ - attn_state.context_chunk_workspace_size - self.page_size = self.runner.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. - """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. 
- is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. - """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend(self.__class__.BLOCK_TABLE_EXTENDER * - cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, 
self.runner.pin_memory) - - context_chunk_cu_seq_lens = None - context_chunk_starts = None - context_chunk_seq_tot = None - context_chunk_max_seq_lens = None - - if (self.chunked_prefill_enabled or self.enable_prefix_caching) \ - and self.num_prefills > 0 \ - and context_lens_tensor is not None \ - and context_lens_tensor[:self.num_prefills].max() > 0: - - # NOTE: it is recommended you read the `Chunked Prefill` section in - # the comment at the top of the file before trying to understand - # the following code - - num_prefills_with_context = \ - (context_lens_tensor[:self.num_prefills] > 0).sum().item() - - # currently we allocate an equal amount of workspace for each - # prefill in the batch, we could probably use a more advanced - # algorithm here and allocate more workspace to prefills with - # longer context lengths - max_context_chunk = \ - self.context_chunk_workspace_size // num_prefills_with_context - - # align max_context_chunk to page_size by rounding down, - # currently the `gather_and_maybe_dequant_cache` kernel cannot - # handle `context_chunk_starts` that are not aligned to page_size - max_context_chunk = round_down(max_context_chunk, self.page_size) - assert max_context_chunk > 0 - num_chunks = cdiv(context_lens_tensor.max(), max_context_chunk) - - # if `max_context_chunk = 256`, `num_chunks = 3`, and - # `num_prefills_with_context = 4`, create a tensor that looks like - # [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]] - context_chunk_starts = \ - torch.arange(num_chunks, device=device, dtype=torch.int32)\ - .unsqueeze(1).expand(-1, self.num_prefills)\ - * max_context_chunk - chunk_ends = torch.min(context_lens_tensor[:self.num_prefills]\ - .unsqueeze(0), context_chunk_starts + max_context_chunk) - chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0) - _context_chunk_cu_seq_lens = chunk_seq_lens.cumsum(dim=1).to( - torch.int32) - zero = torch.zeros(num_chunks, dtype=torch.int32, device=device)\ - .unsqueeze(-1) - context_chunk_cu_seq_lens = \ - torch.cat([zero, _context_chunk_cu_seq_lens], dim=1) - context_chunk_max_seq_lens = \ - chunk_seq_lens.max(dim=1).values.tolist() - context_chunk_seq_tot = chunk_seq_lens.sum(dim=1).tolist() - assert max(context_chunk_seq_tot) <= \ - self.context_chunk_workspace_size - - return self.runner.attn_backend.make_metadata( - # Required by ModelRunner - use_cuda_graph=use_captured_graph, # Not Attention Related - # Required by Attention Metadata - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - # Required by Attention Metadata (not used) - multi_modal_placeholder_index_maps=None, # Not Attention Related - enable_kv_scales_calculation=False, - # MLACommonMetadata - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - head_dim=self.runner.model_config.get_head_size(), - is_profile_run=self.runner.in_profile_run, - # MLACommonMetadata Chunk prefill specific - context_chunk_cu_seq_lens=context_chunk_cu_seq_lens, - context_chunk_starts=context_chunk_starts, - context_chunk_seq_tot=context_chunk_seq_tot, - context_chunk_max_seq_lens=context_chunk_max_seq_lens, - ) - - -class 
MLACommonImpl(MLAAttentionImpl[T], Generic[T]): - """ - NOTE: Please read the comment at the top of the file before trying to - understand this class - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - q_lora_rank: Optional[int], - kv_lora_rank: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - qk_head_dim: int, - v_head_dim: int, - kv_b_proj: ColumnParallelLinear, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing not supported in V0.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - self.kv_cache_dtype = kv_cache_dtype - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_head_dim - self.v_head_dim = v_head_dim - self.kv_b_proj = kv_b_proj - - self.triton_fa_func = triton_attention - # Handle the differences between the flash_attn_varlen from flash_attn - # and the one from vllm_flash_attn. The former is used on RoCM and the - # latter has an additional parameter to control FA2 vs FA3 - self.flash_attn_varlen_func = flash_attn_varlen_func - self.vllm_flash_attn_version = get_flash_attn_version() - if self.vllm_flash_attn_version is not None: - self.flash_attn_varlen_func = \ - functools.partial(flash_attn_varlen_func, - fa_version=self.vllm_flash_attn_version) - - # For MLA the v head dim is smaller than qk head dim so we pad out - # v with 0s to match the qk head dim for attention backends that do - # not support different headdims - # We don't need to pad V if we are on a hopper system with FA3 - self._pad_v = self.vllm_flash_attn_version is None or not ( - self.vllm_flash_attn_version == 3 - and current_platform.get_device_capability()[0] == 9) - - def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale, - return_softmax_lse, **kwargs): - maybe_padded_v = v - if self._pad_v: - maybe_padded_v = torch.nn.functional.pad( - v, [0, q.shape[-1] - v.shape[-1]], value=0) - - if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN \ - and not return_softmax_lse: - attn_out = self.triton_fa_func( - q, - k, - maybe_padded_v, - None, # output - kwargs["cu_seqlens_q"], - kwargs["cu_seqlens_k"], - kwargs["max_seqlen_q"], - kwargs["max_seqlen_k"], - kwargs["causal"], - softmax_scale, - None, # bias - ) - elif is_vllm_fa: - attn_out = self.flash_attn_varlen_func( - q=q, - k=k, - v=maybe_padded_v, - return_softmax_lse=return_softmax_lse, - softmax_scale=softmax_scale, - **kwargs, - ) - else: - # Use return_attn_probs instead of return_softmax_lse for RoCM - attn_out = self.flash_attn_varlen_func( - q=q, - k=k, - v=maybe_padded_v, - return_attn_probs=return_softmax_lse, - softmax_scale=softmax_scale, - **kwargs, - ) - - # Unpack the output if there is multiple results, - # triton always returns (output, softmax_lse), - # vllm_flash_attn returns (output, softmax_lse) when - # `return_softmax_lse = True` - # flash_attn (RoCM) returns (output, softmax_lse, ...) 
when - # `return_attn_probs = True` - rest = None - if isinstance(attn_out, tuple): - attn_out, *rest = attn_out - - # Remain consistent with old `flash_attn_varlen_func` where there - # is only one output tensor if `return_softmax_lse` is False. - if return_softmax_lse: - assert rest is not None - return attn_out, rest[0] - return attn_out - - def _v_up_proj(self, x): - # Convert from (B, N, L) to (N, B, L) - x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) - # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) - # Convert from (N, B, V) to (B, N * V) - return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - - def get_layer_weight(layer): - WEIGHT_NAMES = ("weight", "qweight", "weight_packed") - for attr in WEIGHT_NAMES: - if hasattr(layer, attr): - return getattr(layer, attr) - raise AttributeError( - f"Layer '{layer}' has no recognized weight attribute:" - f" {WEIGHT_NAMES}.") - - def get_and_maybe_dequant_weights(layer: LinearBase): - if not isinstance(layer.quant_method, UnquantizedLinearMethod): - # NOTE: This should only be used offline, since it's O(N^3) - eye = torch.eye(layer.input_size_per_partition, - dtype=act_dtype, - device=get_layer_weight(layer).device) - dequant_weights = layer.quant_method.apply(layer, - eye, - bias=None) - del eye - # standardize to (output, input) - return dequant_weights.T - return layer.weight - - # we currently do not have quantized bmm's which are needed for - # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform - # the bmm's in 16-bit, the extra memory overhead of this is fairly low - kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T - assert kv_b_proj_weight.shape == ( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( - f"{kv_b_proj_weight.shape=}, " - f"{self.kv_lora_rank=}, " - f"{self.num_heads=}, " - f"{self.qk_nope_head_dim=}, " - f"{self.v_head_dim=}") - kv_b_proj_weight = kv_b_proj_weight.view( - self.kv_lora_rank, - self.num_heads, - self.qk_nope_head_dim + self.v_head_dim, - ) - - W_UK, W_UV = kv_b_proj_weight.split( - [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - # Convert from (L, N, V) to (N, L, V) - self.W_UV = W_UV.transpose(0, 1) - # Convert from (L, N, P) to (N, P, L) - self.W_UK_T = W_UK.permute(1, 2, 0) - - def _compute_prefill_context( - self, - q: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - k_scale: torch.Tensor, - ): - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - assert prefill_metadata.context_chunk_seq_tot is not None - assert prefill_metadata.context_chunk_cu_seq_lens is not None - assert prefill_metadata.context_chunk_starts is not None - assert prefill_metadata.context_chunk_max_seq_lens is not None - assert prefill_metadata.context_lens_tensor is not None - - output = None - iters = len(prefill_metadata.context_chunk_seq_tot) - - # Fetch from attn_metadata directly, since it late bound by - # MLAAttentionState, grabbing it directly `attn_metadata` can avoid - # any weirdness around prefill_metadata caching - assert attn_metadata.context_chunk_workspace is not None - workspace = attn_metadata.context_chunk_workspace - - for i in range(iters): - toks = prefill_metadata.context_chunk_seq_tot[i] - - ops.gather_and_maybe_dequant_cache( - src_cache=kv_c_and_k_pe_cache, - dst=workspace, - block_table=prefill_metadata.block_tables, - 
cu_seq_lens=prefill_metadata.context_chunk_cu_seq_lens[i], - batch_size=prefill_metadata.num_prefills, - kv_cache_dtype=self.kv_cache_dtype, - scale=k_scale, - seq_starts=prefill_metadata.context_chunk_starts[i], - ) - - kv_c_normed = workspace[:toks]\ - [..., :self.kv_lora_rank] - k_pe = workspace[:toks]\ - [..., self.kv_lora_rank:].unsqueeze(1) - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), - dim=-1) - - attn_output, attn_softmax_lse = \ - self._flash_attn_varlen_diff_headdims( - q=q, - k=k, - v=v, - cu_seqlens_q=prefill_metadata.query_start_loc, - cu_seqlens_k=prefill_metadata.context_chunk_cu_seq_lens[i], - max_seqlen_q=prefill_metadata.max_query_len, - max_seqlen_k=prefill_metadata.context_chunk_max_seq_lens[i], - softmax_scale=self.scale, - causal=False, # Context is unmasked - return_softmax_lse=True, - ) - - if output is None: - output = attn_output - output_lse = attn_softmax_lse - else: - output_tmp = torch.empty_like(output) - output_lse_tmp = torch.empty_like(output_lse) - merge_attn_states( - output=output_tmp, - output_lse=output_lse_tmp, - prefix_output=output, - prefix_lse=output_lse, - suffix_output=attn_output, - suffix_lse=attn_softmax_lse, - ) - output = output_tmp - output_lse = output_lse_tmp - - return output, output_lse - - def _forward_prefill( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - k_scale: torch.Tensor, - ) -> torch.Tensor: - - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - - has_context = prefill_metadata.context_lens_tensor is not None \ - and prefill_metadata.context_lens_tensor.max() > 0 - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) - - output = self._flash_attn_varlen_diff_headdims( - q=q, - k=k, - v=v, - cu_seqlens_q=prefill_metadata.query_start_loc, - cu_seqlens_k=prefill_metadata.query_start_loc, - max_seqlen_q=prefill_metadata.max_prefill_seq_len, - max_seqlen_k=prefill_metadata.max_prefill_seq_len, - softmax_scale=self.scale, - causal=True, - return_softmax_lse=has_context, - ) - - if has_context: - # ROCm flash_attn_varlen_func will return 3 objects instead of 2 - suffix_output, suffix_lse = output - context_output, context_lse = self._compute_prefill_context( \ - q, kv_c_and_k_pe_cache, attn_metadata, k_scale) - - output = torch.empty_like(suffix_output) - merge_attn_states( - output=output, - prefix_output=context_output, - prefix_lse=context_lse, - suffix_output=suffix_output, - suffix_lse=suffix_lse, - ) - - # unpad if necessary - if self._pad_v: - output = output[..., :v.shape[-1]] - - return output.flatten(start_dim=-2) - - @abstractmethod - def _forward_decode( - self, - ql_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: T, - ) -> torch.Tensor: - raise NotImplementedError - - def forward( - self, - layer: AttentionLayer, - q: torch.Tensor, # query in unified attn - k_c_normed: torch.Tensor, # key in unified attn - k_pe: torch.Tensor, # value in unified attn - kv_cache: torch.Tensor, - attn_metadata: T, - 
output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - if output is not None: - raise NotImplementedError( - "output is not yet supported for MLAImplBase") - - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for MLAImplBase") - - if attn_metadata.is_profile_run and \ - attn_metadata.context_chunk_workspace is not None: - # During the profile run try to simulate to worse case output size - # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context` - # since this can be large - _ = torch.empty( - (attn_metadata.context_chunk_workspace.shape[0], - self.num_heads, self.qk_nope_head_dim + self.v_head_dim), - device=k_c_normed.device, - dtype=k_c_normed.dtype, - ) - - has_decode = attn_metadata.decode_metadata is not None - has_prefill = attn_metadata.prefill_metadata is not None - - num_prefill_tokens: int = attn_metadata.num_prefill_tokens - q = q.view(-1, self.num_heads, self.qk_head_dim) - - decode_q = q[num_prefill_tokens:] - - prefill_q = q[:num_prefill_tokens] - prefill_k_pe = k_pe[:num_prefill_tokens] - prefill_k_c_normed = k_c_normed[:num_prefill_tokens] - - # write the latent and rope to kv cache - if kv_cache.numel() > 0: - ops.concat_and_cache_mla( - k_c_normed, - k_pe.squeeze(1), - kv_cache, - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype=self.kv_cache_dtype, - scale=layer._k_scale, - ) - - output = torch.empty(attn_metadata.num_prefill_tokens + - attn_metadata.num_decode_tokens, - self.v_head_dim * self.num_heads, - device=q.device, - dtype=q.dtype) - if has_prefill: - output[:num_prefill_tokens] = self._forward_prefill( - prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache, - attn_metadata, layer._k_scale) - - if has_decode: - decode_q_nope, decode_q_pe = decode_q.split( - [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - # Convert from (B, N, P) to (N, B, P) - decode_q_nope = decode_q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - decode_ql_nope = decode_ql_nope.transpose(0, 1) - - output[num_prefill_tokens:] = self._forward_decode( - decode_ql_nope, decode_q_pe, kv_cache, attn_metadata) - - return output diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py deleted file mode 100644 index 587d08858b92b..0000000000000 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ /dev/null @@ -1,407 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from contextlib import contextmanager -from dataclasses import dataclass -from typing import Optional, Type, Union - -import torch - -import vllm.envs as envs -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata, - MLACommonMetadataBuilder, - MLACommonState) -from vllm.attention.backends.utils import (compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd, - get_aiter_mla_metadata) - - -def is_aiter_mla_enabled() -> bool: - return envs.VLLM_ROCM_USE_AITER \ - and envs.VLLM_ROCM_USE_AITER_MLA - - -class AiterMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "ROCM_AITER_MLA" - - @staticmethod - def get_impl_cls() -> 
Type["AiterMLAImpl"]: - return AiterMLAImpl - - @staticmethod - def get_metadata_cls() -> Type["AiterMLAMetadata"]: - return AiterMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["AiterMLAMetadataBuilder"]: - return AiterMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["AiterMLAState"]: - return AiterMLAState - - -@dataclass -class AiterMLAMetadata(MLACommonMetadata): - # The following 5 tensors are for current version of AITER MLA - block_table_bound: Optional[torch.Tensor] = None - # The indptr of the paged kv cache, shape: [batch_size + 1] - paged_kv_indptr: Optional[torch.Tensor] = None - # The page indices of the paged kv cache - paged_kv_indices: Optional[torch.Tensor] = None - # The number of entries in the last page of each request in - # the paged kv cache, shape: [batch_size] - paged_kv_last_page_lens: Optional[torch.Tensor] = None - - # This is just to make new AITER MLA API work - # -- MTP support is not added yet. - qo_indptr: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self): - prefill_metadata = super().prefill_metadata - self._cached_prefill_metadata = prefill_metadata - - if prefill_metadata is not None: - prefill_metadata.paged_kv_indptr = self.paged_kv_indptr - prefill_metadata.paged_kv_indices = self.paged_kv_indices - prefill_metadata\ - .paged_kv_last_page_lens = self.paged_kv_last_page_lens - prefill_metadata.block_table_bound = self.block_table_bound - prefill_metadata.qo_indptr = self.qo_indptr - - # update the cache - self._cached_prefill_metadata = self.__class__( - **prefill_metadata.__dict__) - - return self._cached_prefill_metadata - - @property - def decode_metadata(self): - decode_metadata = super().decode_metadata - - self._cached_decode_metadata = decode_metadata - - if decode_metadata is not None: - decode_metadata.paged_kv_indptr = self.paged_kv_indptr - decode_metadata.paged_kv_indices = self.paged_kv_indices - decode_metadata\ - .paged_kv_last_page_lens = self.paged_kv_last_page_lens - decode_metadata.block_table_bound = self.block_table_bound - decode_metadata.qo_indptr = self.qo_indptr - - # update the cache - self._cached_decode_metadata = self.__class__( - **decode_metadata.__dict__) - - return self._cached_decode_metadata - - -class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - BLOCK_TABLE_EXTENDER: list[list[int]] = [[]] - - def __init__(self, input_builder): - super().__init__(input_builder) - assert self.block_size == 1, "AITER MLA requires only block size 1." - - def prepare(self): - super().prepare() - self.paged_kv_indices: list[int] = [] - self.paged_kv_indptr: list[int] = [0] - self.paged_kv_last_page_lens: list[int] = [] - self.total_blocks = 0 - self.qo_indptr: list[int] = [0] - - def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool, - prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - if is_profile_run: - return - - # Update paged_kv_* tensors only for non-profile run - block_table = block_tables[seq_id] - self._update_paged_kv_tensors(block_table, seq_len) - - def _update_paged_kv_tensors(self, block_table: list[int], seq_len: int): - # Get the number of valid blocks based on sequence length. - # If seq_len = 16, block_size = 16, - # block_table_bound is 1 with 1 valid block. - # If seq_len = 15, block_size = 16, - # block_table_bound is 0 + 1 with 1 valid block. 
- self.total_blocks += len(block_table) - block_table_bound = seq_len // self.block_size + 1 \ - if seq_len % self.block_size != 0 \ - else seq_len // self.block_size - self.paged_kv_indices.extend(block_table[:block_table_bound]) - self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + - block_table_bound) - self.qo_indptr.append(self.qo_indptr[-1] + 1) - - last_page_len = seq_len % self.block_size - if last_page_len == 0: - last_page_len = self.block_size - self.paged_kv_last_page_lens.append(last_page_len) - - def build(self, seq_lens: list[int], query_lens: list[int], - cuda_graph_pad_size: int, batch_size: int) -> AiterMLAMetadata: - metadata = super().build(seq_lens, query_lens, cuda_graph_pad_size, - batch_size) - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - if use_captured_graph: - last_paged_kv_indptr = self.paged_kv_indptr[-1] - self.paged_kv_indptr.extend([last_paged_kv_indptr] * - cuda_graph_pad_size) - self.paged_kv_last_page_lens.extend([0] * cuda_graph_pad_size) - last_qo_indptr = self.qo_indptr[-1] - self.qo_indptr.extend([last_qo_indptr] * cuda_graph_pad_size) - - # For current version of AITER MLA - if len(self.paged_kv_indptr) > 0: - # extend to the maximum number of blocks as returned by the - # scheduler - self.paged_kv_indices.extend( - [0] * (self.total_blocks - len(self.paged_kv_indices))) - paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices, - device=device, - dtype=torch.int) - paged_kv_indptr_tensor = torch.tensor(self.paged_kv_indptr, - device=device, - dtype=torch.int) - paged_kv_last_page_lens_tensor = torch.tensor( - self.paged_kv_last_page_lens, device=device, dtype=torch.int) - block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) - - 1, - device=device, - dtype=torch.int) - - qo_indptr = torch.tensor(self.qo_indptr, - device=device, - dtype=torch.int) - else: - paged_kv_indices_tensor = None - paged_kv_indptr_tensor = None - paged_kv_last_page_lens_tensor = None - block_table_bound_tensor = None - qo_indptr = None - - metadata.paged_kv_indptr = paged_kv_indptr_tensor - metadata.paged_kv_indices = paged_kv_indices_tensor - metadata.paged_kv_last_page_lens = paged_kv_last_page_lens_tensor - metadata.block_table_bound = block_table_bound_tensor - metadata.qo_indptr = qo_indptr - - return metadata - - -class AiterMLAState(MLACommonState[AiterMLAMetadata]): - - @contextmanager - def graph_capture(self, max_batch_size: int): - kv_indices, kv_indptr, last_page_lens, qo_indptr = \ - get_aiter_mla_metadata( - max_batch_size=max_batch_size, - block_size=self.runner.block_size, - max_block_per_batch=\ - self.runner.get_max_block_per_batch(), - device=self.runner.device) - self._paged_kv_indices_tensor = kv_indices - self._paged_kv_indptr_tensor = kv_indptr - self._paged_kv_last_page_lens_tensor = last_page_lens - self._qo_indptr_tensor = qo_indptr - - with super().graph_capture(max_batch_size): - yield - - del self._paged_kv_indices_tensor - del self._paged_kv_indptr_tensor - del self._paged_kv_last_page_lens_tensor - del self._qo_indptr_tensor - - def graph_capture_get_metadata_for_batch( - self, - batch_size: int, - is_encoder_decoder_model: bool = False) -> AiterMLAMetadata: - - metadata = super().graph_capture_get_metadata_for_batch( - batch_size, is_encoder_decoder_model) - - paged_kv_indptr = self._paged_kv_indptr_tensor[:batch_size + 1] - paged_kv_indices = self._paged_kv_indices_tensor - paged_kv_last_page_lens = self._paged_kv_last_page_lens_tensor[: - batch_size] - qo_indptr = 
self._qo_indptr_tensor[:batch_size + 1] - - metadata.paged_kv_indptr = paged_kv_indptr - metadata.paged_kv_indices = paged_kv_indices - metadata.paged_kv_last_page_lens = paged_kv_last_page_lens - metadata.qo_indptr = qo_indptr - - return metadata - - def get_graph_input_buffers(self, - attn_metadata: AiterMLAMetadata, - is_encoder_decoder_model: bool = False): - input_buffers = super().get_graph_input_buffers( - attn_metadata, is_encoder_decoder_model) - input_buffers[ - 'paged_kv_indptr'] = attn_metadata.decode_metadata.paged_kv_indptr - input_buffers[ - "paged_kv_indices"] = attn_metadata.\ - decode_metadata.paged_kv_indices - input_buffers[ - "paged_kv_last_page_lens"] = attn_metadata.\ - decode_metadata.paged_kv_last_page_lens - input_buffers['qo_indptr'] = attn_metadata.qo_indptr - - return input_buffers - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata: AiterMLAMetadata, - is_encoder_decoder_model: bool = False): - super().prepare_graph_input_buffers(input_buffers, attn_metadata, - is_encoder_decoder_model) - - num_total_blocks = attn_metadata.decode_metadata.paged_kv_indices.shape[ - 0] - input_buffers["paged_kv_indptr"].copy_( - attn_metadata.decode_metadata.paged_kv_indptr, non_blocking=True) - input_buffers["paged_kv_indices"][:num_total_blocks].copy_( - attn_metadata.decode_metadata.paged_kv_indices, non_blocking=True) - input_buffers["paged_kv_last_page_lens"].copy_( - attn_metadata.decode_metadata.paged_kv_last_page_lens, - non_blocking=True) - input_buffers["qo_indptr"].copy_( - attn_metadata.decode_metadata.qo_indptr, non_blocking=True) - - -class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[list[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "Aiter MLA does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - from aiter import flash_attn_varlen_func - self.flash_attn_varlen_func = flash_attn_varlen_func - - def _flash_attn_varlen_diff_headdims( - self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - softmax_scale: float, return_softmax_lse: bool, - **kwargs) -> Union[tuple[torch.Tensor, ...], torch.Tensor]: - output = self.flash_attn_varlen_func( - q, - k, - v, - **kwargs, - ) - - return output - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: AiterMLAMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - B = q_nope.shape[0] - - q = torch.cat([q_nope, q_pe], dim=-1) - o = torch.empty(B, - self.num_heads, - self.kv_lora_rank, - dtype=q.dtype, - device=q.device) - - kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) - - aiter_mla_decode_fwd(q, kv_buffer, o, self.scale, - attn_metadata.qo_indptr, - attn_metadata.max_query_len, - attn_metadata.paged_kv_indptr, - attn_metadata.paged_kv_indices, - 
attn_metadata.paged_kv_last_page_lens)
-
-        return self._v_up_proj(o)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
deleted file mode 100644
index 9262144e37b54..0000000000000
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ /dev/null
@@ -1,953 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer for ROCm GPUs."""
-import itertools
-from dataclasses import dataclass
-from functools import cache
-from typing import List, Optional, Tuple, Type
-
-import torch
-
-import vllm.envs as envs
-from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType)
-from vllm.attention.backends.utils import (CommonAttentionState,
-                                           CommonMetadataBuilder)
-from vllm.attention.ops.paged_attn import (PagedAttention,
-                                           PagedAttentionMetadata)
-from vllm.config import get_current_vllm_config
-from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym)
-from vllm.platforms import current_platform
-
-logger = init_logger(__name__)
-_PARTITION_SIZE_ROCM = 256
-
-
-@cache
-def is_rocm_aiter_paged_attn_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN \
-        and envs.VLLM_ROCM_USE_AITER
-
-
-@cache
-def _get_paged_attn_module() -> PagedAttention:
-    """
-    Initializes the appropriate PagedAttention module from `attention/ops`,
-    which is used as a helper function
-    by `ROCmFlashAttentionImpl` and `ROCmFlashAttentionBackend`.
-
-    The choice of attention module depends on whether
-    AITER paged attention is enabled:
-    - If enabled, `ROCmFlashAttentionImpl` uses `AITERPagedAttention`.
-    - Otherwise, it defaults to using the original `PagedAttention`.
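-
-    Both environment flags must be truthy for the AITER module to be
-    selected; for example (assuming a ROCm build with the AITER ops
-    available):
-
-        VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=1 \
-            vllm serve <model>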
- """ - if is_rocm_aiter_paged_attn_enabled(): - # Import AITERPagedAttention only when the flag is enabled - from vllm.attention.ops.rocm_aiter_paged_attn import ( - AITERPagedAttention) - return AITERPagedAttention() - return PagedAttention() - - -class ROCmFlashAttentionBackend(AttentionBackend): - accept_output_buffer: bool = True - - @staticmethod - def get_name() -> str: - return "ROCM_FLASH" - - @staticmethod - def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: - return ROCmFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return ROCmFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["ROCmFlashAttentionMetadataBuilder"]: - return ROCmFlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - paged_attn = _get_paged_attn_module() - return paged_attn.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - paged_attn = _get_paged_attn_module() - paged_attn.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - paged_attn = _get_paged_attn_module() - paged_attn.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- seq_start_loc: Optional[torch.Tensor] = None - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.block_tables is not None - - self._cached_prefill_metadata = ROCmFlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=self.slot_mapping[:self.num_prefill_tokens], - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills], - block_tables=self.block_tables[:self.num_prefills], - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
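-            # Slicing convention used throughout this property (sketch):
-            # with num_prefills == 2 and three decodes,
-            # seq_lens == [s0, s1, d0, d1, d2] splits into
-            # seq_lens[:2] for prefill and seq_lens[2:] for decode; the
-            # same [:num_prefills] / [num_prefills:] split is applied to
-            # the tensors above and in decode_metadata below.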
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.block_tables is not None - assert self.seq_lens_tensor is not None - - self._cached_decode_metadata = ROCmFlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=self.slot_mapping[self.num_prefill_tokens:], - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - max_query_len=None, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - block_tables=self.block_tables[self.num_prefills:], - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - # Batch may be composed of prefill|decodes, adjust query start indices - # to refer to the start of decodes when the two are split apart. - # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - if self._cached_decode_metadata.query_start_loc is not None: - qs = self._cached_decode_metadata.query_start_loc - self._cached_decode_metadata.query_start_loc = qs - qs[0] - return self._cached_decode_metadata - - -class ROCmFlashAttentionMetadataBuilder( - CommonMetadataBuilder[ROCmFlashAttentionMetadata]): - - _metadata_cls = ROCmFlashAttentionMetadata - - -def _make_alibi_bias(alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: Optional[List[int]], - make_attn_mask: bool = True) -> List[torch.Tensor]: - attn_biases = [] - if seq_lens: - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat( - (num_heads, 1, 1)).to(alibi_slopes.device) - bias.mul_(alibi_slopes[:, None, None]) - if make_attn_mask: - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1).to( - alibi_slopes.device) - attn_biases.append((bias + inf_mask).to(dtype)) - else: - attn_biases.append(bias.to(dtype)) - - return attn_biases - - -def _get_seq_len_block_table_args( - attn_metadata: ROCmFlashAttentionMetadata, - attn_type: str, -) -> tuple: - ''' - The particular choice of sequence-length - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. 
- - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths - Encoder attn -> select encoder sequence lengths fields - Encoder-only attn -> select prefill sequence lengths with - bidirectional attention - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention op - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention, encoder-only - - Returns: - - * Appropriate sequence-lengths tensors for query and key - * Appropriate max sequence-length scalar - * Causal masking flag - ''' - - if attn_type == AttentionType.ENCODER: - assert attn_metadata.encoder_seq_lens is not None - assert attn_metadata.encoder_seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)), - device=attn_metadata.encoder_seq_lens_tensor.device, - dtype=attn_metadata.encoder_seq_lens_tensor.dtype) - causal_mask = False - - # No block tables associated with encoder attention - return (query_seq_start_loc, attn_metadata.max_encoder_seq_len, - query_seq_start_loc, attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_lens, causal_mask) - - elif attn_type == AttentionType.ENCODER_ONLY: - # For encoder-only models, we use the prefill sequence lengths - assert attn_metadata.seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - max_seq_len = attn_metadata.max_prefill_seq_len - # Encoder-only models typically use bidirectional attention - causal_mask = False - - return (query_seq_start_loc, max_seq_len, query_seq_start_loc, - max_seq_len, attn_metadata.seq_lens, causal_mask) - - elif attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - assert attn_metadata.seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - query_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - max_seq_len = attn_metadata.max_prefill_seq_len - causal_mask = True - - return (query_seq_start_loc, max_seq_len, query_seq_start_loc, - max_seq_len, attn_metadata.seq_lens, causal_mask) - elif attn_type == AttentionType.ENCODER_DECODER: - assert attn_metadata.seq_lens is not None - assert attn_metadata.encoder_seq_lens_tensor is not None - query_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.seq_lens)), - device=attn_metadata.encoder_seq_lens_tensor.device, - dtype=attn_metadata.encoder_seq_lens_tensor.dtype) - - assert attn_metadata.encoder_seq_lens is not None - assert attn_metadata.seq_lens_tensor is not None - key_seq_start_loc = torch.tensor( - list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)), - device=attn_metadata.seq_lens_tensor.device, - dtype=attn_metadata.seq_lens_tensor.dtype) - causal_mask = False - - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (query_start_loc, attn_metadata.max_prefill_seq_len, - key_seq_start_loc, attn_metadata.max_encoder_seq_len, - attn_metadata.seq_lens, causal_mask) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class 
ROCmFlashAttentionImpl(AttentionImpl):
-    """
-    If the input tensors contain prompt tokens, the layout is as follows:
-    |<--------------- num_prompt_tokens -------------->|
-    |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|
-
-    Otherwise, the layout is as follows:
-    |<------------------ num_generation_tokens (M) ----------------->|
-    |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->|
-
-    Generation tokens can contain padding when cuda-graph is used.
-    Currently, prompt tokens don't contain any padding.
-
-    The prompts might have different lengths, while the generation tokens
-    always have length 1.
-
-    If chunked prefill is enabled, prefill tokens and decode tokens can be
-    batched together in a flattened 1D query.
-
-    |<----- num_prefill_tokens ---->|<------- num_decode_tokens ----------->|
-    |<-prompt_0->|...|<-prompt_N-1->|<-generation_0->|...|<-generation_M-1->|
-
-    Currently, cuda graph is disabled for chunked prefill, meaning there's no
-    padding between prefill and decode tokens.
-    """
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0 "
-                                      "ROCM_FLASH backend.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in ROCm Flash Attention is not supported yet, it "
-                "will fall back to global attention for long context.")
-        if logits_soft_cap is None:
-            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
-            self.logits_soft_cap = 0.0
-        else:
-            self.logits_soft_cap = logits_soft_cap
-        self.attn_type = attn_type
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.alibi_slopes = alibi_slopes
-        self.sliding_window = ((sliding_window, sliding_window)
-                               if sliding_window is not None else (-1, -1))
-        self.kv_cache_dtype = kv_cache_dtype
-
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-
-        self.paged_attn_module = _get_paged_attn_module()
-        supported_head_sizes = self.paged_attn_module.get_supported_head_sizes(
-        )
-
-        if head_size not in supported_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {supported_head_sizes}.")
-
-        self.use_naive_attn = False
-        # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
-        if self.use_triton_flash_attn:
-            if logits_soft_cap is not None:
-                raise ValueError(
-                    "ROCm Triton FlashAttention does not support attention"
-                    " logits soft capping."
- " please try using the ROCm CK " - "FA backend instead by setting the env var " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") - - from vllm.attention.ops.triton_flash_attention import ( # noqa: F401 - triton_attention) - self.triton_attn_func = triton_attention - logger.debug("Using Triton FA in ROCmBackend") - if self.sliding_window != (-1, -1): - logger.warning("ROCm Triton FA does not currently support " - "sliding window attention. If using half " - "precision, please try using the ROCm CK " - "FA backend instead by setting the env var " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") - else: - # if not using triton, navi3x/navi21/navi10 do not use flash-attn - # either - if not current_platform.has_device_capability(90): - self.use_naive_attn = True - else: - try: - from flash_attn import flash_attn_varlen_func # noqa: F401 - self.fa_attn_func = flash_attn_varlen_func - logger.debug("Using CK FA in ROCmBackend") - except ModuleNotFoundError: - self.use_naive_attn = True - - if self.use_naive_attn: - if logits_soft_cap is not None: - raise ValueError( - "ROCm Naive FlashAttention does not support " - "attention logits soft capping.") - - self.sdpa_attn_func = _sdpa_attention - logger.debug("Using naive (SDPA) attention in ROCmBackend") - - self.aiter_kv_scales_initialized = False - self.force_fp8_attention = ( - get_current_vllm_config() is not None - and get_current_vllm_config().model_config.override_attention_dtype - == "fp8") - - def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: - """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" - tokens, n_kv_heads, head_dim = x.shape - return (x[:, :, - None, :].expand(tokens, n_kv_heads, n_rep, - head_dim).reshape(tokens, n_kv_heads * n_rep, - head_dim)) - - def fused_output_quant_supported(self, quant_key: QuantKey): - if self.use_triton_flash_attn: - return quant_key == kFp8StaticTensorSym - - # Only supported in the Triton backend - return False - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: ROCmFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention and PagedAttention. - - For decoder-only models: query, key and value must be non-None. - - For encoder/decoder models: - * ROCmFlashAttentionImpl.forward() may be invoked for both self- and - cross-attention layers. - * For self-attention: query, key and value must be non-None. - * For cross-attention: - * Query must be non-None - * During prefill, key and value must be non-None; key and value - get cached for use during decode. 
- * During decode, key and value may be None, since: - (1) key and value tensors were cached during prefill, and - (2) cross-attention key and value tensors do not grow during - decode - - A note on how the attn_type (attention type enum) argument impacts - attention forward() behavior: - - * DECODER: normal decoder-only behavior; - use decoder self-attention block table - * ENCODER: no KV caching; pass encoder sequence - attributes (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len) - * ENCODER_DECODER: cross-attention behavior; - use cross-attention block table for caching KVs derived - from encoder hidden states; since KV sequence lengths - will match encoder sequence lengths, pass encoder sequence - attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) - * ENCODER_ONLY: bidirectional attention with no KV caching; - use prefill sequence attributes - - Args: - layer: Attention layer instance. - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size * num_kv_heads * head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Optional output tensor. - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is not None, "Output tensor must be provided." - - if output_scale is not None and not self.use_triton_flash_attn: - raise NotImplementedError( - "fused output quantization only supported for Triton" - " implementation in ROCMFlashAttentionImpl for now") - - if output_block_scale is not None: - raise NotImplementedError( - "fused nvfp4 output quantization is not supported" - " for ROCMFlashAttentionImpl") - - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - paged_attn = self.paged_attn_module - - # Reshaping kv tensors is required for AITER paged attention kernel - # because it works on a different tensor shape, - # when the size of one element is one byte (int8/fp8 dtypes). - # This reshaping is only required on the first forward call - # and the kv cache must not be empty. 
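-        # Shape sketch for the scale expansion below (assuming a one-byte
-        # int8/fp8 cache, i.e. kv_cache.dtype.itemsize == 1):
-        #
-        #     block_size = kv_cache.shape[2] // (num_kv_heads * head_size)
-        #     k_scale.shape == (num_kv_heads, num_blocks * block_size)
-        #
-        # i.e. the scalar layer._k_scale is broadcast to one scale entry
-        # per kv-head per cache slot, the per-slot layout consumed by the
-        # AITER paged-attention path.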
- if (is_rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1 - and not self.aiter_kv_scales_initialized - and kv_cache.shape != torch.Size([0])): - num_blocks = kv_cache.shape[1] - block_size = kv_cache.shape[2] // (self.num_kv_heads * - self.head_size) - k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) - v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size), - dtype=torch.float32, - device=kv_cache.device) - self.aiter_kv_scales_initialized = True - k_scale.fill_(layer._k_scale.item()) - v_scale.fill_(layer._v_scale.item()) - layer._k_scale = k_scale - layer._v_scale = v_scale - - # Only update KV cache for decoder self-attention - # and encoder-decoder cross-attention - if self.attn_type not in [ - AttentionType.ENCODER, AttentionType.ENCODER_ONLY - ] and kv_cache.numel() > 0: - key_cache, value_cache = paged_attn.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if key is not None and value is not None: - # Reshape the input keys and values and store them in the - # cache. If kv_cache is not provided, the new key and value - # tensors are not cached. This happens during the initial - # memory profiling run. - paged_attn.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping - if self.attn_type != AttentionType.ENCODER_DECODER else - attn_metadata.cross_slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if self.attn_type != AttentionType.ENCODER: - num_prefill_tokens = attn_metadata.num_prefill_tokens - elif self.attn_type == AttentionType.ENCODER_ONLY: - # For encoder-only models, all tokens are processed in one go - num_prefill_tokens = query.shape[0] - else: - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] - # QKV for prefill. - query = query[:num_prefill_tokens] - - # For encoder-only and encoder models, - # we process all tokens at once - # For decoder and encoder-decoder, - # we may need to limit key/value to prefill tokens - if key is not None and value is not None \ - and self.attn_type not in [AttentionType.ENCODER_DECODER, - AttentionType.ENCODER_ONLY]: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - # normal attention and DECODER - if self.attn_type == AttentionType.DECODER and ( - kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - (query_seq_start_loc, query_max_seq_len, key_seq_start_loc, - key_max_seq_len, seq_lens, - causal_mask) = (prefill_meta.seq_start_loc, - prefill_meta.max_prefill_seq_len, - prefill_meta.seq_start_loc, - prefill_meta.max_prefill_seq_len, - attn_metadata.seq_lens, True) - # prefix-enabled attention and ENCODER/ENCODER_DECODER - else: - (query_seq_start_loc, query_max_seq_len, key_seq_start_loc, - key_max_seq_len, seq_lens, - causal_mask) = _get_seq_len_block_table_args( - prefill_meta, self.attn_type) - # Prompt run. - if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: - # triton attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. 
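-                # _make_alibi_bias (defined above) builds one relative-
-                # position bias per head.  A worked example for seq_len == 3
-                # and a single head with slope s:
-                #
-                #     pos  = [0, 1, 2]
-                #     bias = (pos[None, :] - pos[:, None]) * s
-                #          = [[ 0,  1,  2],
-                #             [-1,  0,  1],
-                #             [-2, -1,  0]] * s
-                #
-                # and with make_attn_mask=True the strict upper triangle is
-                # additionally filled with -inf to enforce causality.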
- attn_masks = None - if self.use_triton_flash_attn: - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, - query.dtype, - seq_lens, - make_attn_mask=causal_mask) # type: ignore - - use_fp8_scales = (layer._q_scale and layer._k_scale - and layer._v_scale and layer._prob_scale - and (self.kv_cache_dtype == "fp8" - or self.force_fp8_attention)) - - full_scales = ( - layer._q_scale.item(), layer._k_scale.item(), - layer._v_scale.item(), - layer._prob_scale.item()) if use_fp8_scales else None - self.triton_attn_func( - query, - key, - value, - output[:num_prefill_tokens], - query_seq_start_loc, - key_seq_start_loc, - query_max_seq_len, - key_max_seq_len, - causal_mask, - self.scale, - attn_masks[0][None] - if attn_masks is not None else None, - full_scales, - output_scale, - ) - elif self.use_naive_attn: - if self.num_kv_heads != self.num_heads: - # Interleave for MQA workaround. - key = self.repeat_kv(key, self.num_queries_per_kv) - value = self.repeat_kv(value, self.num_queries_per_kv) - if self.alibi_slopes is not None: - attn_masks = _make_alibi_bias( - self.alibi_slopes, - query.dtype, - attn_metadata.seq_lens, - make_attn_mask=causal_mask) # type: ignore - query = query.movedim(0, query.dim() - 2) - key = key.movedim(0, key.dim() - 2) - value = value.movedim(0, value.dim() - 2) - # sdpa math backend attention - self.sdpa_attn_func( - query, - key, - value, - output[:num_prefill_tokens], - query_seq_start_loc, - num_prefill_tokens, - self.num_heads, - self.head_size, - self.scale, - attn_masks, - ) - else: - # upstream FA does not support an output arg, copy - output[:num_prefill_tokens] = self.fa_attn_func( - q=query, - k=key, - v=value, - cu_seqlens_q=query_seq_start_loc, - cu_seqlens_k=key_seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=key_max_seq_len, - softmax_scale=self.scale, - causal=causal_mask, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - softcap=self.logits_soft_cap, - ) - - else: - # prefix-enabled attention - - # not applicable for encoder-only models - if self.attn_type != AttentionType.ENCODER_ONLY: - output[:num_prefill_tokens] = paged_attn.forward_prefix( - query, - key, - value, - self.kv_cache_dtype, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window[0], - layer._k_scale, - layer._v_scale, - ) - # Skip decode phase for encoder-only models - if (decode_meta := attn_metadata.decode_metadata) and ( - self.attn_type != AttentionType.ENCODER_ONLY): - # Decoding run. 
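-            # The decode path below either calls the custom ROCm paged
-            # attention kernel or falls back to paged_attn.forward_decode.
-            # The custom kernel processes each sequence in fixed-size
-            # partitions of _PARTITION_SIZE_ROCM (= 256) tokens, sized as:
-            #
-            #     max_num_partitions = (max_seq_len + 255) // 256  # ceil
-            #     tmp_output.shape == (num_seqs, num_heads,
-            #                          max_num_partitions, head_size)
-            #
-            # e.g. max_seq_len == 1000 -> 4 partitions.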
- # Whether to use rocm custom paged attention or not - num_seqs, num_heads, head_size = decode_query.shape - block_size = value_cache.shape[3] - gqa_ratio = num_heads // self.num_kv_heads - from vllm.platforms.rocm import use_rocm_custom_paged_attention - use_custom = use_rocm_custom_paged_attention( - decode_query.dtype, head_size, block_size, gqa_ratio, - decode_meta.max_decode_seq_len, self.sliding_window, - self.kv_cache_dtype, self.alibi_slopes) - - if use_custom: - max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type - != AttentionType.ENCODER_DECODER else - decode_meta.max_encoder_seq_len) - assert max_seq_len is not None - max_num_partitions = ( - (max_seq_len + _PARTITION_SIZE_ROCM - 1) // - _PARTITION_SIZE_ROCM) - assert _PARTITION_SIZE_ROCM % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=query.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - - query_start_loc = None - ops.paged_attention_rocm( - output[num_prefill_tokens:], - exp_sums, - max_logits, - tmp_output, - decode_query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - decode_meta.block_tables - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.cross_block_tables, - decode_meta.seq_lens_tensor - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.encoder_seq_lens_tensor, - query_start_loc, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - output_scale, - ) - else: - # PagedAttention does not support fused quant, manually quantize - if output_scale is None: - out_pa = output[num_prefill_tokens:] - else: - out_pa = torch.empty_like(output[num_prefill_tokens:], - dtype=query.dtype) - - out_pa[:] = paged_attn.forward_decode( - decode_query, - key_cache, - value_cache, - decode_meta.block_tables - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.cross_block_tables, - decode_meta.seq_lens_tensor - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.encoder_seq_lens_tensor, - decode_meta.max_decode_seq_len - if self.attn_type != AttentionType.ENCODER_DECODER else - decode_meta.max_encoder_seq_len, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Manually perform quantization - if output_scale is not None: - out_uq = out_pa.view(-1, self.num_heads * self.head_size) - out_q = output.view(-1, self.num_heads * self.head_size) - ops.scaled_fp8_quant(out_uq, - output_scale, - output=out_q[num_prefill_tokens:]) - - # Reshape the output tensor. 
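# A brief sketch of the flattening view returned below, with hypothetical
# sizes: [num_tokens, num_heads, head_size] -> [num_tokens, num_heads * head_size].
import torch
out = torch.randn(5, 8, 64)
assert out.view(-1, 8 * 64).shape == (5, 512)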
- return output.view(-1, self.num_heads * self.head_size) - - -def _sdpa_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - output: torch.Tensor, - seq_lens: torch.Tensor, - num_tokens: int, - num_heads: int, - head_size: int, - scale: float, - attn_masks: Optional[List[torch.Tensor]] = None, -) -> torch.Tensor: - start = 0 - assert output.shape == (num_tokens, num_heads, head_size) - assert output.dtype == query.dtype - assert output.device == query.device - - for i, seq_len in enumerate(seq_lens): - end = start + seq_len - with torch.nn.attention.sdpa_kernel( - torch.nn.attention.SDPBackend.MATH): - sub_out = torch.nn.functional.scaled_dot_product_attention( - query[:, start:end, :], - key[:, start:end, :], - value[:, start:end, :], - dropout_p=0.0, - is_causal=attn_masks is None, - attn_mask=attn_masks[i] if attn_masks else None, - scale=scale).movedim(query.dim() - 2, 0) - output[start:end, :, :] = sub_out - start = end - - return output diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py deleted file mode 100644 index fba5b5f6bca86..0000000000000 --- a/vllm/attention/backends/triton_mla.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import (MLACommonBackend, - MLACommonImpl, - MLACommonMetadata) -from vllm.attention.ops.triton_decode_attention import decode_attention_fwd - - -class TritonMLABackend(MLACommonBackend): - - @staticmethod - def get_name() -> str: - return "TRITON_MLA" - - @staticmethod - def get_impl_cls() -> Type["TritonMLAImpl"]: - return TritonMLAImpl - - -class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] - if any(unsupported_features): - raise NotImplementedError( - "TritonMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TritonMLAImpl") - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "TritonMLA with FP8 KV cache not yet supported") - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: MLACommonMetadata, - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - B = q_nope.shape[0] - - q = torch.cat([q_nope, q_pe], dim=-1) - o = torch.zeros(B, - self.num_heads, - self.kv_lora_rank, - dtype=q.dtype, - device=q.device) - - num_kv_splits = 4 # TODO: heuristic - - # TODO(lucas) Allocate ahead of time - attn_logits = 
torch.empty( - ( - B, - self.num_heads, - num_kv_splits, - # NOTE(lucas) idk why the +1 is here but sglang has it so we - # just mirror that - self.kv_lora_rank + 1, - ), - dtype=torch.float32, - device=q.device, - ) - - # Add a head dim of 1 - kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2) - kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank] - PAGE_SIZE = kv_c_and_k_pe_cache.size(1) - - # Run MQA - decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, attn_logits, - num_kv_splits, self.scale, PAGE_SIZE) - - return self._v_up_proj(o) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index b28e6a4237cb6..3f15580872c7f 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -338,10 +338,9 @@ class CommonAttentionState(AttentionState): # The encoder decoder model works only with XFormers and # Flash Attention backend. Assert the same. assert self.runner.attn_backend.get_name() in \ - ["XFORMERS", "FLASH_ATTN", "ROCM_FLASH"], \ - f"Expected attn_backend name to be either 'XFORMERS'," \ - f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ - f"got '{self.runner.attn_backend.get_name()}'" + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -360,10 +359,9 @@ class CommonAttentionState(AttentionState): # The encoder decoder model works only with XFormers and # Flash Attention backend. Assert the same. assert self.runner.attn_backend.get_name() in \ - ["XFORMERS", "FLASH_ATTN", "ROCM_FLASH"], \ - f"Expected attn_backend name to be either 'XFORMERS'," \ - f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ - f"got '{self.runner.attn_backend.get_name()}'" + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'" self._add_additional_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py deleted file mode 100644 index 302d3d7ea903f..0000000000000 --- a/vllm/attention/backends/xformers.py +++ /dev/null @@ -1,805 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with xFormers and PagedAttention.""" -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type - -import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (AttentionBias, - BlockDiagonalCausalMask, - BlockDiagonalMask, - LowerTriangularMaskWithTensorBias) - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import ( - CommonAttentionState, CommonMetadataBuilder, - get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, - is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class XFormersBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "XFORMERS" - - @staticmethod - def get_impl_cls() -> 
Type["XFormersImpl"]: - return XFormersImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return XFormersMetadata - - @staticmethod - def get_builder_cls() -> Type["XFormersMetadataBuilder"]: - return XFormersMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for XFormersbackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # FIXME: It is for flash attn. - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None - - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] = None - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["XFormersMetadata"] = None - _cached_decode_metadata: Optional["XFormersMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... 
- - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - encoder_seq_start_loc: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None - self.encoder_attn_bias: Optional[List[AttentionBias]] = None - self.cross_attn_bias: Optional[List[AttentionBias]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - # Construct & cache prefill-phase attention metadata structure - self._cached_prefill_metadata = XFormersMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - # Construct & cache decode-phase attention metadata structure - self._cached_decode_metadata = XFormersMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - - # Batch may be composed of prefill|decodes, adjust query start indices - # to refer to the start of decodes when the two are split apart. - # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - if self._cached_decode_metadata.query_start_loc is not None: - qs = self._cached_decode_metadata.query_start_loc - self._cached_decode_metadata.query_start_loc = qs - qs[0] - return self._cached_decode_metadata - - -def _get_attn_bias( - attn_metadata: XFormersMetadata, - attn_type: str, -) -> Optional[AttentionBias]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. 
- - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return attn_metadata.attn_bias - elif attn_type == AttentionType.ENCODER: - return attn_metadata.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return attn_metadata.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _set_attn_bias( - attn_metadata: XFormersMetadata, - attn_bias: List[Optional[AttentionBias]], - attn_type: str, -) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - attn_metadata.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - attn_metadata.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - attn_metadata.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]): - - _metadata_cls = XFormersMetadata - - -class XFormersImpl(AttentionImpl[XFormersMetadata]): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "XFORMERS backend.") - if logits_soft_cap is not None: - logger.warning_once("XFormers does not support logits soft cap. 
" - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in XFormers is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: Optional[torch.Tensor], - value: Optional[torch.Tensor], - kv_cache: torch.Tensor, - attn_metadata: "XFormersMetadata", - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - output_block_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. - - For decoder-only models: query, key and value must be non-None. - - For encoder/decoder models: - * XFormersImpl.forward() may be invoked for both self- and cross- - attention layers. - * For self-attention: query, key and value must be non-None. - * For cross-attention: - * Query must be non-None - * During prefill, key and value must be non-None; key and value - get cached for use during decode. - * During decode, key and value may be None, since: - (1) key and value tensors were cached during prefill, and - (2) cross-attention key and value tensors do not grow during - decode - - A note on how the attn_type (attention type enum) argument impacts - attention forward() behavior: - - * DECODER: normal decoder-only behavior; - use decoder self-attention block table - * ENCODER: no KV caching; pass encoder sequence - attributes (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len). - Used for encoder branch of encoder-decoder models. - * ENCODER_ONLY: no kv_caching, uses the normal attention - attributes (seq_lens/seq_lens_tensor/max_seq_len). - * ENCODER_DECODER: cross-attention behavior; - use cross-attention block table for caching KVs derived - from encoder hidden states; since KV sequence lengths - will match encoder sequence lengths, pass encoder sequence - attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) - - Args: - layer: Attention layer instance. - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size * num_kv_heads * head_size]. - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - output: Optional output tensor. - output_scale: Optional output scale tensor. - output_block_scale: Optional output block scale tensor. 
- Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for XFormersImpl") - - attn_type = self.attn_type - # Check that appropriate attention metadata attributes are - # selected for the desired attention type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - # Self-attention vs. cross-attention will impact - # which KV cache memory-mapping & which - # seqlen datastructures we utilize - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. for later use by paged attention - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - PagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - - output = torch.empty_like(query) - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_query_tokens:] - # QKV for prefill. - query = query[:num_prefill_query_tokens] - if key is not None and value is not None: - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: - # normal attention. - # block tables are empty if the prompt does not have a cached - # prefix. 
- out = self._run_memory_efficient_xformers_forward( - query, key, value, prefill_meta, attn_type=attn_type) - assert out.shape == output[:num_prefill_query_tokens].shape - output[:num_prefill_query_tokens] = out - else: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have prefix attention.") - - assert prefill_meta.query_start_loc is not None - assert prefill_meta.max_query_len is not None - - # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue (broke) to - # deal with different data types between KV and FP8 KV cache, - # to be addressed separately. - out = PagedAttention.forward_prefix( - query, - key, - value, - self.kv_cache_dtype, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window, - layer._k_scale, - layer._v_scale, - ) - assert output[:num_prefill_query_tokens].shape == out.shape - output[:num_prefill_query_tokens] = out - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - - output[num_prefill_query_tokens:] = PagedAttention.forward_decode( - decode_query, - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. - return output.view(-1, self.num_heads * self.head_size) - - def _run_memory_efficient_xformers_forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: XFormersMetadata, - attn_type: str = AttentionType.DECODER, - ) -> torch.Tensor: - """Attention for 1D query of multiple prompts. Multiple prompt - tokens are flattened in to `query` input. - - See https://facebookresearch.github.io/xformers/components/ops.html - for API spec. - - Args: - query: shape = [num_prefill_tokens, num_heads, head_size] - key: shape = [num_prefill_tokens, num_kv_heads, head_size] - value: shape = [num_prefill_tokens, num_kv_heads, head_size] - attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally - """ - - original_query = query - if self.num_kv_heads != self.num_heads: - # GQA/MQA requires the shape [B, M, G, H, K]. - # Note that the output also has the same shape (which is different - # from a spec from the doc). - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. 
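# An illustrative sketch of the GQA/MQA head expansion described above,
# assuming 2 KV heads shared by 8 query heads (4 queries per KV head); all
# sizes are assumptions for demonstration.
import torch
num_tokens, num_kv_heads, queries_per_kv, head_size = 5, 2, 4, 64
key = torch.randn(num_tokens, num_kv_heads, head_size)
key = key[:, :, None, :].expand(num_tokens, num_kv_heads, queries_per_kv,
                                head_size)
assert key.shape == (5, 2, 4, 64)  # [M, G, H, K]; the batch dim is added later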
- attn_bias = _get_attn_bias(attn_metadata, attn_type) - if attn_bias is None: - if self.alibi_slopes is None: - - # Cross attention block of decoder branch of encoder-decoder - # model uses seq_lens for dec / encoder_seq_lens for enc - if (attn_type == AttentionType.ENCODER_DECODER): - assert attn_metadata.seq_lens is not None - assert attn_metadata.encoder_seq_lens is not None - - # Cross-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, - attn_metadata.encoder_seq_lens, - device=query.device) - - # Encoder branch of encoder-decoder model uses - # attn_metadata.encoder_seq_lens - elif attn_type == AttentionType.ENCODER: - - assert attn_metadata.encoder_seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.encoder_seq_lens, device=query.device) - - # Self-attention block of encoder-only model just - # uses the seq_lens directly. - elif attn_type == AttentionType.ENCODER_ONLY: - assert attn_metadata.seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - - # Self-attention block of decoder branch just - # uses the seq_lens directly - elif attn_type == AttentionType.DECODER: - assert attn_metadata.seq_lens is not None - - # Decoder self-attention mask is causal - attn_bias = BlockDiagonalCausalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - else: - raise ValueError("Unknown AttentionType: %s", attn_type) - - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - attn_bias = [attn_bias] - else: - assert attn_type == AttentionType.DECODER - assert attn_metadata.seq_lens is not None - attn_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, query.dtype, - attn_metadata.seq_lens) - - _set_attn_bias(attn_metadata, attn_bias, attn_type) - - # No alibi slopes. - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - # Add the batch dimension. - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=attn_bias[0], - p=0.0, - scale=self.scale) - return out.view_as(original_query) - - # Attention with alibi slopes. - # FIXME(woosuk): Because xformers does not support dynamic sequence - # lengths with custom attention bias, we process each prompt one by - # one. This is inefficient, especially when we have many short prompts. - assert attn_metadata.seq_lens is not None - output = torch.empty_like(original_query) - start = 0 - for i, seq_len in enumerate(attn_metadata.seq_lens): - end = start + seq_len - out = xops.memory_efficient_attention_forward( - query[None, start:end], - key[None, start:end], - value[None, start:end], - attn_bias=attn_bias[i], - p=0.0, - scale=self.scale) - # TODO(woosuk): Unnecessary copy. Optimize. - output[start:end].copy_(out.view_as(original_query[start:end])) - start += seq_len - return output - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[AttentionBias]: - attn_biases: List[AttentionBias] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. 
We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) - - return attn_biases diff --git a/vllm/config/model.py b/vllm/config/model.py index 95fe52883db0d..33e5d3ea04a48 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,8 +32,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.runai_utils import (ObjectStorageModel, is_runai_obj_uri) from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, - LazyLoader, common_broadcastable_dtype) +from vllm.utils import LayerBlockType, LazyLoader, common_broadcastable_dtype if TYPE_CHECKING: from transformers import PretrainedConfig @@ -1103,10 +1102,6 @@ class ModelConfig: self.hf_config.dual_chunk_attention_config[ "sparse_attention_enabled"] = True - if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: - raise ValueError("please set VLLM_ATTENTION_BACKEND to " - f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") - def verify_with_parallel_config( self, parallel_config: ParallelConfig, diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 911d77ba36fa0..efa4c9abf47f7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -44,7 +44,7 @@ class model_aware_kv_ops_helper: # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading # to a kv_cache shape of [2, num_blks, blk_size, # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim]. - # For more details, see vllm/attention/backends/mla/common.py. + # For more details, see vllm/v1/attention/backends/mla/common.py. if self.is_deepseek_mla and self.use_mla_opt: head_size = model_config.kv_lora_rank + \ model_config.qk_rope_head_dim diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7e00260caa39a..b09d43f705580 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,8 +44,8 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import (get_model_path, is_interleaved, maybe_override_with_speculators) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, get_ip, is_in_ray_actor) +from vllm.utils import (FlexibleArgumentParser, GiB_bytes, get_ip, + is_in_ray_actor) from vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -1163,17 +1163,6 @@ class EngineArgs: self._set_default_args_v0(model_config) assert self.enable_chunked_prefill is not None - if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]: - assert self.enforce_eager, ( - "Cuda graph is not supported with DualChunkFlashAttention. 
" - "To run the model in eager mode, set 'enforce_eager=True' " - "or use '--enforce-eager' in the CLI.") - assert current_platform.is_cuda(), ( - "DualChunkFlashAttention is only supported on CUDA platform.") - assert not use_v1, ( - "DualChunkFlashAttention is not supported on V1 engine. " - "To run the model in V0 engine, try set 'VLLM_USE_V1=0'") - sliding_window: Optional[int] = None if not is_interleaved(model_config.hf_text_config): # Only set CacheConfig.sliding_window if the model is all sliding diff --git a/vllm/envs.py b/vllm/envs.py index 3991a789d80f6..cbd1d5474e60f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -529,7 +529,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "TORCH_SDPA": use torch.nn.MultiheadAttention # - "FLASH_ATTN": use FlashAttention # - "XFORMERS": use XFormers - # - "ROCM_FLASH": use ROCmFlashAttention # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA # - "FLASH_ATTN_MLA": use FlashAttention for MLA diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index c926e17a2c197..7f376b70a7ae0 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -53,13 +53,18 @@ class Mamba2Metadata: def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: """Returns the appropriate metadata classes for the current platform.""" if current_platform.is_rocm(): - from vllm.attention.backends.rocm_flash_attn import ( - ROCmFlashAttentionMetadata) - return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata) - elif current_platform.is_cuda(): - from vllm.attention.backends.flash_attn import FlashAttentionMetadata - from vllm.attention.backends.xformers import XFormersMetadata - return (FlashAttentionMetadata, XFormersMetadata, + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + from vllm.v1.attention.backends.triton_attn import ( + TritonAttentionMetadata) + return (AiterFlashAttentionMetadata, TritonAttentionMetadata, + PlaceholderAttentionMetadata) + if current_platform.is_cuda(): + from vllm.v1.attention.backends.flash_attn import ( + FlashAttentionMetadata) + from vllm.v1.attention.backends.xformers import ( + XFormersAttentionMetadata) + return (FlashAttentionMetadata, XFormersAttentionMetadata, PlaceholderAttentionMetadata) raise ValueError( f"Unsupported platform for Mamba2: {current_platform.device_type}") diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index a99a6679a5696..415d36c681d8c 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -478,7 +478,8 @@ class DeepseekV2MLAAttention(nn.Module): Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
- For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py + For more info see MLACommonImpl in: + vllm/v1/attention/backends/mla/utils.py """ def __init__( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c263e2afe83b0..05f129f513a0a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -226,8 +226,10 @@ class CudaPlatformBase(Platform): kv_cache_dtype, block_size, use_v1, use_mla, has_sink) -> str: if use_mla: - # TODO(lucas): refactor to be more concise - # we should probably consider factoring out V1 here + if not use_v1: + raise RuntimeError( + "MLA attention backends require the V1 engine. " + "Set VLLM_USE_V1=1 to enable them.") from vllm.attention.ops.flashmla import is_flashmla_supported from vllm.attention.utils.fa_utils import flash_attn_supports_mla @@ -246,35 +248,17 @@ class CudaPlatformBase(Platform): use_triton = selected_backend == _Backend.TRITON_MLA or ( selected_backend is None) - def _get_version(name, import_suffix) -> str: - if use_v1: - logger.info_once(f"Using {name} backend on V1 engine.") - return f"vllm.v1.attention.backends.mla.{import_suffix}" - else: - logger.info_once(f"Using {name} backend.") - return f"vllm.attention.backends.{import_suffix}" - if use_cutlassmla: - if use_v1: - logger.info_once("Using Cutlass MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "cutlass_mla.CutlassMLABackend") - else: - logger.warning( - "Cutlass MLA backend is only supported on V1 engine") + logger.info_once("Using Cutlass MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "cutlass_mla.CutlassMLABackend") if use_flashinfermla: - if use_v1: - from vllm.v1.attention.backends.utils import ( - set_kv_cache_layout) - set_kv_cache_layout("HND") - logger.info_once( - "Using FlashInfer MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "flashinfer_mla.FlashInferMLABackend") - else: - logger.warning( - "FlashInfer MLA backend is only supported on V1 engine" - ) + from vllm.v1.attention.backends.utils import ( + set_kv_cache_layout) + set_kv_cache_layout("HND") + logger.info_once("Using FlashInfer MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "flashinfer_mla.FlashInferMLABackend") if use_flashmla: if block_size != 64: logger.warning( @@ -282,20 +266,18 @@ class CudaPlatformBase(Platform): " (currently only supports block size 64).", block_size) else: - return _get_version("FlashMLA", "flashmla.FlashMLABackend") - if use_flashattn: - if use_v1: - logger.info_once( - "Using FlashAttention MLA backend on V1 engine.") + logger.info_once("Using FlashMLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." - "flashattn_mla.FlashAttnMLABackend") - else: - logger.warning( - "FlashAttention MLA backend is only supported on V1 " - "engine.") + "flashmla.FlashMLABackend") + if use_flashattn: + logger.info_once( + "Using FlashAttention MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "flashattn_mla.FlashAttnMLABackend") if use_triton: - return _get_version("Triton MLA", - "triton_mla.TritonMLABackend") + logger.info_once("Using Triton MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." 
+ "triton_mla.TritonMLABackend") if use_v1: FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501 FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 @@ -382,78 +364,9 @@ class CudaPlatformBase(Platform): ) return FLEX_ATTENTION_V1 - # Backends for V0 engine - if selected_backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - return "vllm.attention.backends.xformers.XFormersBackend" - elif selected_backend == _Backend.DUAL_CHUNK_FLASH_ATTN: - logger.info("Using DualChunkFlashAttention backend.") - return ("vllm.attention.backends.dual_chunk_flash_attn." - "DualChunkFlashAttentionBackend") - elif selected_backend == _Backend.DIFFERENTIAL_FLASH_ATTN: - logger.info("Using DifferentialFlashAttention backend.") - return ("vllm.attention.backends.differential_flash_attn." - "DifferentialFlashAttentionBackend") - elif selected_backend == _Backend.FLASH_ATTN: - pass - elif selected_backend: - raise ValueError( - f"Invalid attention backend for {cls.device_name}, " - f"with use_v1: {use_v1} use_mla: {use_mla}") - - target_backend = _Backend.FLASH_ATTN - if not cls.has_device_capability(80): - # Volta and Turing NVIDIA GPUs. - logger.info( - "Cannot use FlashAttention-2 backend for Volta and Turing " - "GPUs.") - target_backend = _Backend.XFORMERS - elif dtype not in (torch.float16, torch.bfloat16): - logger.info( - "Cannot use FlashAttention-2 backend for dtype other than " - "torch.float16 or torch.bfloat16.") - target_backend = _Backend.XFORMERS - elif block_size % 16 != 0: - logger.info( - "Cannot use FlashAttention-2 backend for block size not " - "divisible by 16.") - target_backend = _Backend.XFORMERS - - # FlashAttn is valid for the model, checking if the package is - # installed. - if target_backend == _Backend.FLASH_ATTN: - try: - import vllm.vllm_flash_attn # noqa: F401 - from vllm.attention.backends.flash_attn import ( # noqa: F401 - FlashAttentionBackend, flash_attn_supports_fp8) - - supported_sizes = \ - FlashAttentionBackend.get_supported_head_sizes() - if head_size not in supported_sizes: - logger.info( - "Cannot use FlashAttention-2 backend for head size %d.", - head_size) - target_backend = _Backend.XFORMERS - fp8_kv_cache = (kv_cache_dtype is not None - and kv_cache_dtype.startswith("fp8")) - if (fp8_kv_cache and not flash_attn_supports_fp8()): - logger.info( - "Cannot use FlashAttention backend for FP8 KV cache.") - target_backend = _Backend.XFORMERS - except ImportError: - logger.info( - "Cannot use FlashAttention-2 backend because the " - "vllm.vllm_flash_attn package is not found. " - "Make sure that vllm_flash_attn was built and installed " - "(on by default).") - target_backend = _Backend.XFORMERS - - if target_backend == _Backend.XFORMERS: - logger.info("Using XFormers backend.") - return "vllm.attention.backends.xformers.XFormersBackend" - - logger.info("Using Flash Attention backend.") - return "vllm.attention.backends.flash_attn.FlashAttentionBackend" + raise RuntimeError( + "V0 attention backends have been removed. 
Set VLLM_USE_V1=1 " + "to select a supported backend.") @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index dce2924ac7a9d..9470434aa428b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -191,6 +191,11 @@ class RocmPlatform(Platform): kv_cache_dtype, block_size, use_v1, use_mla, has_sink) -> str: if use_mla: + if not use_v1: + raise RuntimeError( + "MLA attention backends require the V1 engine. " + "Set VLLM_USE_V1=1 to enable them.") + from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( is_aiter_mla_enabled) @@ -201,39 +206,24 @@ class RocmPlatform(Platform): if selected_backend == _Backend.TRITON_MLA: if block_size != 1: - if use_v1: - logger.info_once( - "Using Triton MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "triton_mla.TritonMLABackend") - else: - logger.info("Using Triton MLA backend.") - return "vllm.attention.backends.triton_mla.TritonMLABackend" # noqa: E501 - else: - raise ValueError( - f" The selected backend, {selected_backend.name}," - f"does not support block size {block_size}.") - elif selected_backend == _Backend.ROCM_AITER_MLA \ - or selected_backend == _Backend.ROCM_AITER_MLA_VLLM_V1: - if block_size == 1: - if use_v1: - logger.info("Using AITER MLA backend on V1 engine.") - return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 - else: - logger.info("Using AITER MLA backend") - return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend" # noqa: E501 - else: - raise ValueError( - f" The selected backend, {selected_backend.name}," - f"does not support block size {block_size}." - "(currently only supports block size 1)") - else: + logger.info_once("Using Triton MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." + "triton_mla.TritonMLABackend") raise ValueError( f" The selected backend, {selected_backend.name}," - f"is not MLA type while requested for MLA backend.") - - if selected_backend is None or selected_backend == _Backend.FLASH_ATTN: - selected_backend = _Backend.ROCM_FLASH + f"does not support block size {block_size}.") + if selected_backend in (_Backend.ROCM_AITER_MLA, + _Backend.ROCM_AITER_MLA_VLLM_V1): + if block_size == 1: + logger.info("Using AITER MLA backend on V1 engine.") + return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 + raise ValueError( + f" The selected backend, {selected_backend.name}," + f"does not support block size {block_size}." + "(currently only supports block size 1)") + raise ValueError( + f" The selected backend, {selected_backend.name}," + f"is not MLA type while requested for MLA backend.") if envs.VLLM_USE_V1: if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA \ @@ -245,14 +235,9 @@ class RocmPlatform(Platform): logger.info("Using Triton Attention backend on V1 engine.") return ("vllm.v1.attention.backends." "triton_attn.TritonAttentionBackend") - if selected_backend == _Backend.ROCM_FLASH: - if not cls.has_device_capability(90): - # not Instinct series GPUs. - logger.info("flash_attn is not supported on NAVI GPUs.") - else: - logger.info("%s is not supported in AMD GPUs.", selected_backend) - logger.info("Using ROCmFlashAttention backend.") - return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501 + raise RuntimeError( + "V0 attention backends have been removed. 
Set VLLM_USE_V1=1 " + "to select a supported backend.") @classmethod def set_device(cls, device: torch.device) -> None: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 968bba664f0a9..834ec9b1d30b4 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -157,10 +157,8 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND" # register, corresponding to possible backends STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA" -STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH" STR_XFORMERS_ATTN_VAL: str = "XFORMERS" STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" -STR_DUAL_CHUNK_FLASH_ATTN_VAL: str = "DUAL_CHUNK_FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" MB_bytes = 1_000_000 From 04d3752329e39d11674862be341faaab42ac4d11 Mon Sep 17 00:00:00 2001 From: Yang Liu <127183760+KKSK-DON@users.noreply.github.com> Date: Sun, 21 Sep 2025 16:06:16 -0700 Subject: [PATCH 534/584] [Bugfix][V0 Deprecation][CI] use async mock and await for async method (#25325) Signed-off-by: Yang <lymailforjob@gmail.com> --- .../entrypoints/openai/test_lora_resolvers.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index e2c83b9c40045..9d5ee84a19567 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -5,7 +5,7 @@ from contextlib import suppress from dataclasses import dataclass, field from http import HTTPStatus from typing import Optional -from unittest.mock import MagicMock +from unittest.mock import AsyncMock, MagicMock import pytest @@ -83,20 +83,31 @@ def register_mock_resolver(): def mock_serving_setup(): """Provides a mocked engine and serving completion instance.""" mock_engine = MagicMock(spec=AsyncLLM) - mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - def mock_add_lora_side_effect(lora_request: LoRARequest): + tokenizer = get_tokenizer(MODEL_NAME) + mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer) + + async def mock_add_lora_side_effect(lora_request: LoRARequest): """Simulate engine behavior when adding LoRAs.""" if lora_request.lora_name == "test-lora": # Simulate successful addition - return - elif lora_request.lora_name == "invalid-lora": + return True + if lora_request.lora_name == "invalid-lora": # Simulate failure during addition (e.g. 
invalid format) raise ValueError(f"Simulated failure adding LoRA: " f"{lora_request.lora_name}") + return True + + mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect) + + async def mock_generate(*args, **kwargs): + for _ in []: + yield _ + + mock_engine.generate = MagicMock(spec=AsyncLLM.generate, + side_effect=mock_generate) - mock_engine.add_lora.side_effect = mock_add_lora_side_effect mock_engine.generate.reset_mock() mock_engine.add_lora.reset_mock() @@ -131,7 +142,7 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup, with suppress(Exception): await serving_completion.create_completion(req_found) - mock_engine.add_lora.assert_called_once() + mock_engine.add_lora.assert_awaited_once() called_lora_request = mock_engine.add_lora.call_args[0][0] assert isinstance(called_lora_request, LoRARequest) assert called_lora_request.lora_name == lora_model_name @@ -157,7 +168,7 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup, response = await serving_completion.create_completion(req) - mock_engine.add_lora.assert_not_called() + mock_engine.add_lora.assert_not_awaited() mock_engine.generate.assert_not_called() assert isinstance(response, ErrorResponse) @@ -181,7 +192,7 @@ async def test_serving_completion_resolver_add_lora_fails( response = await serving_completion.create_completion(req) # Assert add_lora was called before the failure - mock_engine.add_lora.assert_called_once() + mock_engine.add_lora.assert_awaited_once() called_lora_request = mock_engine.add_lora.call_args[0][0] assert isinstance(called_lora_request, LoRARequest) assert called_lora_request.lora_name == invalid_model From 5aeb9254521023f97aca292b3478aa7ff485ffb2 Mon Sep 17 00:00:00 2001 From: Deboleina <debroy@redhat.com> Date: Sun, 21 Sep 2025 19:07:11 -0400 Subject: [PATCH 535/584] Multimodal - audio tests (#25285) Signed-off-by: Debolina Roy <debroy@redhat.com> --- tests/multimodal/test_audio.py | 140 +++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/multimodal/test_audio.py diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py new file mode 100644 index 0000000000000..ba39af845041a --- /dev/null +++ b/tests/multimodal/test_audio.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# test_audio.py +import base64 +from pathlib import Path +from unittest.mock import patch + +import numpy as np +import pytest + +from vllm.multimodal.audio import (AudioMediaIO, AudioResampler, + resample_audio_librosa, + resample_audio_scipy) + + +@pytest.fixture +def dummy_audio(): + return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float) + + +def test_resample_audio_librosa(dummy_audio): + with patch("vllm.multimodal.audio.librosa.resample") as mock_resample: + mock_resample.return_value = dummy_audio * 2 + out = resample_audio_librosa(dummy_audio, + orig_sr=44100, + target_sr=22050) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio * 2) + + +def test_resample_audio_scipy(dummy_audio): + out_down = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=2) + out_up = resample_audio_scipy(dummy_audio, orig_sr=2, target_sr=4) + out_same = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=4) + + assert len(out_down) == 3 + assert len(out_up) == 10 + assert np.all(out_same == dummy_audio) + + +@pytest.mark.xfail( + reason="resample_audio_scipy is buggy for 
non-integer ratios") +def test_resample_audio_scipy_non_integer_ratio(dummy_audio): + out = resample_audio_scipy(dummy_audio, orig_sr=5, target_sr=3) + + expected_len = int(round(len(dummy_audio) * 3 / 5)) + assert len(out) == expected_len + + assert isinstance(out, np.ndarray) + assert np.isfinite(out).all() + + +def test_audio_resampler_librosa_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="librosa") + with patch( + "vllm.multimodal.audio.resample_audio_librosa") as mock_resample: + mock_resample.return_value = dummy_audio + out = resampler.resample(dummy_audio, orig_sr=44100) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio) + + +def test_audio_resampler_scipy_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="scipy") + with patch("vllm.multimodal.audio.resample_audio_scipy") as mock_resample: + mock_resample.return_value = dummy_audio + out = resampler.resample(dummy_audio, orig_sr=44100) + mock_resample.assert_called_once_with(dummy_audio, + orig_sr=44100, + target_sr=22050) + assert np.all(out == dummy_audio) + + +def test_audio_resampler_invalid_method(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="invalid") + with pytest.raises(ValueError): + resampler.resample(dummy_audio, orig_sr=44100) + + +def test_audio_resampler_no_target_sr(dummy_audio): + resampler = AudioResampler(target_sr=None) + with pytest.raises(RuntimeError): + resampler.resample(dummy_audio, orig_sr=44100) + + +@pytest.fixture +def dummy_audio_bytes(): + return b"FAKEAUDIOBYTES" + + +def test_audio_media_io_load_bytes(dummy_audio_bytes): + audio_io = AudioMediaIO() + with patch("vllm.multimodal.audio.librosa.load") as mock_load: + mock_load.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_bytes(dummy_audio_bytes) + mock_load.assert_called_once() + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_load_base64(dummy_audio_bytes): + audio_io = AudioMediaIO() + encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8") + with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes: + mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_base64("audio/wav", encoded) + mock_load_bytes.assert_called_once() + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_load_file(): + audio_io = AudioMediaIO() + path = Path("/fake/path.wav") + with patch("vllm.multimodal.audio.librosa.load") as mock_load: + mock_load.return_value = (np.array([0.1, 0.2]), 16000) + out = audio_io.load_file(path) + mock_load.assert_called_once_with(path, sr=None) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 + + +def test_audio_media_io_encode_base64(dummy_audio): + audio_io = AudioMediaIO() + media = (dummy_audio, 16000) + with patch("vllm.multimodal.audio.soundfile.write") as mock_write: + + def write_to_buffer(buffer, *_args, **_kwargs): + buffer.write(b"dummy_wav_data") + + mock_write.side_effect = write_to_buffer + + out = audio_io.encode_base64(media) + decoded = base64.b64decode(out) + assert decoded == b"dummy_wav_data" + mock_write.assert_called_once() From 7b57a433daf94dc075ccf52267b82aae359d0d53 Mon Sep 17 00:00:00 2001 From: Roger Wang <hey@rogerw.io> Date: Sun, 21 Sep 2025 19:24:40 -0700 Subject: [PATCH 536/584] [Model] Support Dots OCR (#24645) Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: yinz-aizip 
<yinz@aizip.ai> --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 18 + tests/models/registry.py | 2 + vllm/model_executor/models/dots_ocr.py | 824 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/dotsocr.py | 69 ++ 7 files changed, 917 insertions(+) create mode 100644 vllm/model_executor/models/dots_ocr.py create mode 100644 vllm/transformers_utils/configs/dotsocr.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index cbc0a56a645ea..9d288667a318f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -352,6 +352,7 @@ th { | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | +| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index de3f3afc17948..f8ddb5a22b31a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -126,6 +126,23 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ) +# Dots-OCR +def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions] + engine_args = EngineArgs( + model="rednote-hilab/dots.ocr", + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1676,6 +1693,7 @@ model_example_map = { "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, + "dots_ocr": run_dots_ocr, "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "ernie45_vl": run_ernie45_vl, diff --git a/tests/models/registry.py b/tests/models/registry.py index e9cc5170ade74..29b6980aaa42a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -448,6 +448,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 + "DotsOCRForCausalLM": _HfExamplesInfo("rednote-hilab/dots.ocr", + trust_remote_code=True), "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py new file mode 100644 index 0000000000000..04fa5584199a3 --- /dev/null +++ b/vllm/model_executor/models/dots_ocr.py @@ 
-0,0 +1,824 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import LayerNorm +from transformers.modeling_utils import PreTrainedModel +from transformers.models.qwen2_vl import Qwen2VLProcessor + +from vllm.attention.layer import check_upstream_fa_availability +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, + SupportsMultiModal, + SupportsPP) +from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM +from vllm.model_executor.models.qwen2_vl import (Qwen2VLDummyInputsBuilder, + Qwen2VLMultiModalProcessor, + Qwen2VLProcessingInfo) +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, + maybe_prefix, + merge_multimodal_embeddings) +from vllm.model_executor.models.vision import get_vit_attn_backend +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.dotsocr import (DotsOCRConfig, + DotsVisionConfig) + +IMAGE_TOKEN = "<|imgpad|>" + + +class DotsOCRImagePixelInputs(TypedDict): + type: Literal["pixel_values", "image_grid_thw"] + + pixel_values: torch.Tensor + image_grid_thw: torch.Tensor + + +class DotsOCRImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds", "image_grid_thw"] + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. 
+ """ + + image_grid_thw: torch.Tensor + + +DotsOCRImageInputs = Union[DotsOCRImagePixelInputs, + DotsOCRImageEmbeddingInputs] + + +class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + return IMAGE_TOKEN * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 + ) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class DotsOCRProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self) -> DotsOCRConfig: + config = self.ctx.get_hf_config() + if not config.__class__.__name__ == 'DotsOCRConfig': + raise TypeError(f"Expected DotsOCRConfig, got {type(config)}") + + if hasattr(config, "vision_config") and isinstance( + config.vision_config, dict): + config.vision_config = DotsVisionConfig(**config.vision_config) + + return config + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_image_tokens = self.get_max_image_tokens() + return {"image": max_image_tokens} + + def get_hf_processor( + self, + **kwargs: object, + ) -> Qwen2VLProcessor: + self.get_tokenizer( + ).image_token = IMAGE_TOKEN # Ensure image token is set + processor = self.ctx.get_hf_processor( + Qwen2VLProcessor, + **kwargs, + ) + processor.image_token = IMAGE_TOKEN + processor.video_token = "<|video_pad|>" + return processor + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision(tensor: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() + + cos = freqs.cos() + sin = freqs.sin() + + cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + + output = (tensor * cos) + (rotate_half(tensor) * sin) + + output = output.to(orig_dtype) + + return output + + +class VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class PatchMerger(nn.Module): + + def __init__( + self, + dim: int, + context_dim: int, + spatial_merge_size: int = 2, + pre_norm="layernorm", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + self.pre_norm = pre_norm + if self.pre_norm == "layernorm": + self.ln_q = LayerNorm(context_dim, eps=1e-6) + elif self.pre_norm == "rmsnorm": + self.ln_q = RMSNorm(context_dim, eps=1e-6) + else: + print("no norm in patch merger") + + self.mlp = nn.Sequential( + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + return_bias=False, + disable_tp=True), + nn.GELU(), + RowParallelLinear(self.hidden_size, + dim, + bias=True, + 
return_bias=False, + disable_tp=True), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.pre_norm: + x = self.mlp(self.ln_q(x).view(-1, self.hidden_size)) + else: + x = self.mlp(x.view(-1, self.hidden_size)) + return x + + +class DotsVisionAttention(nn.Module): + + def __init__(self, + config, + dim: int, + num_heads: int = 16, + bias: bool = True, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__() + from vllm.distributed import (parallel_state, + tensor_model_parallel_all_gather) + from vllm.distributed import utils as dist_utils + + self.embed_dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.num_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + # qkv/proj follow Qwen2-VL style; bias controlled by arg + self.qkv = QKVParallelLinear(hidden_size=dim, + head_size=dim // num_heads, + total_num_heads=num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=dim, + output_size=dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.proj") + self._all_gather = tensor_model_parallel_all_gather + self._split_last = dist_utils.split_tensor_along_last_dim + + # Select attention backend + self.attn_backend = get_vit_attn_backend(self.head_dim, + torch.get_default_dtype()) + self.use_upstream_fa = False + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA + }: + raise RuntimeError( + f"Unsupported vision attention backend: {self.attn_backend}") + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } + + def _split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # qkv: [S, B, 3*dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = self._all_gather(qkv) + q, k, v = qkv.chunk(3, dim=2) + if self.tp_size > 1: + q = self._split_last(q, num_partitions=self.tp_size)[self.tp_rank] + k = self._split_last(k, num_partitions=self.tp_size)[self.tp_rank] + v = self._split_last(v, num_partitions=self.tp_size)[self.tp_rank] + new_shape = (seq_len, bs, self.num_heads_per_partition, self.head_dim) + return (q.view(*new_shape), k.view(*new_shape), v.view(*new_shape)) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + *, + max_seqlen: Optional[int] = None, + seqlens: Optional[list[int]] = None, + ) -> torch.Tensor: + # [S, C] -> [S, B=1, C] + x = hidden_states.unsqueeze(1) + x, _ = self.qkv(x) + q, k, v = self._split_qkv(x) + bs = q.shape[1] + # [S,B,H,D] -> [B,S,H,D] + q = q.permute(1, 0, 2, 3).contiguous() + k = k.permute(1, 0, 2, 3).contiguous() + v = v.permute(1, 0, 2, 3).contiguous() + + if rotary_pos_emb is not None: + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) + + if self.is_flash_attn_backend: + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + if self.use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: 
+ from vllm.vllm_flash_attn import flash_attn_varlen_func + q_ = q.reshape(bs * q.shape[1], q.shape[2], q.shape[3]) + k_ = k.reshape(bs * k.shape[1], k.shape[2], k.shape[3]) + v_ = v.reshape(bs * v.shape[1], v.shape[2], v.shape[3]) + output = flash_attn_varlen_func(q_, + k_, + v_, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False) + context_layer = output.view(bs, -1, self.num_heads_per_partition, + self.head_dim) + elif self.attn_backend == _Backend.TORCH_SDPA: + outputs = [] + for i in range(1, len(cu_seqlens)): + s = int(cu_seqlens[i - 1]) + e = int(cu_seqlens[i]) + q_i = q[:, s:e].permute(0, 2, 1, 3) + k_i = k[:, s:e].permute(0, 2, 1, 3) + v_i = v[:, s:e].permute(0, 2, 1, 3) + out_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + out_i = out_i.permute(0, 2, 1, 3) + outputs.append(out_i) + context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0] + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + else: + raise RuntimeError("Unsupported attention backend") + + # [B,S,H,D] -> [S,B,H*D] -> [S, C] + context_layer = context_layer.permute(1, 0, 2, 3).contiguous() + context_layer = context_layer.view(context_layer.shape[0], bs, -1) + out, _ = self.proj(context_layer) + return out.squeeze(1) + + +class DotsSwiGLUFFN(nn.Module): + + def __init__(self, + config, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + hidden_features = config.intermediate_size + in_features = config.embed_dim + bias = config.use_bias + + # Referenced aimv2.py AIMv2SwiGLUFFN + self.fc13 = MergedColumnParallelLinear(in_features, + [hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc13", + disable_tp=True) + self.fc2 = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + disable_tp=True) + self.act_fn = SiluAndMul() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc13(x) + x = self.act_fn(x) + x, _ = self.fc2(x) + return x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params = dict(self.named_parameters()) + loaded: set[str] = set() + for name, w in weights: + # Map fc1 -> fc13 (shard 0) + if name.startswith("fc1."): + tgt = name.replace("fc1.", "fc13.") + if tgt in params: + params[tgt].weight_loader(params[tgt], w, 0) + loaded.add(tgt) + continue + # Map fc3 -> fc13 (shard 1) + if name.startswith("fc3."): + tgt = name.replace("fc3.", "fc13.") + if tgt in params: + params[tgt].weight_loader(params[tgt], w, 1) + loaded.add(tgt) + continue + # Pass-through for fc2 and others + if name in params: + params[name].weight_loader(params[name], w) + loaded.add(name) + return loaded + + +class DotsPatchEmbed(nn.Module): + + def __init__(self, config): + super().__init__() + self.num_channels = config.num_channels + self.patch_size = config.patch_size + self.temporal_patch_size = config.temporal_patch_size + self.embed_dim = config.embed_dim + self.config = config + self.proj = nn.Conv2d( + config.num_channels, + config.embed_dim, + kernel_size=(config.patch_size, config.patch_size), + 
stride=(config.patch_size, config.patch_size), + ) + self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + + def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: + x = x.view(-1, self.num_channels, self.temporal_patch_size, + self.patch_size, self.patch_size)[:, :, 0] + x = self.proj(x).view(-1, self.embed_dim) + x = self.norm(x) + return x + + +class DotsViTPreprocessor(nn.Module): + + def __init__(self, config): + super().__init__() + self.patch_h = config.patch_size + self.patch_w = config.patch_size + self.embed_dim = config.embed_dim + self.config = config + self.patchifier = DotsPatchEmbed(config) + + def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor: + tokens = self.patchifier(x, grid_thw) + return tokens + + +class DotsVisionBlock(nn.Module): + + def __init__(self, + config, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.attn = DotsVisionAttention( + config, + config.embed_dim, + num_heads=config.num_attention_heads, + bias=config.use_bias, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + self.mlp = DotsSwiGLUFFN(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) + + def forward(self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, + seqlens: Optional[list[int]] = None) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class DotsVisionTransformer(PreTrainedModel): + + def __init__( + self, + config: DotsVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: + super().__init__(config) + self.config = config + self.spatial_merge_size = config.spatial_merge_size + + self.patch_embed = DotsViTPreprocessor(config) + + head_dim = config.embed_dim // config.num_attention_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, dtype=torch.get_default_dtype()) + if self.attn_backend != _Backend.FLASH_ATTN and \ + check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + + # Keep blocks for compatibility with other vision towers + num_layers = (config.num_hidden_layers if num_hidden_layers_override + is None else num_hidden_layers_override) + self.blocks = nn.ModuleList([ + DotsVisionBlock(config, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{i}") + for i in range(num_layers) + ]) + if require_post_norm is None: + require_post_norm = (len(self.blocks) == config.num_hidden_layers) + if require_post_norm and self.config.post_norm: + self.post_trunk_norm = RMSNorm(config.embed_dim, + eps=config.rms_norm_eps) + else: + self.post_trunk_norm = None + + self.merger = PatchMerger( + dim=config.hidden_size, + context_dim=config.embed_dim, + spatial_merge_size=config.spatial_merge_size, + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.patchifier.proj.weight.dtype + + @property + def device(self) -> torch.device: + return 
self.patch_embed.patchifier.proj.weight.device + + def get_pos_ids_by_grid(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + return pos_ids + + def rot_pos_emb(self, grid_thw): + pos_ids = self.get_pos_ids_by_grid(grid_thw) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward(self, hidden_states: torch.Tensor, + grid_thw: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.to(self.dtype) + hidden_states = self.patch_embed(hidden_states, grid_thw) + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + dtype=grid_thw.dtype + if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + for blk in self.blocks: + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + + if self.post_trunk_norm is not None: + hidden_states = self.post_trunk_norm(hidden_states) + + hidden_states = self.merger(hidden_states) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2VLMultiModalProcessor, + info=DotsOCRProcessingInfo, + dummy_inputs=DotsOCRDummyInputsBuilder, +) +class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".attn.qkv_proj.": ".attn.qkv.", + ".attn.out_proj.": ".attn.proj.", + }, + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }, + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|img|><|imgpad|><|endofimg|>" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.config: DotsOCRConfig = vllm_config.model_config.hf_config + self.quant_config = vllm_config.quant_config + self.multimodal_config = vllm_config.model_config.multimodal_config + + if isinstance(self.config.vision_config, dict): + vision_config = DotsVisionConfig(**self.config.vision_config) + self.config.vision_config = vision_config + else: + vision_config = self.config.vision_config + + self.vision_tower = DotsVisionTransformer( + 
vision_config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "vision_tower"), + ) + self.language_model: Qwen2ForCausalLM = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=self.config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[DotsOCRImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return DotsOCRImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return DotsOCRImageEmbeddingInputs(type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _process_image_input( + self, image_input: DotsOCRImageInputs) -> tuple[torch.Tensor, ...]: + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type( + self.vision_tower.dtype) + else: + pixel_values = image_input["pixel_values"].type( + self.vision_tower.dtype) + image_embeds = self.vision_tower( + pixel_values, grid_thw)[:, :self.config.hidden_size] + + # Split concatenated embeddings for each image item. 
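+        # Each image contributes t*h*w patch embeddings, and the 2x2 spatial
+        # merge reduces that count by merge_size**2 (e.g. grid_thw [1, 28, 28]
+        # with merge_size 2 yields 28*28 // 4 = 196 embeddings for that image).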
+ merge_size = self.vision_tower.spatial_merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() + + return image_embeds.split(sizes) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return [] + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_id, + ) + + return inputs_embeds + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None and kwargs.get("pixel_values") is not None: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + inputs_embeds = None + else: + assert input_ids is not None + inputs_embeds = self.get_multimodal_embeddings( + input_ids, + image_input=image_input, + ) + input_ids = None + + hidden_states = self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 5dc5d545bb9c5..86123bc092b9b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -219,6 +219,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), + "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"), "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 91bfeb8c55ee5..52fa49ad302bd 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -9,6 +9,7 @@ Model configs may be defined in this directory for the following reasons: from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config +from 
vllm.transformers_utils.configs.dotsocr import DotsOCRConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the @@ -36,6 +37,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", "DeepseekVLV2Config", + "DotsOCRConfig", "EAGLEConfig", "RWConfig", "JAISConfig", diff --git a/vllm/transformers_utils/configs/dotsocr.py b/vllm/transformers_utils/configs/dotsocr.py new file mode 100644 index 0000000000000..6bb3c12d9c7eb --- /dev/null +++ b/vllm/transformers_utils/configs/dotsocr.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +from transformers.configuration_utils import PretrainedConfig +from transformers.models.qwen2 import Qwen2Config + + +class DotsVisionConfig(PretrainedConfig): + model_type: str = "dots_vit" + + def __init__( + self, + embed_dim: int = 1536, # vision encoder embed size + hidden_size: int = 1536, # after merger hidden size + intermediate_size: int = 4224, + num_hidden_layers: int = 42, + num_attention_heads: int = 12, + num_channels: int = 3, + patch_size: int = 14, + spatial_merge_size: int = 2, + temporal_patch_size: int = 1, + rms_norm_eps: float = 1e-5, + use_bias: bool = False, + attn_implementation="flash_attention_2", + initializer_range=0.02, + init_merger_std=0.02, + is_causal=False, # ve causal forward + post_norm=True, + gradient_checkpointing=False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.rms_norm_eps = rms_norm_eps + self.use_bias = use_bias + self.attn_implementation = attn_implementation + self.initializer_range = initializer_range + self.init_merger_std = init_merger_std + self.is_causal = is_causal + self.post_norm = post_norm + self.gradient_checkpointing = gradient_checkpointing + + +class DotsOCRConfig(Qwen2Config): + model_type = "dots_ocr" + + def __init__(self, + image_token_id=151665, + video_token_id=151656, + vision_config: Optional[dict] = None, + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.vision_config = DotsVisionConfig(**(vision_config or {})) + + def save_pretrained(self, save_directory, **kwargs): + self._auto_class = None + super().save_pretrained(save_directory, **kwargs) From 793be8d0579f11b3326f7e2099432c1433033571 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Mon, 22 Sep 2025 10:49:13 +0800 Subject: [PATCH 537/584] [Docs] GSM8K Accuracy Evaluation doc update (#25360) Signed-off-by: David Chen <530634352@qq.com> --- tests/evals/gsm8k/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/evals/gsm8k/README.md b/tests/evals/gsm8k/README.md index 58572c3a6fbc1..29c5199e1e87a 100644 --- a/tests/evals/gsm8k/README.md +++ b/tests/evals/gsm8k/README.md @@ -19,7 +19,7 @@ pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \ vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000 # Run evaluation 
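# (expects the vllm serve process started above to still be listening on port 8000)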
-python tests/gsm8k/gsm8k_eval.py --port 8000 +python tests/evals/gsm8k/gsm8k_eval.py --port 8000 ``` ## Configuration Format From 0eecb3166365a29db117c2aff6ca441b484b514d Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Mon, 22 Sep 2025 11:35:39 +0800 Subject: [PATCH 538/584] [Bugfix] Fix hermes tool parser handling of non-string argument types (#22002) Signed-off-by: wangzi <3220100013@zju.edu.cn> Signed-off-by: David Chen <530634352@qq.com> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Chauncey <chaunceyjiang@gmail.com> --- .../tool_parsers/test_hermes_tool_parser.py | 131 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 42 +++++- 2 files changed, 166 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index 4bab849f47c27..e0e6b2c07e171 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -45,8 +45,39 @@ TOOLS = [{ }, }] +PRODUCT_TOOLS = [{ + "type": "function", + "function": { + "name": "get_product_info", + "description": "Get detailed information of a product based on its " + "product ID.", + "parameters": { + "type": "object", + "properties": { + "inserted": { + "type": "boolean", + "description": "inserted.", + }, + "product_id": { + "type": "integer", + "description": "The product ID of the product.", + }, + }, + "required": ["product_id", "inserted"], + }, + }, +}] + MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] +PRODUCT_MESSAGES = [{ + "role": + "user", + "content": + "Hi! Do you have any detailed information about the product id " + "7355608 and inserted true?" 
+}] + @pytest.mark.asyncio async def test_non_streaming_tool_call(): @@ -127,3 +158,103 @@ async def test_streaming_tool_call(): print("\n[Streaming Test Passed]") print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") print(f"Reconstructed Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_non_streaming_product_tool_call(): + """Test tool call integer and boolean parameters in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=PRODUCT_MESSAGES, + tools=PRODUCT_TOOLS, + tool_choice="auto", + temperature=0.66, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_product_info" + + arguments = json.loads(tool_call.function.arguments) + assert "product_id" in arguments + assert "inserted" in arguments + + product_id = arguments.get("product_id") + inserted = arguments.get("inserted") + + assert isinstance(product_id, int) + assert product_id == 7355608 + assert isinstance(inserted, bool) + assert inserted is True + + print("\n[Non-Streaming Product Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_product_tool_call(): + """Test tool call integer and boolean parameters in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=PRODUCT_MESSAGES, + tools=PRODUCT_TOOLS, + tool_choice="auto", + temperature=0.66, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_product_info" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "product_id" in arguments + assert "inserted" in arguments + + # Handle type coercion for streaming test as well + product_id = arguments.get("product_id") + inserted = arguments.get("inserted") + + assert isinstance(product_id, int) + assert product_id == 7355608 + assert isinstance(inserted, bool) + assert inserted is True + + print("\n[Streaming Product Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e74c420da1d3c..87595953da067 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -368,16 +368,32 @@ class Hermes2ProToolParser(ToolParser): # case -- 
we now have the first info about arguments available from # autocompleting the JSON elif cur_arguments and not prev_arguments: + # extract the content after {"name": ..., "arguments": + # directly from tool_call_portion as cur_arguments_json, + # since cur_arguments may differ from the original text + # due to partial JSON parsing + # for example, tool_call_portion = + # {"name": "search", "arguments": {"search_request": {" + # but cur_arguments = + # {"search_request": {}} + function_name = current_tool_call.get("name") + match = re.search( + r'\{"name":\s*"' + + re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)', + tool_call_portion.strip(), re.DOTALL) + if match: + cur_arguments_json = match.group(1) + else: + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) logger.debug("finding %s in %s", delta_text, cur_arguments_json) - # get the location where previous args differ from current - if (delta_text not in cur_arguments_json[:-2]): + # get the location where previous args differ from current. + if (delta_text not in cur_arguments_json): return None - args_delta_start_loc = cur_arguments_json[:-2]. \ + args_delta_start_loc = cur_arguments_json. \ rindex(delta_text) + \ len(delta_text) @@ -397,8 +413,20 @@ class Hermes2ProToolParser(ToolParser): # last case -- we have an update to existing arguments. elif cur_arguments and prev_arguments: - if isinstance(delta_text, str) and len(delta_text.rstrip( - )) >= 1 and delta_text.rstrip()[-1] == '}': + # judge whether the tool_call_portion is a complete JSON + try: + json.loads(tool_call_portion) + is_complete_json = True + except Exception: + is_complete_json = False + + # if the delta_text ends with a '}' and tool_call_portion is a + # complete JSON, then the last '}' does not belong to the + # arguments, so we should trim it off + if isinstance(delta_text, str) \ + and len(delta_text.rstrip()) >= 1 \ + and delta_text.rstrip()[-1] == '}' \ + and is_complete_json: delta_text = delta_text.rstrip()[:-1] logger.debug("got diff %s", delta_text) From 6d0b827cbd0510173f6a9e77549d917828e80c76 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 22 Sep 2025 13:58:26 +0800 Subject: [PATCH 539/584] [V0 Deprecation] Remove V0-only methods in multi-modal registry (#25362) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../multimodal/generation/test_qwen2_vl.py | 1 - vllm/multimodal/registry.py | 32 +------------------ 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 8336ebc0d59cb..c8a3513ac7ad1 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -209,7 +209,6 @@ def batch_make_video_embeddings( return visual(pixel_values_on_device, grid_thw=video_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. 
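    # apply_model runs get_image_embeds on the underlying model (one result
    # per worker); the results are concatenated into a single tensor here.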
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 38adbf8f3536a..5d485bc361d11 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -12,8 +12,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, cached_tokenizer_from_config) from vllm.utils import ClassRegistry -from .cache import (BaseMultiModalProcessorCache, - processor_only_cache_from_config) +from .cache import BaseMultiModalProcessorCache from .processing import BaseMultiModalProcessor, BaseProcessingInfo from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, DummyEncoderData, MultiModalProfiler) @@ -176,35 +175,6 @@ class MultiModalRegistry: if mm_limits[key] > 0 } - # TODO: Remove once V0 is gone - def get_max_tokens_by_modality( - self, - model_config: "ModelConfig", - ) -> Mapping[str, int]: - """ - Get the maximum number of tokens from each modality - for profiling the memory usage of a model. - """ - cache = processor_only_cache_from_config(model_config, self) - mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) - max_tokens_per_item = self.get_max_tokens_per_item_by_modality( - model_config, - cache=cache, - ) - - return { - key: mm_limits[key] * max_tokens_per_mm_item - for key, max_tokens_per_mm_item in max_tokens_per_item.items() - } - - # TODO: Remove once V0 is gone - def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: - """ - Get the maximum number of multi-modal tokens - for profiling the memory usage of a model. - """ - return sum(self.get_max_tokens_by_modality(model_config).values()) - def get_mm_limits_per_prompt( self, model_config: "ModelConfig", From f92d952632541889c38f107f0425a5bd4707d9d9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 22 Sep 2025 16:49:19 +0800 Subject: [PATCH 540/584] [V0 Deprecation] Remove `MultiModalPlaceholderMap` (#25366) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/kernels/utils.py | 2 - vllm/attention/backends/abstract.py | 10 --- vllm/attention/backends/placeholder_attn.py | 23 +------ vllm/attention/backends/utils.py | 18 ----- vllm/multimodal/__init__.py | 2 - vllm/multimodal/base.py | 74 +-------------------- vllm/v1/attention/backends/cpu_attn.py | 1 - 7 files changed, 2 insertions(+), 128 deletions(-) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 8d6ce381976b5..39ea07309134b 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -959,7 +959,6 @@ def make_test_metadata( return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -1009,7 +1008,6 @@ def make_test_metadata( return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index ab7ef2112b083..1b392cd7c88d3 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -10,7 +10,6 @@ from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple, import torch from 
vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey -from vllm.multimodal import MultiModalPlaceholderMap class AttentionType: @@ -116,15 +115,6 @@ class AttentionMetadata: # in block 0, and 1st slot in block 1, respectively. slot_mapping: torch.Tensor - # The index maps that relate multi-modal embeddings to the corresponding - # placeholders. - # - # N.B. These aren't really related to attention and don't belong on this - # type -- this is just a temporary solution to make them available to - # `model_executable`. - multi_modal_placeholder_index_maps: Optional[Dict[ - str, MultiModalPlaceholderMap.IndexMap]] - # Enable/disable KV scales calculation. This is so that we can disable the # calculation until after prefill and cuda graph capture. enable_kv_scales_calculation: bool diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f82d28938f45c..cddeb2cf39bf0 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import Dict, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import torch @@ -12,7 +11,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState -from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that @@ -141,8 +139,6 @@ class PlaceholderAttentionMetadata(AttentionMetadata): num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, @@ -178,7 +174,6 @@ class PlaceholderAttentionMetadata(AttentionMetadata): num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=seq_lens_tensor, @@ -210,9 +205,6 @@ class PlaceholderAttentionMetadataBuilder( self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -232,12 +224,6 @@ class PlaceholderAttentionMetadataBuilder( self.context_lens.append(context_len) if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -295,12 +281,6 @@ class PlaceholderAttentionMetadataBuilder( seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - # Placeholders slot_mapping_tensor = torch.empty(0) block_tables = torch.empty(0) @@ -308,7 +288,6 @@ class PlaceholderAttentionMetadataBuilder( return PlaceholderAttentionMetadata( num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 3f15580872c7f..33d8168f8a13c 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend utils""" -from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from itertools import accumulate @@ -15,7 +14,6 @@ from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, from vllm.attention.backends.abstract import AttentionType from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) @@ -135,9 +133,6 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -154,12 +149,6 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - 
self.multimodal_placeholder_maps[modality].extend( - placeholders) - self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -254,16 +243,10 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): self.runner.pin_memory) seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } return self._metadata_cls( # type: ignore num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, @@ -320,7 +303,6 @@ class CommonAttentionState(AttentionState): num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=self._graph_slot_mapping[:batch_size], - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 7ffa732cf3708..8ea79078465e6 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .base import MultiModalPlaceholderMap from .hasher import MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, MultiModalDataDict, MultiModalKwargs, @@ -27,7 +26,6 @@ __all__ = [ "MultiModalKwargs", "MultiModalKwargsItems", "MultiModalPlaceholderDict", - "MultiModalPlaceholderMap", "MultiModalUUIDDict", "NestedTensors", "MULTIMODAL_REGISTRY", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index e0edb3e883ed6..faffddd57199d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -3,83 +3,11 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Generic, NamedTuple, TypeVar +from typing import Generic, TypeVar _T = TypeVar("_T") -class MultiModalPlaceholderMap: - """ - Relates multi-modal embeddings to their corresponding placeholders. - - Note: This is only used in V0. - """ - - class IndexMap(NamedTuple): - src: list[int] - dest: list[int] - - src_ranges: list[range] - """ - The indices of the multi-modal embeddings that will replace the - corresponding placeholder embeddings pointed to by ``dest_ranges``. - """ - - src_len: int - """ - The total number of flattened multi-modal embeddings. - """ - - dest_ranges: list[range] - """ - The indices of the placeholder embeddings that will be replaced by the - multimodal embeddings. - """ - - dest_len: int - """ - The total number of embeddings in the destination tensor. - """ - - def __init__(self): - self.src_ranges = [] - self.src_len = 0 - self.dest_ranges = [] - self.dest_len = 0 - - def extend(self, other: "MultiModalPlaceholderMap"): - """ - Adds the placeholders from another ``MultiModalPlaceholderMap`` to this - instance based on the source and destination tensors being - concatenated. 
- """ - - self.src_ranges.extend( - range(self.src_len + r.start, self.src_len + r.stop) - for r in other.src_ranges) - self.src_len += other.src_len - self.dest_ranges.extend( - range(self.dest_len + r.start, self.dest_len + r.stop) - for r in other.dest_ranges) - self.dest_len += other.dest_len - - def index_map(self) -> "IndexMap": - """ - Finalizes the placeholder map into lists of indices that can be used to - index the source and destination tensors. - """ - - src_indices = [i for r in self.src_ranges for i in r] - dest_indices = [i for r in self.dest_ranges for i in r] - - if len(src_indices) != len(dest_indices): - raise ValueError( - f"The number of source ({len(src_indices)}) and destination " - f"indices ({len(dest_indices)}) must be the same.") - - return self.IndexMap(src=src_indices, dest=dest_indices) - - class MediaIO(ABC, Generic[_T]): @abstractmethod diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 6627164c98798..7e485fea2689d 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -425,7 +425,6 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): num_prompt_req], # prefill query_start_loc=query_start_loc_cpu[:num_reqs + 1], # for logits index - multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=False, ) From 21467f9a1c6219cbd66640e539b1f23221cff375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Mon, 22 Sep 2025 10:50:39 +0200 Subject: [PATCH 541/584] Enable Eagle3 speculative decoding for GPT-OSS model (#25246) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- vllm/config/speculative.py | 2 +- vllm/model_executor/models/gpt_oss.py | 19 ++++++++++++++-- vllm/v1/spec_decode/eagle.py | 32 +++++++++++++++++++-------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 2c861723c3966..d533930e1c7aa 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -527,7 +527,7 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - eagle3_target_supported = ["llama", "qwen"] + eagle3_target_supported = ["llama", "qwen", "gpt_oss"] if self.method == "eagle3" and self.target_model_config and not any( supported_model in self.target_model_config.hf_text_config.model_type diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 4fe59f91124dd..7c755a00e1c98 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .interfaces import SupportsPP +from .interfaces import SupportsEagle3, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -238,6 +238,7 @@ class GptOssModel(nn.Module): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], self.config.hidden_size)) + self.aux_hidden_state_layers = tuple[int, ...]() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embedding(input_ids) @@ -261,8 +262,12 @@ class GptOssModel(nn.Module): x = 
intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + aux_hidden_states = [] for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + if i in self.aux_hidden_state_layers: + aux_hidden_states.append(x if residual is None else x + + residual) x, residual = layer(x, positions, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -270,6 +275,9 @@ class GptOssModel(nn.Module): "residual": residual }) x, _ = self.norm(x, residual) + + if len(aux_hidden_states) > 0: + return x, aux_hidden_states return x def _load_weights_mxfp4( @@ -610,7 +618,7 @@ class GptOssModel(nn.Module): weights, stacked_params_mapping) -class GptOssForCausalLM(nn.Module, SupportsPP): +class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3): packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} hf_to_vllm_mapper = WeightsMapper( @@ -658,6 +666,13 @@ class GptOssForCausalLM(nn.Module, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5dacf60886966..dc97d5c8f39d4 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -823,15 +823,29 @@ class EagleProposer: else: target_language_model = target_model # share embed_tokens with the target model if needed - if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: - logger.info( - "Assuming the EAGLE head shares the same vocab embedding" - " with the target model.") - del self.model.model.embed_tokens - self.model.model.embed_tokens = ( - target_language_model.model.embed_tokens) + if get_pp_group().world_size == 1: + if hasattr(target_language_model.model, 'embed_tokens'): + target_embed_tokens = target_language_model.model.embed_tokens + elif hasattr(target_language_model.model, 'embedding'): + target_embed_tokens = target_language_model.model.embedding + else: + raise AttributeError( + "Target model does not have 'embed_tokens' or 'embedding' " + "attribute") + + # Check if shapes match and we found the embedding + eagle_shape = self.model.model.embed_tokens.weight.shape + target_shape = target_embed_tokens.weight.shape + if eagle_shape == target_shape: + logger.info( + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") + del self.model.model.embed_tokens + self.model.model.embed_tokens = target_embed_tokens + else: + logger.info( + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") else: logger.info( "The EAGLE head's vocab embedding will be loaded separately" From a66d13138137147d5b3fbdfe64afb103fbc52f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Mon, 22 Sep 2025 11:55:04 +0200 Subject: [PATCH 542/584] [TPU][Bugfix][CI] Fix broken tests/build dependency (#25255) Signed-off-by: NickLucche <nlucches@redhat.com> --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 1073a4ee30afa..e76528a178205 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ - && python3 -m pip install --progress-bar off hf-transfer + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 export VLLM_XLA_CHECK_RECOMPILATION=1 diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 505664f3aecd0..69366cd503219 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ - && python3 -m pip install --progress-bar off hf-transfer + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" export VLLM_USE_V1=1 export VLLM_XLA_CHECK_RECOMPILATION=1 From 4cf71cc88a19f97bf6f5db9802395edd646375ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Mon, 22 Sep 2025 12:12:57 +0200 Subject: [PATCH 543/584] [TPU] Deprecate `xm.mark_step` in favor of ``torch_xla.sync` (#25254) Signed-off-by: NickLucche <nlucches@redhat.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> --- tests/tpu/test_moe_pallas.py | 3 +- tests/v1/tpu/test_topk_topp_sampler.py | 11 ++++--- vllm/lora/punica_wrapper/punica_tpu.py | 4 +-- .../model_loader/default_loader.py | 9 +++-- vllm/v1/worker/tpu_model_runner.py | 33 ++++++++++--------- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 407a824d81748..1e5d9d923d004 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -6,6 +6,7 @@ Run `pytest tests/kernels/moe/test_moe_pallas.py`. 
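(A condensed before/after of the migration this commit applies; the hunks below show the real call sites.)

import torch_xla
import torch_xla.core.xla_model as xm

xm.mark_step()              # old, deprecated spelling
torch_xla.sync(wait=False)  # new spelling used throughout this commit;
                            # wait=False mirrors mark_step()'s default
                            # non-blocking behaviour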
""" import pytest import torch +import torch_xla # yapf conflicts with isort for this block # yapf: disable @@ -77,7 +78,7 @@ def test_pallas_moe( expert_map=e_map, renormalize=False, ) - xm.mark_step() + torch_xla.sync(wait=False) # Compare outputs torch.testing.assert_close( diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index 05751badc7619..665cf8cd2629e 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -4,6 +4,7 @@ import math import pytest import torch +import torch_xla from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p @@ -63,7 +64,7 @@ def test_topp_result_sums_past_p(): probs.masked_fill_(logits_masked.isinf(), 0) masked_prob_sum = probs.sum(dim=-1) - xm.mark_step() + torch_xla.sync() # Perform assertion on CPU. assert torch.all(torch.ge(masked_prob_sum.cpu() + TOLERANCE, p.cpu())) @@ -82,7 +83,7 @@ def test_topp_basic(): k=torch.tensor([3, 3]), p=torch.tensor([0.79, 0.79])) - xm.mark_step() + torch_xla.sync() # Expect the smallest elements to be dropped. expected_result = logits.clone().cpu() @@ -104,7 +105,7 @@ def test_topp_select_all(): k=torch.tensor([3, 3]), p=torch.tensor([1.0, 1.0])) - xm.mark_step() + torch_xla.sync() assert torch.allclose(logits.cpu(), result.cpu()) @@ -122,7 +123,7 @@ def test_topp_with_ties(): k=torch.tensor([4]), p=torch.tensor([0.2])) - xm.mark_step() + torch_xla.sync() # All tie values are included in the top-p set. Tie breaking is left # to be done during final sampling (all tie tokens have equal @@ -146,7 +147,7 @@ def test_both_topk_topp(): k=torch.tensor([1, 3]), p=torch.tensor([0.79, 0.79])) - xm.mark_step() + torch_xla.sync() # Since for the first batch k=1, expect only the largest element gets # selected. diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 07dc337a1cc87..5896da516540b 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional, Union import torch import torch.nn.functional as F -import torch_xla.core.xla_model as xm +import torch_xla from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink from vllm.lora.punica_wrapper.utils import convert_mapping @@ -323,7 +323,7 @@ class PunicaWrapperTPU(PunicaWrapperBase): extra_vocab_size: int, ): # Make sure we don't accidentally collect outside operations - xm.mark_step() + torch_xla.sync() # Pad the prompt mapping to avoid running into recompiles on the TPU # TODO: Should this happen inside mapping internally? If so how can we diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index d1bdec21fd974..4b7bcd37d4bc2 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -211,16 +211,15 @@ class DefaultModelLoader(BaseModelLoader): from vllm.platforms.tpu import USE_TPU_COMMONS if not USE_TPU_COMMONS: - # In PyTorch XLA, we should call `xm.mark_step` + # In PyTorch XLA, we should call `torch_xla.sync` # frequently so that not too many ops are accumulated - # in the XLA program. import torch_xla.core.xla_model - # as xm - import torch_xla.core.xla_model as xm + # in the XLA program. 
+ import torch_xla def _xla_weights_iterator(iterator: Generator): for weights in iterator: yield weights - xm.mark_step() + torch_xla.sync(wait=False) weights_iterator = _xla_weights_iterator(weights_iterator) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index dd11b1dcbe94c..4cbf991a14c11 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -10,6 +10,7 @@ import numpy as np import torch import torch.nn as nn # TPU XLA related +import torch_xla import torch_xla.core.xla_model as xm import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr @@ -846,10 +847,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # 2. A list or tuple (length: num_items) of tensors, each of shape # (feature_size, hidden_size) in case the feature size is dynamic # depending on the input multimodal items. - xm.mark_step() + torch_xla.sync(wait=False) curr_group_outputs = self.model.get_multimodal_embeddings( **mm_kwargs_group) - xm.mark_step() + torch_xla.sync(wait=False) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -952,7 +953,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_embeds = self._gather_mm_embeddings(scheduler_output) else: mm_embeds = [] - xm.mark_step() + torch_xla.sync(wait=False) # Prepare inputs, the requests might be split into multiple # executions, combine the result of each execution. start_index = 0 @@ -969,7 +970,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): end_index = self._prepare_inputs(scheduler_output, start_index) input_ids, inputs_embeds = self._get_model_inputs( self.input_ids, mm_embeds) - xm.mark_step() + torch_xla.sync(wait=False) # Run the decoder with set_forward_context( attn_metadata, @@ -1183,7 +1184,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Sync all pending XLA execution during model initialization and weight # loading. - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() if not hasattr(self, "model"): self.model = model @@ -1267,10 +1268,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, lora_requests) -> None: - xm.mark_step() # Captures input updates + torch_xla.sync(wait=False) # Captures input updates super()._set_active_loras(prompt_lora_mapping, token_lora_mapping, lora_requests) - xm.mark_step() # Captures metadata updates + torch_xla.sync(wait=False) # Captures metadata updates def _precompile_mm_encoder(self) -> None: if not self.supports_mm_inputs: @@ -1297,10 +1298,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_items, ) # Run multimodal encoder. - xm.mark_step() + torch_xla.sync(wait=False) mm_embeds = self.model.get_multimodal_embeddings( **batched_dummy_mm_inputs) - xm.mark_step() + torch_xla.sync(wait=False) num_patches = mm_embeds[0].shape[0] items_size = num_patches * num_items @@ -1325,7 +1326,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): a, b = self._get_model_inputs(placeholders_ids, [mm_embeds]) assert a is None - xm.mark_step() + torch_xla.sync(wait=False) # Pre-compile `get_input_embeddings` when mm_embeddings are not # present. Chunk is only made of text, no mm_placeholders. 
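(A schematic of the precompile loop pattern visible in the surrounding hunks; `compile_sizes`, `dummy_inputs_for` and `run_model` are hypothetical stand-ins for the runner's real input builders, so this is a sketch rather than the runner's code.)

import time
import torch_xla
import torch_xla.core.xla_model as xm

start = time.perf_counter()
for num_tokens in compile_sizes:             # hypothetical list of bucketed shapes
    run_model(dummy_inputs_for(num_tokens))  # hypothetical helpers; trace one graph
    torch_xla.sync(wait=False)               # cut the graph at this boundary
xm.wait_device_ops()                         # block until every compile lands
print(f"Precompilation took {time.perf_counter() - start:.2f}s")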
@@ -1336,7 +1337,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): placeholders_ids = placeholders_ids.to(self.device) a, b = self._get_model_inputs(placeholders_ids, []) assert a is None - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() end = time.perf_counter() @@ -1532,11 +1533,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Isolate encoder graph from post-processing to minimize # impact of recompilation until it's fixed. start = time.perf_counter() - xm.mark_step() + torch_xla.sync(wait=False) dummy_encoder_outputs = \ self.model.get_multimodal_embeddings( **batched_dummy_mm_inputs) - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() end = time.perf_counter() logger.info( @@ -1559,7 +1560,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self._dummy_run(num_tokens, self.num_reqs_most_model_len, self.num_blocks_per_most_len_req) - xm.mark_step() + torch_xla.sync(wait=False) xm.wait_device_ops() self.encoder_cache.clear() gc.collect() @@ -1927,11 +1928,11 @@ def replace_set_lora(model): # to a tensor doesn't seem to work anymore. This might be fixed with a # later release of torch_xla. self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias) - xm.mark_step() + torch_xla.sync(wait=False) def _tpu_reset_lora(self, index: int): self._original_reset_lora(index) - xm.mark_step() + torch_xla.sync(wait=False) for _, module in model.named_modules(): if isinstance(module, BaseLayerWithLoRA): From b6f01bd9a7941c3c06175feaf7588fa55c9db646 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 22 Sep 2025 18:22:29 +0800 Subject: [PATCH 544/584] refactor: abstract graph mode support into platform interface (#25161) Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> --- vllm/config/__init__.py | 2 +- vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 7 +++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/xpu.py | 13 +++++++------ 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index e31a78ba33baa..9bb8087a511d5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -503,7 +503,7 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike() or current_platform.is_xpu(): + if current_platform.support_static_graph_mode(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 05f129f513a0a..7baa5a9742f44 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -498,6 +498,10 @@ class CudaPlatformBase(Platform): def support_hybrid_kv_cache(cls) -> bool: return True + @classmethod + def support_static_graph_mode(cls) -> bool: + return True + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c43580ac5da13..8a05c84d4242e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -587,6 +587,13 @@ class Platform: """ return False + @classmethod + def support_static_graph_mode(cls) -> bool: + """ + Returns if the graph mode is supported by the current platform. 
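(Sketch of how the new hook is consumed, based on the vllm/config/__init__.py hunk earlier in this commit; the branch body is elided.)

from vllm.platforms import current_platform

if current_platform.support_static_graph_mode():
    # CUDA and ROCm override this to True and may pick a default
    # cudagraph mode; XPU keeps the base False and asserts NONE.
    ...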
+ """ + return False + @classmethod def use_sync_weight_loader(cls) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 9470434aa428b..0c7b9c2a4abf6 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -477,3 +477,7 @@ class RocmPlatform(Platform): @classmethod def support_hybrid_kv_cache(cls) -> bool: return True + + @classmethod + def support_static_graph_mode(cls) -> bool: + return True diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 4d3bef4b42947..eb591ae4454e4 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -113,12 +113,9 @@ class XPUPlatform(Platform): # lazy import to avoid circular import from vllm.config import CompilationLevel, CUDAGraphMode compilation_config = vllm_config.compilation_config - if compilation_config.cudagraph_mode is None or \ - compilation_config.cudagraph_mode.max_cudagraph_mode() \ - != CUDAGraphMode.NONE: - logger.info("[XPU] CUDA graph is not supported on XPU, disabling " - "cudagraphs. Fallback to cudagraph_mode=NONE") - compilation_config.cudagraph_mode = CUDAGraphMode.NONE + + assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \ + "CUDA graph mode should be NONE on XPU" if vllm_config.lora_config is not None: compilation_config.level = CompilationLevel.NO_COMPILATION @@ -169,6 +166,10 @@ class XPUPlatform(Platform): def support_hybrid_kv_cache(cls) -> bool: return True + @classmethod + def support_static_graph_mode(cls) -> bool: + return False + @classmethod def is_pin_memory_available(cls): return True From 417a164af66a11978f11a9ba64d4d76b4cf3e72a Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 22 Sep 2025 19:04:32 +0800 Subject: [PATCH 545/584] [Misc] Remove unused encoder-decoder error strings (#25374) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/attention/backends/utils.py | 5 --- vllm/utils/__init__.py | 58 -------------------------------- 2 files changed, 63 deletions(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 33d8168f8a13c..63ee8f50825c5 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -18,11 +18,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad logger = init_logger(__name__) -# Error string(s) for encoder/decoder -# unsupported attention scenarios -STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported " - "with encoder/decoder models.") - PAD_SLOT_ID = -1 # Switch to numpy implementation of compute_slot_mapping diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 834ec9b1d30b4..b74b746a35830 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -88,64 +88,6 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 -# Exception strings for non-implemented encoder/decoder scenarios - -# Reminder: Please update docs/features/compatibility_matrix.md -# If the feature combo become valid - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL = \ - "Chunked prefill for encoder/decoder models " + \ - "is not currently supported." 
- -STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( - "Models with logits_soft_cap " - "require FlashInfer backend, which is " - "currently not supported for encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently " - "supported with encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_PP = ("Pipeline parallelism is not " - "currently supported with " - "encoder/decoder models.") - -STR_NOT_IMPL_ENC_DEC_MM = ("Multimodal is not currently " - "supported with encoder/decoder " - "models.") - -STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not " - "currently supported with encoder/" - "decoder models.") - -STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " - "backends currently supported with encoder/" - "decoder models.") - -# Efficiently import all enc/dec error strings -# rather than having to import all of the above -STR_NOT_IMPL_ENC_DEC_ERR_STRS = { - "STR_NOT_IMPL_ENC_DEC_SWA": STR_NOT_IMPL_ENC_DEC_SWA, - "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE": STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - "STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL": - STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL, - "STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP": STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP, - "STR_NOT_IMPL_ENC_DEC_LORA": STR_NOT_IMPL_ENC_DEC_LORA, - "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP, - "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, - "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, - "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, -} - # Constants related to forcing the attention backend selection # String name of register which may be set in order to From 64c824cd787dd610008e1fc91dfe7c06debf909b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:08:25 +0100 Subject: [PATCH 546/584] Make pickle import check fast (#25379) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 5 +- .../{ => pre_commit}/check_pickle_imports.py | 79 ++++--------------- 2 files changed, 18 insertions(+), 66 deletions(-) rename tools/{ => pre_commit}/check_pickle_imports.py (61%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4ea888af3f3e..bf36db7d15ed9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -155,11 +155,10 @@ repos: additional_dependencies: [regex] - id: check-pickle-imports name: Prevent new pickle/cloudpickle imports - entry: python tools/check_pickle_imports.py + entry: python tools/pre_commit/check_pickle_imports.py language: python types: [python] - pass_filenames: false - additional_dependencies: [pathspec, regex] + additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring entry: python tools/validate_config.py diff --git a/tools/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py similarity index 61% rename from tools/check_pickle_imports.py rename to tools/pre_commit/check_pickle_imports.py index fe717121db40d..acbbc1f181d69 100644 --- a/tools/check_pickle_imports.py +++ b/tools/pre_commit/check_pickle_imports.py @@ -1,20 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import sys import regex as re -try: - import pathspec -except ImportError: - print( - "ERROR: The 'pathspec' library is required. 
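(Quick sanity check of the import pattern the hook flags — a sketch using the PICKLE_RE defined below, not part of the patch.)

import regex as re

PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)"
                       r"|from\s+(pickle|cloudpickle)\s+import\b)")

assert PICKLE_RE.match("import pickle")
assert PICKLE_RE.match("import cloudpickle as cp")
assert PICKLE_RE.match("from pickle import loads")
assert not PICKLE_RE.match("import pickletools")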
" - "Install it with 'pip install pathspec'.", - file=sys.stderr) - sys.exit(2) - # List of files (relative to repo root) that are allowed to import pickle or # cloudpickle # @@ -25,7 +15,7 @@ except ImportError: # Before adding new uses of pickle/cloudpickle, please consider safer # alternatives like msgpack or pydantic that are already in use in vLLM. Only # add to this list if absolutely necessary and after careful security review. -ALLOWED_FILES = set([ +ALLOWED_FILES = { # pickle 'vllm/v1/serial_utils.py', 'vllm/v1/executor/multiproc_executor.py', @@ -36,11 +26,9 @@ ALLOWED_FILES = set([ 'tests/tokenization/test_cached_tokenizer.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', - 'vllm/engine/multiprocessing/client.py', 'vllm/distributed/device_communicators/all_reduce_utils.py', 'vllm/distributed/device_communicators/shm_broadcast.py', 'vllm/distributed/device_communicators/shm_object_storage.py', - 'vllm/engine/multiprocessing/engine.py', 'benchmarks/kernels/graph_machete_bench.py', 'benchmarks/kernels/benchmark_lora.py', 'benchmarks/kernels/benchmark_machete.py', @@ -55,65 +43,30 @@ ALLOWED_FILES = set([ 'tests/utils.py', # pickle and cloudpickle 'vllm/utils/__init__.py', - 'vllm/v1/serial_utils.py', - 'vllm/v1/executor/multiproc_executor.py', - 'vllm/transformers_utils/config.py', - 'vllm/model_executor/models/registry.py', - 'vllm/engine/multiprocessing/client.py', - 'vllm/engine/multiprocessing/engine.py', -]) +} PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)" r"|from\s+(pickle|cloudpickle)\s+import\b)") -def is_python_file(path): - return path.endswith('.py') - - -def scan_file(path): +def scan_file(path: str) -> int: with open(path, encoding='utf-8') as f: - for line in f: + for i, line in enumerate(f, 1): if PICKLE_RE.match(line): - return True - return False - - -def load_gitignore(repo_root): - gitignore_path = os.path.join(repo_root, '.gitignore') - patterns = [] - if os.path.exists(gitignore_path): - with open(gitignore_path, encoding='utf-8') as f: - patterns = f.read().splitlines() - # Always ignore .git directory - patterns.append('.git/') - return pathspec.PathSpec.from_lines('gitwildmatch', patterns) + print(f"{path}:{i}: " + "\033[91merror:\033[0m " # red color + "Found pickle/cloudpickle import") + return 1 + return 0 def main(): - repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - spec = load_gitignore(repo_root) - bad_files = [] - for dirpath, _, filenames in os.walk(repo_root): - for filename in filenames: - if not is_python_file(filename): - continue - abs_path = os.path.join(dirpath, filename) - rel_path = os.path.relpath(abs_path, repo_root) - # Skip ignored files - if spec.match_file(rel_path): - continue - if scan_file(abs_path) and rel_path not in ALLOWED_FILES: - bad_files.append(rel_path) - if bad_files: - print("\nERROR: The following files import 'pickle' or 'cloudpickle' " - "but are not in the allowed list:") - for f in bad_files: - print(f" {f}") - print("\nIf this is intentional, update the allowed list in " - "tools/check_pickle_imports.py.") - sys.exit(1) - sys.exit(0) + returncode = 0 + for filename in sys.argv[1:]: + if filename in ALLOWED_FILES: + continue + returncode |= scan_file(filename) + return returncode def test_regex(): @@ -149,4 +102,4 @@ if __name__ == '__main__': if '--test-regex' in sys.argv: test_regex() else: - main() + sys.exit(main()) From 3d2c56b7a958aa2c92f61424e828b1d3c3c933d4 Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:23:45 +0100 Subject: [PATCH 547/584] Make `mypy` behave like a proper pre-commit hook (#25313) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .github/CODEOWNERS | 1 + .pre-commit-config.yaml | 34 ++++----- pyproject.toml | 21 ------ tools/mypy.sh | 35 --------- tools/pre_commit/mypy.py | 140 +++++++++++++++++++++++++++++++++++ vllm/entrypoints/llm.py | 4 +- vllm/entrypoints/renderer.py | 2 +- vllm/utils/__init__.py | 9 ++- vllm/utils/tensor_schema.py | 7 +- 9 files changed, 166 insertions(+), 87 deletions(-) delete mode 100755 tools/mypy.sh create mode 100755 tools/pre_commit/mypy.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 37bd0ace98a97..9d749fe8d3238 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -72,6 +72,7 @@ mkdocs.yaml @hmellor # Linting .markdownlint.yaml @hmellor .pre-commit-config.yaml @hmellor +/tools/pre_commit @hmellor # CPU /vllm/v1/worker/cpu* @bigPYJ1151 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf36db7d15ed9..8ca414ee4269b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,38 +60,32 @@ repos: files: ^requirements/test\.(in|txt)$ - id: mypy-local name: Run mypy for local Python installation - entry: tools/mypy.sh 0 "local" - language: python - types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] + entry: python tools/pre_commit/mypy.py 0 "local" stages: [pre-commit] # Don't run in CI + <<: &mypy_common + language: python + types_or: [python, pyi] + require_serial: true + additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 - entry: tools/mypy.sh 1 "3.9" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.9" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 - entry: tools/mypy.sh 1 "3.10" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.10" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 - entry: tools/mypy.sh 1 "3.11" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.11" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 - entry: tools/mypy.sh 1 "3.12" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.12" + <<: *mypy_common stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts diff --git a/pyproject.toml b/pyproject.toml index f43ae69e00bdd..88c5c4067f5ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,27 +110,6 @@ ignore_missing_imports = true check_untyped_defs = true follow_imports = 
"silent" -# After fixing type errors resulting from follow_imports: "skip" -> "silent", -# move the directory here and remove it from tools/mypy.sh -files = [ - "vllm/*.py", - "vllm/assets", - "vllm/entrypoints", - "vllm/inputs", - "vllm/logging_utils", - "vllm/multimodal", - "vllm/platforms", - "vllm/transformers_utils", - "vllm/triton_utils", - "vllm/usage", -] -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = [ - "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", - # Ignore triton kernels in ops. - 'vllm/attention/ops/.*\.py$' -] - [tool.isort] skip_glob = [ ".buildkite/*", diff --git a/tools/mypy.sh b/tools/mypy.sh deleted file mode 100755 index 63e3b9a916634..0000000000000 --- a/tools/mypy.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -CI=${1:-0} -PYTHON_VERSION=${2:-local} - -if [ "$CI" -eq 1 ]; then - set -e -fi - -if [ $PYTHON_VERSION == "local" ]; then - PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') -fi - -run_mypy() { - echo "Running mypy on $1" - if [ "$CI" -eq 1 ] && [ -z "$1" ]; then - mypy --python-version "${PYTHON_VERSION}" "$@" - return - fi - mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" -} - -run_mypy # Note that this is less strict than CI -run_mypy tests -run_mypy vllm/attention -run_mypy vllm/compilation -run_mypy vllm/distributed -run_mypy vllm/engine -run_mypy vllm/executor -run_mypy vllm/inputs -run_mypy vllm/lora -run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor -run_mypy vllm/plugins -run_mypy vllm/worker -run_mypy vllm/v1 diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py new file mode 100755 index 0000000000000..039cf6075f631 --- /dev/null +++ b/tools/pre_commit/mypy.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Run mypy on changed files. + +This script is designed to be used as a pre-commit hook. It runs mypy +on files that have been changed. It groups files into different mypy calls +based on their directory to avoid import following issues. + +Usage: + python tools/pre_commit/mypy.py <ci> <python_version> <changed_files...> + +Args: + ci: "1" if running in CI, "0" otherwise. In CI, follow_imports is set to + "silent" for the main group of files. + python_version: Python version to use (e.g., "3.10") or "local" to use + the local Python version. + changed_files: List of changed files to check. +""" + +import subprocess +import sys +from typing import Optional + +import regex as re + +FILES = [ + "vllm/*.py", + "vllm/assets", + "vllm/entrypoints", + "vllm/inputs", + "vllm/logging_utils", + "vllm/multimodal", + "vllm/platforms", + "vllm/transformers_utils", + "vllm/triton_utils", + "vllm/usage", +] + +# After fixing errors resulting from changing follow_imports +# from "skip" to "silent", move the following directories to FILES +SEPARATE_GROUPS = [ + "tests", + "vllm/attention", + "vllm/compilation", + "vllm/distributed", + "vllm/engine", + "vllm/executor", + "vllm/inputs", + "vllm/lora", + "vllm/model_executor", + "vllm/plugins", + "vllm/worker", + "vllm/v1", +] + +# TODO(woosuk): Include the code from Megatron and HuggingFace. +EXCLUDE = [ + "vllm/model_executor/parallel_utils", + "vllm/model_executor/models", + "vllm/model_executor/layers/fla/ops", + # Ignore triton kernels in ops. 
+ "vllm/attention/ops", +] + + +def group_files(changed_files: list[str]) -> dict[str, list[str]]: + """ + Group changed files into different mypy calls. + + Args: + changed_files: List of changed files. + + Returns: + A dictionary mapping file group names to lists of changed files. + """ + exclude_pattern = re.compile(f"^{'|'.join(EXCLUDE)}.*") + files_pattern = re.compile(f"^({'|'.join(FILES)}).*") + file_groups = {"": []} + file_groups.update({k: [] for k in SEPARATE_GROUPS}) + for changed_file in changed_files: + # Skip files which should be ignored completely + if exclude_pattern.match(changed_file): + continue + # Group files by mypy call + if files_pattern.match(changed_file): + file_groups[""].append(changed_file) + continue + else: + for directory in SEPARATE_GROUPS: + if re.match(f"^{directory}.*", changed_file): + file_groups[directory].append(changed_file) + break + return file_groups + + +def mypy(targets: list[str], python_version: Optional[str], + follow_imports: Optional[str], file_group: str) -> int: + """ + Run mypy on the given targets. + + Args: + targets: List of files or directories to check. + python_version: Python version to use (e.g., "3.10") or None to use + the default mypy version. + follow_imports: Value for the --follow-imports option or None to use + the default mypy behavior. + file_group: The file group name for logging purposes. + + Returns: + The return code from mypy. + """ + args = ["mypy"] + if python_version is not None: + args += ["--python-version", python_version] + if follow_imports is not None: + args += ["--follow-imports", follow_imports] + print(f"$ {' '.join(args)} {file_group}") + return subprocess.run(args + targets, check=False).returncode + + +def main(): + ci = sys.argv[1] == "1" + python_version = sys.argv[2] + file_groups = group_files(sys.argv[3:]) + + if python_version == "local": + python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + returncode = 0 + for file_group, changed_files in file_groups.items(): + follow_imports = None if ci and file_group == "" else "skip" + if changed_files: + returncode |= mypy(changed_files, python_version, follow_imports, + file_group) + return returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 092d3f276d1c5..c41f44aa47187 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1468,7 +1468,7 @@ class LLM: def _validate_and_add_requests( self, - prompts: Union[PromptType, Sequence[PromptType]], + prompts: Union[PromptType, Sequence[PromptType], DataPrompt], params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, Sequence[PoolingParams]], *, @@ -1478,7 +1478,7 @@ class LLM: ) -> None: if isinstance(prompts, (str, dict)): # Convert a single prompt to a list. 
- prompts = [prompts] + prompts = [prompts] # type: ignore[list-item] num_requests = len(prompts) if isinstance(params, Sequence) and len(params) != num_requests: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index fb859d57be9fe..d7ce57c728ba6 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -280,7 +280,7 @@ class CompletionRenderer(BaseRenderer): if truncate_prompt_tokens < 0: truncate_prompt_tokens = self.model_config.max_model_len - if max_length is not None and truncate_prompt_tokens > max_length: + if max_length is not None and truncate_prompt_tokens > max_length: # type: ignore[operator] raise ValueError( f"truncate_prompt_tokens ({truncate_prompt_tokens}) " f"cannot be greater than max_length ({max_length}). " diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index b74b746a35830..022e35a399c53 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -551,9 +551,10 @@ class AsyncMicrobatchTokenizer: # If every request uses identical kwargs we can run a single # batched tokenizer call for a big speed-up. if can_batch and len(prompts) > 1: - encode_fn = partial(self.tokenizer, prompts, **kwargs) + batch_encode_fn = partial(self.tokenizer, prompts, + **kwargs) results = await self._loop.run_in_executor( - self._executor, encode_fn) + self._executor, batch_encode_fn) for i, fut in enumerate(result_futures): if not fut.done(): @@ -889,7 +890,7 @@ def get_open_port() -> int: def get_open_ports_list(count: int = 5) -> list[int]: """Get a list of open ports.""" - ports = set() + ports = set[int]() while len(ports) < count: ports.add(get_open_port()) return list(ports) @@ -1279,7 +1280,7 @@ def as_list(maybe_list: Iterable[T]) -> list[T]: def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]: if isinstance(obj, str) or not isinstance(obj, Iterable): - obj = [obj] + return [obj] # type: ignore[list-item] return obj diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py index 21d3249fe1547..d75dbcd5401b2 100644 --- a/vllm/utils/tensor_schema.py +++ b/vllm/utils/tensor_schema.py @@ -22,9 +22,8 @@ class TensorShape: self.dims = dims self.dynamic_dims = dynamic_dims if dynamic_dims else set() - def resolve(self, **bindings: dict[str, - int]) -> tuple[Union[int, str], ...]: - resolved = [] + def resolve(self, **bindings: int) -> tuple[Union[int, str], ...]: + resolved = list[Union[int, str]]() for dim in self.dims: if isinstance(dim, str) and dim in bindings: resolved.append(bindings[dim]) @@ -159,7 +158,7 @@ class TensorSchema: def validate(self) -> None: type_hints = get_type_hints(self.__class__, include_extras=True) - shape_env = {} + shape_env = dict[str, int]() for field_name, field_type in type_hints.items(): # Check if field is missing From ac243886b02399d327314b2d8bbf9989fd07f247 Mon Sep 17 00:00:00 2001 From: Sara-KS <50249410+Sara-KS@users.noreply.github.com> Date: Mon, 22 Sep 2025 09:29:54 -0500 Subject: [PATCH 548/584] [Kernel] MI-300X triton moe configs (#23445) Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com> --- ...N=128,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=256,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=512,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=192,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=384,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...N=768,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ 6 files changed, 1200 insertions(+) create mode 
100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..40d86ff8ba324 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..6014d827d7417 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..3622659f3e915 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..311d2e829a050 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..91c4b916b8649 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..8fee30ec70660 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} From c10101a3eba053d22842edd95941d49e4324c727 Mon Sep 17 00:00:00 2001 From: Csrayz <jover@cmbchina.com> Date: Mon, 22 Sep 2025 22:53:13 +0800 Subject: [PATCH 549/584] [Bugfix] Fix several issues with p2p xPyD in GET type (#23993) Signed-off-by: Csrayz <jover@cmbchina.com> Signed-off-by: ivyilike <pww123@cmbchina.com> Co-authored-by: ivyilike <pww123@cmbchina.com> --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 5 +++- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 26 ++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index ec72905a0d3ec..3dadfa595ef1e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -178,6 +178,9 @@ class P2pNcclConnector(KVConnectorBase_V1): # Load the KV for each request each layer for request in metadata.requests: + request_id = request.request_id + ip, port = self.parse_request_id(request_id, False) + remote_address = ip + ":" + str(port + self._rank) for layer_name in forward_context.no_compile_layers: layer = forward_context.no_compile_layers[layer_name] @@ -191,7 +194,7 @@ class P2pNcclConnector(KVConnectorBase_V1): layer = kv_cache[forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( - request.request_id + "#" + layer_name) + request.request_id + "#" + layer_name, remote_address) if kv_cache is None: logger.warning("🚧kv_cache is None, %s", request.request_id) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index fa7cc66ab654d..959bf0277a3f5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -134,7 +134,6 @@ class P2pNcclEngine: # PUT or PUT_ASYNC # tensor_id: torch.Tensor self.send_queue: deque[SendQueueItem] = deque() - self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self.send_async, daemon=True) @@ -143,6 +142,7 @@ class P2pNcclEngine: # tensor_id: torch.Tensor/(addr, dtype, shape) self.recv_store: dict[str, Any] = {} self.recv_request_id_to_tensor_ids: dict[str, set[str]] = {} + self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} self.socks: dict[str, Any] = {} # remote_address: client socket self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) @@ -223,18 +223,26 @@ class P2pNcclEngine: # GET with self.send_store_cv: tensor_size = tensor.element_size() * tensor.numel() + if tensor_size > self.buffer_size_threshold: + logger.warning( + "❗[GET]tensor_id:%s, tensor_size:%d, is greater than" + "buffer size threshold :%d, skip send to %s, rank:%d", + tensor_id, tensor_size, self.buffer_size_threshold, + remote_address, self.rank) + return False while (self.buffer_size + tensor_size > self.buffer_size_threshold): - oldest_tenser_id = next(iter(self.send_store)) - oldest_tenser = self.send_store.pop(oldest_tenser_id) - oldest_tenser_size = oldest_tenser.element_size( - ) * 
- ) * oldest_tenser.numel()
- self.buffer_size -= oldest_tenser_size
- logger.info(
+ assert len(self.send_store) > 0
+ oldest_tensor_id = next(iter(self.send_store))
+ oldest_tensor = self.send_store.pop(oldest_tensor_id)
+ oldest_tensor_size = oldest_tensor.element_size(
+ ) * oldest_tensor.numel()
+ self.buffer_size -= oldest_tensor_size
+ logger.debug(
 "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d,"
- " buffer_size:%d, oldest_tenser_size:%d, rank:%d",
+ " buffer_size:%d, oldest_tensor_size:%d, rank:%d",
 remote_address, tensor_id, tensor_size, self.buffer_size,
- oldest_tenser_size, self.rank)
+ oldest_tensor_size, self.rank)

 self.send_store[tensor_id] = tensor
 self.buffer_size += tensor_size

From 175811e3b53f8f13eb3e8ac6aae02050f6e1412f Mon Sep 17 00:00:00 2001
From: Burkhard Ringlein <ngl@zurich.ibm.com>
Date: Mon, 22 Sep 2025 17:20:28 +0200
Subject: [PATCH 550/584] [V1][Attention] Split triton_attn into triton-only
 and ROCm-specific backends (#24648)

Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
---
 vllm/engine/arg_utils.py | 1 +
 vllm/platforms/interface.py | 1 +
 vllm/platforms/rocm.py | 10 +
 vllm/v1/attention/backends/rocm_attn.py | 426 ++++++++++++++++++++++
 vllm/v1/attention/backends/triton_attn.py | 167 +++------
 5 files changed, 482 insertions(+), 123 deletions(-)
 create mode 100644 vllm/v1/attention/backends/rocm_attn.py

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b09d43f705580..d4d801b155e17 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1494,6 +1494,7 @@ class EngineArgs:
 "FLEX_ATTENTION",
 "TREE_ATTN",
 "XFORMERS_VLLM_V1",
+ "ROCM_ATTN_VLLM_V1",
 ]
 if (envs.is_set("VLLM_ATTENTION_BACKEND")
 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 8a05c84d4242e..cad04ea14c01c 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -67,6 +67,7 @@ class _Backend(enum.Enum):
 FLEX_ATTENTION = enum.auto()
 TREE_ATTN = enum.auto()
 XFORMERS_VLLM_V1 = enum.auto()
+ ROCM_ATTN_VLLM_V1 = enum.auto()


 class PlatformEnum(enum.Enum):
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 0c7b9c2a4abf6..6a49bd4a33866 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -231,7 +231,17 @@ class RocmPlatform(Platform):
 logger.info("Using Flash Attention backend on V1 engine.")
 return ("vllm.v1.attention.backends."
 "rocm_aiter_fa.AiterFlashAttentionBackend")
+ elif (envs.VLLM_ROCM_USE_AITER and
+ envs.VLLM_USE_AITER_UNIFIED_ATTENTION) or \
+ envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION or \
+ selected_backend == _Backend.ROCM_ATTN_VLLM_V1:
+ # ROCm-specific backend, with aiter and/or
+ # triton prefix-prefill
+ logger.info("Using ROCm/Aiter Attention backend on V1 engine.")
+ return ("vllm.v1.attention.backends."
+ "rocm_attn.RocmAttentionBackend")
 else:
+ # default case, using triton unified attention
 logger.info("Using Triton Attention backend on V1 engine.")
 return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend") diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py new file mode 100644 index 0000000000000..365df5f0d6eca --- /dev/null +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with PagedAttention and Triton prefix prefill.""" +from dataclasses import dataclass +from functools import cache +from typing import ClassVar, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode) +from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, kFp8StaticTensorSym) +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec + +logger = init_logger(__name__) + + +@dataclass +class RocmAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + # For cascade attention. + use_cascade: bool + common_prefix_len: int + cu_prefix_query_lens: Optional[torch.Tensor] + prefix_kv_lens: Optional[torch.Tensor] + suffix_kv_lens: Optional[torch.Tensor] + + # Optional aot scheduling + scheduler_metadata: Optional[torch.Tensor] = None + prefix_scheduler_metadata: Optional[torch.Tensor] = None + + +class RocmAttentionMetadataBuilder( + AttentionMetadataBuilder[RocmAttentionMetadata]): + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + + self.block_size = kv_cache_spec.block_size + + model_config = vllm_config.model_config + self.num_heads_q = model_config.get_num_attention_heads( + vllm_config.parallel_config) + self.num_heads_kv = model_config.get_num_kv_heads( + vllm_config.parallel_config) + self.headdim = model_config.get_head_size() + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata + ) -> RocmAttentionMetadata: + attn_metadata = self.build(0, common_attn_metadata) + # When doing full graph capture, setting seq_lens to + # max_model_len will cause graph capture to be extremely + # slow, so here we set it to 1. 
+ attn_metadata.seq_lens.fill_(1) + return attn_metadata + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False) -> RocmAttentionMetadata: + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + + max_seq_len = common_attn_metadata.max_seq_len + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + block_table_tensor = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + use_cascade = common_prefix_len > 0 + + if use_cascade: + cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], + dtype=torch.int32, + device=self.device) + prefix_kv_lens = torch.tensor([common_prefix_len], + dtype=torch.int32, + device=self.device) + suffix_kv_lens = (common_attn_metadata.seq_lens_cpu - + common_prefix_len) + suffix_kv_lens = suffix_kv_lens.to(self.device) + else: + cu_prefix_query_lens = None + prefix_kv_lens = None + suffix_kv_lens = None + prefix_scheduler_metadata = None + + attn_metadata = RocmAttentionMetadata( + num_actual_tokens=num_actual_tokens, + max_query_len=max_query_len, + query_start_loc=query_start_loc, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table_tensor, + slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, + prefix_scheduler_metadata=prefix_scheduler_metadata, + ) + return attn_metadata + + +class RocmAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. 
" + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "ROCM_ATTN_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["RocmAttentionImpl"]: + return RocmAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return RocmAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + @staticmethod + def get_builder_cls() -> type["RocmAttentionMetadataBuilder"]: + return RocmAttentionMetadataBuilder + + +@cache +def use_aiter_unified_attention() -> bool: + """Check if aiter unified attention should be used.""" + # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set + # to 1 as default + return envs.VLLM_ROCM_USE_AITER \ + and envs.VLLM_USE_AITER_UNIFIED_ATTENTION + + +class RocmAttentionImpl(AttentionImpl): + + def fused_output_quant_supported(self, quant_key: QuantKey): + return quant_key == kFp8StaticTensorSym + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + self.kv_cache_dtype = kv_cache_dtype + if logits_soft_cap is None: + # In flash-attn, setting logits_soft_cap as 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + RocmAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "RocmAttentionImpl") + + self.fp8_dtype = current_platform.fp8_dtype() + self.force_prefill_decode_attn = \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + + if not self.force_prefill_decode_attn: + # If not using prefill decode attention, we use the Triton + # unified attention implementation. + if use_aiter_unified_attention(): + logger.info_once( + "Using aiter unified attention for RocmAttentionImpl") + from aiter.ops.triton.unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + else: + logger.info_once( + "Using vllm unified attention for RocmAttentionImpl") + from vllm.attention.ops.triton_unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. 
+ f"heads in the layer. Sinks shape: {sinks.shape}, "
+ f"num_heads: {num_heads}.")
+
+ def forward(
+ self,
+ layer: torch.nn.Module,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ kv_cache: torch.Tensor,
+ attn_metadata: FlashAttentionMetadata,
+ output: Optional[torch.Tensor] = None,
+ output_scale: Optional[torch.Tensor] = None,
+ output_block_scale: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ """Forward pass with PagedAttention and Triton prefix prefill.
+
+ Args:
+ query: shape = [num_tokens, num_heads, head_size]
+ key: shape = [num_tokens, num_kv_heads, head_size]
+ value: shape = [num_tokens, num_kv_heads, head_size]
+ kv_cache: shape =
+ [2, num_blocks, block_size, num_kv_heads, head_size]
+ attn_metadata: Metadata for attention.
+ Returns:
+ shape = [num_tokens, num_heads * head_size]
+ """
+ assert output is not None, "Output tensor must be provided."
+
+ if output_block_scale is not None:
+ raise NotImplementedError(
+ "fused block_scale output quantization is not yet supported"
+ " for RocmAttentionImpl")
+
+ if attn_metadata is None:
+ # Profiling run.
+ return output
+
+ assert attn_metadata.use_cascade is False
+
+ # IMPORTANT!
+ # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
+ # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
+ # in this method. For example, `view` and `slice` (or `[:n]`) operations
+ # are surprisingly slow even in the case they do not invoke any GPU ops.
+ # Minimize the PyTorch ops in this method as much as possible.
+ # Whenever making a change in this method, please benchmark the
+ # performance to make sure it does not introduce any overhead.
+
+ use_prefill_decode_attn = self.force_prefill_decode_attn
+ num_actual_tokens = attn_metadata.num_actual_tokens
+
+ if use_prefill_decode_attn:
+ key_cache, value_cache = PagedAttention.split_kv_cache(
+ kv_cache, self.num_kv_heads, self.head_size)
+ else:
+ key_cache, value_cache = kv_cache.unbind(0)
+
+ if self.kv_sharing_target_layer_name is None:
+ # Reshape the input keys and values and store them in the cache.
+ # Skip this if sharing KV cache with an earlier attention layer.
+ if use_prefill_decode_attn:
+ PagedAttention.write_to_paged_cache(
+ key,
+ value,
+ key_cache,
+ value_cache,
+ attn_metadata.slot_mapping,
+ self.kv_cache_dtype,
+ layer._k_scale,
+ layer._v_scale,
+ )
+ else:
+ ops.reshape_and_cache_flash(
+ key,
+ value,
+ key_cache,
+ value_cache,
+ attn_metadata.slot_mapping,
+ self.kv_cache_dtype,
+ layer._k_scale,
+ layer._v_scale,
+ )
+
+ if self.kv_cache_dtype.startswith("fp8"):
+ key_cache = key_cache.view(self.fp8_dtype)
+ value_cache = value_cache.view(self.fp8_dtype)
+ num_tokens, num_heads, head_size = query.shape
+ assert layer._q_scale_float == 1.0, \
+ "A non 1.0 q_scale is not currently supported."
+ if current_platform.is_cuda():
+ # Skip Q quantization on ROCm and XPU, enable this on cuda
+ # only, since dequantizing back to f32 in the attention kernel
+ # is not supported.
+ query, _ = ops.scaled_fp8_quant(
+ query.reshape(
+ (num_tokens, num_heads * head_size)).contiguous(),
+ layer._q_scale)
+ query = query.reshape((num_tokens, num_heads, head_size))
+
+ cu_seqlens_q = attn_metadata.query_start_loc
+ seqused_k = attn_metadata.seq_lens
+ max_seqlen_q = attn_metadata.max_query_len
+ max_seqlen_k = attn_metadata.max_seq_len
+ block_table = attn_metadata.block_table
+
+ if use_prefill_decode_attn:
+ # Compute attention and update output up to `num_actual_tokens`.
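+ # This branch combines a Triton prefix-prefill kernel with a paged
+ # decode kernel; the else-branch below instead runs one unified
+ # attention kernel over prefill and decode tokens alike. Slicing to
+ # `num_actual_tokens` skips any CUDA-graph padding at the tail.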
+ chunked_prefill_paged_decode(
+ query=query[:num_actual_tokens],
+ key=key[:num_actual_tokens],
+ value=value[:num_actual_tokens],
+ output=output[:num_actual_tokens],
+ kv_cache_dtype=self.kv_cache_dtype,
+ key_cache=key_cache,
+ value_cache=value_cache,
+ block_table=block_table,
+ query_start_loc=cu_seqlens_q,
+ seq_lens=seqused_k,
+ max_seq_len=max_seqlen_k,
+ max_query_len=max_seqlen_q,
+ k_scale=layer._k_scale,
+ v_scale=layer._v_scale,
+ alibi_slopes=self.alibi_slopes,
+ sliding_window=self.sliding_window[0],
+ sm_scale=self.scale,
+ output_scale=output_scale,
+ sinks=self.sinks,
+ )
+
+ else:
+ descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])
+
+ self.unified_attention(
+ q=query[:num_actual_tokens],
+ k=key_cache,
+ v=value_cache,
+ out=output[:num_actual_tokens],
+ cu_seqlens_q=cu_seqlens_q,
+ max_seqlen_q=max_seqlen_q,
+ seqused_k=seqused_k,
+ max_seqlen_k=max_seqlen_k,
+ softmax_scale=self.scale,
+ causal=True,
+ alibi_slopes=self.alibi_slopes,
+ window_size=self.sliding_window,
+ block_table=block_table,
+ softcap=self.logits_soft_cap,
+ q_descale=None, # Not supported
+ k_descale=layer._k_scale.expand(descale_shape),
+ v_descale=layer._v_scale.expand(descale_shape),
+ sinks=self.sinks,
+ output_scale=output_scale)
+
+ return output
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 784912a122f68..722c23f150cd3 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -1,24 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer with PagedAttention and Triton prefix prefill."""
+"""High-Performance Triton-only Attention layer."""
 from dataclasses import dataclass
-from functools import cache
 from typing import ClassVar, Optional

 import torch

-from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 AttentionMetadata, AttentionType)
-from vllm.attention.ops.chunked_prefill_paged_decode import (
- chunked_prefill_paged_decode)
-from vllm.attention.ops.paged_attn import PagedAttention
+from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
 QuantKey, kFp8StaticTensorSym)
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import (AttentionCGSupport,
 AttentionMetadataBuilder,
 CommonAttentionMetadata)
@@ -144,20 +139,15 @@ class TritonAttentionBackend(AttentionBackend):

 @classmethod
 def get_supported_dtypes(cls) -> list[torch.dtype]:
- return [torch.float16, torch.bfloat16]
-
- @classmethod
- def get_supported_head_sizes(cls) -> list[int]:
- return [32, 64, 96, 128, 160, 192, 224, 256]
+ return [torch.float16, torch.bfloat16, torch.float32]

 @classmethod
 def validate_head_size(cls, head_size: int) -> None:
- supported_head_sizes = cls.get_supported_head_sizes()
- if head_size not in supported_head_sizes:
- attn_type = cls.__name__.removesuffix("Backend")
+ # Triton Attention supports any head size of at least 32
+ if head_size < 32:
 raise ValueError(
- f"Head size {head_size} is not supported by {attn_type}. "
- f"Supported head sizes are: {supported_head_sizes}. "
+ f"Head size {head_size} is not supported by TritonAttention. "
+ f"Head sizes must be at least 32 for this backend. 
" "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes.") @@ -182,7 +172,7 @@ class TritonAttentionBackend(AttentionBackend): ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) + return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: @@ -193,15 +183,6 @@ class TritonAttentionBackend(AttentionBackend): return TritonAttentionMetadataBuilder -@cache -def use_aiter_unified_attention() -> bool: - """Check if aiter unified attention should be used.""" - # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set - # to 1 as default - return envs.VLLM_ROCM_USE_AITER \ - and envs.VLLM_USE_AITER_UNIFIED_ATTENTION - - class TritonAttentionImpl(AttentionImpl): def fused_output_quant_supported(self, quant_key: QuantKey): @@ -250,24 +231,6 @@ class TritonAttentionImpl(AttentionImpl): "TritonAttentionImpl") self.fp8_dtype = current_platform.fp8_dtype() - self.force_prefill_decode_attn = \ - envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION - - if not self.force_prefill_decode_attn: - # If not using prefill decode attention, we use the Triton - # unified attention implementation. - if use_aiter_unified_attention(): - logger.info_once( - "Using aiter unified attention for TritonAttentionImpl") - from aiter.ops.triton.unified_attention import ( - unified_attention) - self.unified_attention = unified_attention - else: - logger.info_once( - "Using vllm unified attention for TritonAttentionImpl") - from vllm.attention.ops.triton_unified_attention import ( - unified_attention) - self.unified_attention = unified_attention self.sinks = sinks if sinks is not None: @@ -283,19 +246,19 @@ class TritonAttentionImpl(AttentionImpl): key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, + attn_metadata: TritonAttentionMetadata, output: Optional[torch.Tensor] = None, output_scale: Optional[torch.Tensor] = None, output_block_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: - """Forward pass with FlashAttention. + """Forward pass with Paged Attention impl. in Triton. Args: query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] kv_cache: shape = - [2, num_blocks, block_size, num_kv_heads, head_size] + [num_blocks, 2, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -322,40 +285,22 @@ class TritonAttentionImpl(AttentionImpl): # Whenever making a change in this method, please benchmark the # performance to make sure it does not introduce any overhead. - use_prefill_decode_attn = self.force_prefill_decode_attn num_actual_tokens = attn_metadata.num_actual_tokens - - if use_prefill_decode_attn: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - else: - key_cache, value_cache = kv_cache.unbind(0) + key_cache, value_cache = kv_cache.unbind(1) if self.kv_sharing_target_layer_name is None: # Reshape the input keys and values and store them in the cache. # Skip this if sharing KV cache with an earlier attention layer. 
- if use_prefill_decode_attn: - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - else: - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(self.fp8_dtype) @@ -379,52 +324,28 @@ class TritonAttentionImpl(AttentionImpl): max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table - if use_prefill_decode_attn: - # Compute attention and update output up to `num_actual_tokens`. - chunked_prefill_paged_decode( - query=query[:num_actual_tokens], - key=key[:num_actual_tokens], - value=value[:num_actual_tokens], - output=output[:num_actual_tokens], - kv_cache_dtype=self.kv_cache_dtype, - key_cache=key_cache, - value_cache=value_cache, - block_table=block_table, - query_start_loc=cu_seqlens_q, - seq_lens=seqused_k, - max_seq_len=max_seqlen_k, - max_query_len=max_seqlen_q, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - alibi_slopes=self.alibi_slopes, - sliding_window=self.sliding_window[0], - sm_scale=self.scale, - output_scale=output_scale, - sinks=self.sinks, - ) + descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - else: - descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - - self.unified_attention( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=cu_seqlens_q, - max_seqlen_q=max_seqlen_q, - seqused_k=seqused_k, - max_seqlen_k=max_seqlen_k, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=block_table, - softcap=self.logits_soft_cap, - q_descale=None, # Not supported - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - sinks=self.sinks, - output_scale=output_scale) + unified_attention( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, + output_scale=output_scale, + ) return output From 06a41334c7e5c23b9a053f866c02524945d7d1db Mon Sep 17 00:00:00 2001 From: Bowen Wang <abmfy@icloud.com> Date: Mon, 22 Sep 2025 09:31:05 -0700 Subject: [PATCH 551/584] [EPLB] Reduce EPLB Inference Overhead (#24573) Signed-off-by: Bowen Wang <abmfy@icloud.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> --- .../layers/fused_moe/fused_moe.py | 73 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 69 +++++------------- 2 files changed, 92 insertions(+), 50 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6c2a5bda7cbaa..0e334fdf24045 
100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1017,6 +1017,79 @@ def grouped_topk( return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: Optional[torch.dtype] = None) -> torch.Tensor: + ''' + Map the logical expert ids to physical expert ids + and record the expert load metrics. + + This will select a pseudo-random replica for each logical expert. + Only used for EPLB. + + Args: + topk_ids: The logical expert ids. + expert_load_view: The expert load view. + logical_to_physical_map: The logical to physical map. + logical_replica_count: The logical replica count. + indices_type: The indices type. + + Returns: + The physical expert ids. + ''' + + # 1. Convert the logical expert ids to physical expert ids + # Directly select a random replica for each logical expert + + # In case `indices_type` is not `torch.long` or `torch.int`, + # e.g. `torch.uint32` as required by dispatch/combine kernels + topk_ids_long = topk_ids.long() + # Use (token position) modulo (replica count) + # to deterministically choose a replica + replica_count = logical_replica_count[topk_ids_long] + # Flatten-position based index, reshaped back to `topk_ids` shape + pos_indices = torch.arange(topk_ids.numel(), + device=topk_ids.device, + dtype=torch.long).reshape_as(topk_ids) + # Compute pseudo-random indices by modulo + replica_indices = (pos_indices % replica_count).unsqueeze(-1) + physical_ids = logical_to_physical_map[topk_ids_long].gather( + -1, replica_indices).squeeze(-1) + + topk_ids = physical_ids + + # 2. Record expert load metrics. + + # TODO(bowen): When using `FusedMoEModularKernel`, this + # can be done in a more unified way, since + # `FusedMoEPrepareAndFinalize` will return the expert + # token count, in some cases directly from the kernel. + # However, now there are many code paths not using + # the modular kernel, e.g. calling `fused_experts`, + # so we decide to keep the logic here. + # + # If later refactor moved all the MoE kernel calls + # to the modular kernel, we can move this logic there + # to achieve better efficiency. + + # `expert_load_view`: (num_physical_experts,) + + # `torch.bincount` is not compilable, so use `scatter_add_` instead. 
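+ # Illustration (hypothetical values): topk_ids = [[2, 0], [2, 1]]
+ # flattens to [2, 0, 2, 1], and scatter_add_ with a ones() source
+ # adds [1, 1, 2, 0, ...] to expert_load_view, matching bincount.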
+ topk_ids_flatten = topk_ids.flatten() + expert_load_view.scatter_add_( + dim=0, + index=topk_ids_flatten.long(), + src=torch.ones_like(topk_ids_flatten).to(expert_load_view)) + + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) + return topk_ids + + def fused_grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index da513d75da4da..17ad75584a3f6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -43,7 +43,8 @@ from vllm.v1.worker.ubatching import dbo_current_ubatch_id if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts - from .fused_moe import TritonExperts, fused_experts + from .fused_moe import (TritonExperts, eplb_map_to_physical_and_record, + fused_experts) if has_pplx(): from .pplx_prepare_finalize import (PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes) @@ -55,6 +56,16 @@ else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore + + def eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: Optional[torch.dtype]) -> torch.Tensor: + # CPU fallback: no EPLB so just return as is + return topk_ids + + if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_grouped_topk as grouped_topk) @@ -1616,55 +1627,13 @@ class FusedMoE(CustomOp): assert logical_to_physical_map is not None assert logical_replica_count is not None - # 1. Convert the logical expert ids to physical expert ids - # Directly select a random replica for each logical expert - - # TODO: maybe optimize this by using specified kernels, - # or compute pseudo-random indices by modulo - - # In case `indices_type` is not `torch.long` or `torch.int`, - # e.g. `torch.uint32` as required by dispatch/combine kernels - topk_ids_long = topk_ids.long() - replica_indices = ( - torch.rand_like(topk_ids, dtype=torch.float) * - logical_replica_count[topk_ids_long]).long().unsqueeze(-1) - physical_ids = logical_to_physical_map[topk_ids_long].gather( - -1, replica_indices).squeeze(-1) - - topk_ids = physical_ids - - # 2. Record expert load metrics. - - # TODO(bowen): When using `FusedMoEModularKernel`, this - # can be done in a more unified way, since - # `FusedMoEPrepareAndFinalize` will return the expert - # token count, in some cases directly from the kernel. - # However, now there are many code paths not using - # the modular kernel, e.g. calling `fused_experts`, - # so we decide to keep the logic here. - # - # If later refactor moved all the MoE kernel calls - # to the modular kernel, we can move this logic there - # to achieve better efficiency. 
- - # `expert_load_view`: (num_physical_experts,) - - topk_ids_flatten = topk_ids.flatten() - - # Performance optimization: - # `masked_fill` is significantly faster than `masked_select` - invalid_mask = topk_ids_flatten < 0 - # Replace invalid expert ids with 0 (just a dummy position) - # to avoid out-of-bounds errors in scatter_add_ - index = topk_ids_flatten.masked_fill_(invalid_mask, 0) - # `src` is the valid mask, which is 1 for valid and 0 for invalid - src = ~invalid_mask - - expert_load_view.scatter_add_(dim=0, - index=index.long(), - src=src.to(expert_load_view)) - - topk_ids = topk_ids.to(dtype=indices_type) + topk_ids = eplb_map_to_physical_and_record( + topk_ids=topk_ids, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + indices_type=indices_type, + ) assert topk_ids.dtype == indices_type or indices_type is None From cfbee3d0e72582b0b0d910ecbd9a7c1028a7205c Mon Sep 17 00:00:00 2001 From: Daisy-Ma-coder <daisy.ma.0117@gmail.com> Date: Mon, 22 Sep 2025 10:37:43 -0700 Subject: [PATCH 552/584] [CLI env var] Add VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH in env variables (#25274) Signed-off-by: qqma <qqma@amazon.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: qqma <qqma@amazon.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> --- tests/compile/piecewise/test_full_cudagraph.py | 11 +++++++++-- tests/v1/cudagraph/test_cudagraph_mode.py | 11 +++++++++-- vllm/envs.py | 8 ++++++++ vllm/v1/attention/backends/flash_attn.py | 7 +++---- vllm/v1/attention/backends/mla/flashattn_mla.py | 8 +++----- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 2454f85342eba..780a0d6b5c0e4 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -46,7 +46,10 @@ backend_configs = { # FA3 on Hopper "FA3": BackendConfig(name="FA3", - env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "3", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }, @@ -66,6 +69,7 @@ backend_configs = { BackendConfig(name="FlashAttentionMLA", env_vars={ "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -89,7 +93,10 @@ backend_configs = { # FA2 "FA2": BackendConfig(name="FA2", - env_vars={"VLLM_FLASH_ATTN_VERSION": "2"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "2", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }), diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 25e01806f4956..1ae9185fafbdd 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -47,7 +47,10 @@ backend_configs = { # FA3 on Hopper "FA3": BackendConfig(name="FA3", - env_vars={"VLLM_FLASH_ATTN_VERSION": "3"}, + env_vars={ + "VLLM_FLASH_ATTN_VERSION": "3", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + }, comp_config={ "cudagraph_mode": "FULL", }, @@ -67,6 +70,7 @@ backend_configs = { BackendConfig(name="FlashAttentionMLA", env_vars={ "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -75,7 +79,10 @@ 
backend_configs = {
 # FA2
 "FA2":
 BackendConfig(name="FA2",
- env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
+ env_vars={
+ "VLLM_FLASH_ATTN_VERSION": "2",
+ "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+ },
 comp_config={
 "cudagraph_mode": "FULL_AND_PIECEWISE",
 }),
diff --git a/vllm/envs.py b/vllm/envs.py
index cbd1d5474e60f..e517088c52903 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -119,6 +119,7 @@ if TYPE_CHECKING:
 VLLM_SERVER_DEV_MODE: bool = False
 VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 VLLM_MLA_DISABLE: bool = False
+ VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16
 VLLM_RAY_PER_WORKER_GPUS: float = 1.0
 VLLM_RAY_BUNDLE_INDICES: str = ""
 VLLM_CUDART_SO_PATH: Optional[str] = None
@@ -946,6 +947,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
 "VLLM_MLA_DISABLE":
 lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),

+ # If set, overrides the maximum number of Flash Attention splits used
+ # for CUDA graph decode (both FlashAttention and FlashAttention MLA)
+ "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
+ lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
+ "16")),
+
 # Number of GPUs per worker in Ray, if it is set to be a fraction,
 # it allows ray to schedule multiple actors on a single GPU,
 # so that users can colocate other actors on the same GPUs as vLLM.
@@ -1379,6 +1386,7 @@ def compute_hash() -> str:
 environment_variables_to_hash = [
 "VLLM_PP_LAYER_PARTITION",
 "VLLM_MLA_DISABLE",
+ "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
 "VLLM_USE_TRITON_FLASH_ATTN",
 "VLLM_USE_TRITON_AWQ",
 "VLLM_DP_RANK",
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 20f1904b3be6f..d564cf9988ea6 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -8,6 +8,7 @@ import numpy as np
 import torch

 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 AttentionMetadata, AttentionType,
 is_quantized_kv_cache)
@@ -33,9 +34,6 @@ from vllm.v1.kv_cache_interface import AttentionSpec

 logger = init_logger(__name__)

-# NOTE(woosuk): This is an arbitrary number. Tune it if needed.
-_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16
-

 class FlashAttentionBackend(AttentionBackend):

@@ -215,7 +213,8 @@ class FlashAttentionMetadataBuilder(
 # When using cuda graph, we need to set the upper bound of the
 # number of splits so that large enough intermediate buffers are
 # pre-allocated during capture.
- self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+ self.max_num_splits = (
+ envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)

 # Sliding window size to be used with the AOT scheduler will be
 # populated on first build() call.
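A minimal tuning sketch for the new knob (the value and model below are illustrative, not taken from this patch): the variable must be set before vLLM reads it, and a larger cap pre-allocates bigger intermediate buffers at capture time in exchange for more decode parallelism.

```python
import os

# Hypothetical override: double the split cap before vLLM reads it.
# 16 remains the default, matching the previously hard-coded constant.
os.environ["VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"] = "32"

from vllm import LLM  # imported after the override so vllm.envs picks it up

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # illustrative model
```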
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 472095e13615b..4ad9a13b61d8e 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -6,6 +6,7 @@ from typing import ClassVar, Optional, Union import torch +from vllm import envs from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, is_quantized_kv_cache) from vllm.attention.utils.fa_utils import (flash_attn_supports_mla, @@ -24,10 +25,6 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata logger = init_logger(__name__) -# NOTE(matt): This is an arbitrary number, copied from -# woosuk's implementation in standard FlashAttention backend -_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16 - class FlashAttnMLABackend(MLACommonBackend): @@ -97,7 +94,8 @@ class FlashAttnMLAMetadataBuilder( # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH) # TODO(lucas): Until we add support for the DCP custom masking we need # to restrict decodes to q_len == 1 when DCP is enabled. From 1d7f95b85c9b1bdaa576a5275851995b75a9420d Mon Sep 17 00:00:00 2001 From: ElizaWszola <ewszola@redhat.com> Date: Mon, 22 Sep 2025 19:37:46 +0200 Subject: [PATCH 553/584] [Compiler] Disable Inductor standalone compile by default (#25391) Signed-off-by: ElizaWszola <ewszola@redhat.com> --- vllm/envs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index e517088c52903..8941be125ed08 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -126,6 +126,7 @@ if TYPE_CHECKING: VLLM_DP_RANK: int = 0 VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 + VLLM_USE_STANDALONE_COMPILE: bool = False VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 @@ -437,9 +438,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is - # enabled by default. + # disabled by default. 
"VLLM_USE_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", + lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "0") == "1", # local rank of the process in the distributed setting, used to determine # the GPU device id From 239ef0c1ac0dfe68d8d2e28c54ecf9aa9bcd945b Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Mon, 22 Sep 2025 14:27:51 -0400 Subject: [PATCH 554/584] [CI Failure] Fix fp8 kv cache on <SM90 (#25396) Signed-off-by: mgoin <mgoin64@gmail.com> --- vllm/platforms/cuda.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7baa5a9742f44..b10bc03ee16c6 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -286,6 +286,9 @@ class CudaPlatformBase(Platform): TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 + use_fp8_kv_cache = (kv_cache_dtype is not None + and kv_cache_dtype.startswith("fp8")) + if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") if cls.has_device_capability(100): @@ -334,10 +337,11 @@ class CudaPlatformBase(Platform): # FlashAttention is the default for SM 8.0+ GPUs if cls.has_device_capability(80): - if has_sink and not cls.is_device_capability(90): + if (has_sink or + use_fp8_kv_cache) and not cls.is_device_capability(90): logger.info_once("Using Triton backend on V1 engine.") return TRITON_ATTN_VLLM_V1 - if is_default_backend_supported := is_attn_backend_supported( + elif is_default_backend_supported := is_attn_backend_supported( FLASH_ATTN_V1, head_size, dtype, allow_import_error=False): logger.info_once("Using Flash Attention backend on " From 922979bfcc461cf0069789728f4afdf1284265fb Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:06:05 -0700 Subject: [PATCH 555/584] [DP] support torchrun external launcher with Data Parallelism (#24899) Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> --- .buildkite/test-pipeline.yaml | 12 ++- .../offline_inference/torchrun_dp_example.py | 81 +++++++++++++++++++ .../distributed/test_torchrun_example_moe.py | 81 +++++++++++++++++++ vllm/config/parallel.py | 13 ++- vllm/distributed/parallel_state.py | 4 +- vllm/v1/engine/llm_engine.py | 18 ++++- 6 files changed, 202 insertions(+), 7 deletions(-) create mode 100644 examples/offline_inference/torchrun_dp_example.py create mode 100644 tests/distributed/test_torchrun_example_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fe4796b35786c..c4ea4b675649c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -165,10 +165,18 @@ steps: - tests/v1/test_hybrid_lb_dp.py - tests/v1/engine/test_engine_core_client.py commands: - # test with tp=2 and external_dp=2 + # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with tp=2 and pp=2 + # test with torchrun tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 
distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py new file mode 100644 index 0000000000000..8e888a100254e --- /dev/null +++ b/examples/offline_inference/torchrun_dp_example.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +experimental support for data-parallel inference with torchrun +Note the data load balancing and distribution is done out of the vllm engine, +no internal lb supported in external_launcher mode. +""" + +from vllm import LLM, SamplingParams + +# Create prompts, the same across all ranks +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] * 50 + +# Create sampling parameters, the same across all ranks +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Use `distributed_executor_backend="external_launcher"` so that +# this llm engine/instance only creates one worker. +# it is important to set an explicit seed to make sure that +# all ranks have the same random seed, so that sampling can be +# deterministic across ranks. +llm = LLM( + model="microsoft/Phi-mini-MoE-instruct", + tensor_parallel_size=1, + data_parallel_size=2, + pipeline_parallel_size=1, + enable_expert_parallel=False, + distributed_executor_backend="external_launcher", + max_model_len=4096, + gpu_memory_utilization=0.6, + seed=1, +) + +dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank +dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size + +prompts = [ + f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank +] + +outputs = llm.generate(prompts, sampling_params) + + +# all ranks will have the same outputs +print("-" * 50) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") + print("-" * 50) +""" +Further tips: + +1. to communicate control messages across all ranks, use the cpu group, +a PyTorch ProcessGroup with GLOO backend. + +```python +from vllm.distributed.parallel_state import get_world_group +cpu_group = get_world_group().cpu_group +torch_rank = dist.get_rank(group=cpu_group) +if torch_rank == 0: + # do something for rank 0, e.g. saving the results to disk. +``` + +2. to communicate data across all ranks, use the model's device group, +a PyTorch ProcessGroup with NCCL backend. +```python +from vllm.distributed.parallel_state import get_world_group +device_group = get_world_group().device_group +``` + +3. 
to access the model directly in every rank, use the following code: +```python +llm.llm_engine.model_executor.driver_worker.worker.model_runner.model +``` +""" diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py new file mode 100644 index 0000000000000..2d6b930fcc07e --- /dev/null +++ b/tests/distributed/test_torchrun_example_moe.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# unit test for `examples/offline_inference/torchrun_example.py` +import os +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_tp_group, get_world_group + +dist.init_process_group(backend="gloo") + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] * 10 +dp_size = int(os.getenv("DP_SIZE", "1")) +dp_rank = int(os.getenv("DP_RANK", "0")) + +if dp_size > 1: + # distribute the prompts across the data parallel ranks + prompts = [ + prompt for idx, prompt in enumerate(prompts) + if idx % dp_size == dp_rank + ] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="microsoft/Phi-mini-MoE-instruct", + tensor_parallel_size=int(os.getenv("TP_SIZE", "1")), + pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")), + enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4), + seed=0) + +outputs = llm.generate(prompts, sampling_params) + +group = get_world_group() if dp_size == 1 else get_tp_group() +cpu_group = group.cpu_group +group_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if group_rank == 0: + dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, + src=group.ranks[0], + group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# make sure we can access the model parameters from the calling process +# of the `LLM` instance. +params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner. 
+ model.parameters()) +test_consistent_across_ranks(len(params)) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {group_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 37a41bf6de71a..a84d882430166 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib +import os from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional, Union @@ -351,6 +352,10 @@ class ParallelConfig: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size + if self.distributed_executor_backend == "external_launcher": + logger.info("Using external launcher for distributed inference.") + self.world_size *= self.data_parallel_size + if self.data_parallel_size_local > self.data_parallel_size: raise ValueError( f"data_parallel_size_local ({self.data_parallel_size_local}) " @@ -358,6 +363,13 @@ class ParallelConfig: if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: # Data parallel was specified in the engine args. + if self.distributed_executor_backend == "external_launcher": + # For external launcher, + # we need to set the data parallel rank automatically + self.data_parallel_rank = int(os.environ["RANK"]) \ + // (self.world_size // self.data_parallel_size) + logger.info("Set data_parallel_rank to %d automatically.", + self.data_parallel_rank) if not self._data_parallel_master_port_list: self._data_parallel_master_port_list = get_open_ports_list(5) self.data_parallel_master_port = \ @@ -380,7 +392,6 @@ class ParallelConfig: "be set when data_parallel_size > 1") if self.distributed_executor_backend == "external_launcher": - import os os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" logger.info("Disabling V1 multiprocessing for external launcher.") diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 12571afaa4c13..895971893a661 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1032,7 +1032,9 @@ def init_distributed_environment(world_size: int = -1, distributed_init_method, backend) from vllm.config import get_current_vllm_config config = get_current_vllm_config() - if config is not None and config.parallel_config.data_parallel_size > 1: + if config is not None and config.parallel_config.data_parallel_size > 1 \ + and config.parallel_config.distributed_executor_backend \ + != "external_launcher": parallel_config = config.parallel_config # adjust to take into account data parallelism # offset the rank by the data parallel rank diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 907656d1b24cb..92c861d9e91fe 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -11,6 +11,7 @@ from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group +from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType from vllm.logger import init_logger @@ -77,10 +78,15 @@ class LLMEngine: if self.log_stats: self.stat_logger = PrometheusStatLogger(vllm_config) + 
executor_backend = ( + self.vllm_config.parallel_config.distributed_executor_backend) + parallel_config = vllm_config.parallel_config + self.external_launcher_dp = (parallel_config.data_parallel_size > 1 and + executor_backend == "external_launcher") # important: init dp group before init the engine_core # In the decoupled engine case this is handled in EngineCoreProc. - parallel_config = vllm_config.parallel_config - if not multiprocess_mode and parallel_config.data_parallel_size > 1: + if not multiprocess_mode and parallel_config.data_parallel_size > 1 \ + and not self.external_launcher_dp: self.dp_group = parallel_config.stateless_init_dp_group() else: self.dp_group = None @@ -120,6 +126,11 @@ class LLMEngine: # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore + if self.external_launcher_dp: + # If we use DP in external launcher mode, we reuse the + # existing DP group used for data communication. + self.dp_group = get_dp_group().cpu_group + # Don't keep the dummy data in memory self.reset_mm_cache() @@ -331,5 +342,6 @@ class LLMEngine: return self.collective_rpc("apply_model", args=(func, )) def __del__(self): - if dp_group := getattr(self, "dp_group", None): + if dp_group := getattr(self, "dp_group", + None) and not self.external_launcher_dp: stateless_destroy_torch_distributed_process_group(dp_group) From 8d0ee5a56452bfb671ae00e96d10841a8054b823 Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Mon, 22 Sep 2025 12:16:59 -0700 Subject: [PATCH 556/584] [misc] Remove RFC review hours reference (#25416) --- .github/ISSUE_TEMPLATE/750-RFC.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index 7ee57c42895ca..c0e009855964a 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -43,10 +43,6 @@ body: Any other things you would like to mention. validations: required: false -- type: markdown - attributes: - value: > - Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit). 
- type: checkboxes id: askllm attributes: From d5e0fca264f7d277ed372f7c075827ed9a0c5e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= <ProExpertProg@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:30:05 -0400 Subject: [PATCH 557/584] [torch.compile] Cleanup compilation tests and custom passes, add debug utils, fix DCE bug (#23091), fix test (#24376), and prep for custom op matching (#24604) (#24542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: luka <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- tests/compile/backend.py | 28 +- tests/compile/test_async_tp.py | 2 + tests/compile/test_config.py | 10 +- tests/compile/test_functionalization.py | 10 +- tests/compile/test_fusion.py | 17 +- tests/compile/test_fusion_all_reduce.py | 6 +- tests/compile/test_fusion_attn.py | 20 +- tests/compile/test_sequence_parallelism.py | 21 +- tests/compile/test_silu_mul_quant_fusion.py | 13 +- vllm/compilation/activation_quant_fusion.py | 18 +- vllm/compilation/collective_fusion.py | 29 +-- vllm/compilation/fix_functionalization.py | 9 +- vllm/compilation/fusion.py | 273 ++------------------ vllm/compilation/fusion_attn.py | 22 +- vllm/compilation/multi_output_match.py | 109 -------- vllm/compilation/noop_elimination.py | 5 +- vllm/compilation/pass_manager.py | 49 +++- vllm/compilation/post_cleanup.py | 20 ++ vllm/compilation/sequence_parallelism.py | 18 +- vllm/compilation/vllm_inductor_pass.py | 110 +++++++- vllm/config/__init__.py | 7 +- vllm/config/compilation.py | 48 +++- vllm/envs.py | 6 + vllm/utils/__init__.py | 15 +- 24 files changed, 404 insertions(+), 461 deletions(-) delete mode 100644 vllm/compilation/multi_output_match.py create mode 100644 vllm/compilation/post_cleanup.py diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 2c4287950dcfe..f25c367433f41 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import weakref from collections.abc import Sequence from copy import deepcopy from typing import Callable, Union @@ -10,7 +11,26 @@ from torch._ops import OpOverload from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.inductor_pass import InductorPass -from vllm.config import get_current_vllm_config +from vllm.compilation.pass_manager import with_pattern_match_debug +from vllm.compilation.vllm_inductor_pass import VllmInductorPass +from vllm.config import VllmConfig, get_current_vllm_config + + +class LazyInitPass(InductorPass): + """ + If there's a pass that we want to initialize lazily in a test, + we can wrap it in LazyInitPass, which will initialize the pass when invoked + and then immediately invoke it. 
+ """ + + def __init__(self, pass_cls: type[VllmInductorPass], + vllm_config: VllmConfig): + self.pass_cls = pass_cls + self.vllm_config = weakref.proxy(vllm_config) # avoid cycle + + def __call__(self, graph: fx.Graph) -> None: + self.pass_ = self.pass_cls(self.vllm_config) + self.pass_(graph) class TestBackend: @@ -40,10 +60,16 @@ class TestBackend: example_inputs, config_patches=self.inductor_config) + @with_pattern_match_debug def post_pass(self, graph: fx.Graph): self.graph_pre_pass = deepcopy(graph) + + VllmInductorPass.dump_prefix = 0 for pass_ in self.custom_passes: pass_(graph) + VllmInductorPass.dump_prefix += 1 + + VllmInductorPass.dump_prefix = None self.graph_post_pass = deepcopy(graph) # assign by reference, will reflect the final state of the graph diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 9a51e6b3514f4..1dc21365d5577 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -294,6 +294,8 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states) + assert async_tp_pass.matched_count == 1 + # In pre-nodes, all gather or reduce scatter should exist, # fused_matmul_reduce_scatter or fused_all_gather_matmul should not backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 90e8e0ff95858..7afd6251bbbd5 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest import vllm from vllm.compilation.counter import compilation_counter -from vllm.config import VllmConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.utils import _is_torch_equal_or_newer @@ -26,6 +26,14 @@ def test_use_cudagraphs_dynamic(monkeypatch): assert not vllm_config.compilation_config.use_cudagraph +def test_custom_op(): + # proper syntax + _ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"]) + + with pytest.raises(ValueError, match="Invalid syntax '"): + _ = CompilationConfig(custom_ops=["quant_fp8"]) + + # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked # NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 0c7e6fbccf20c..2ee9aa7476beb 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -8,9 +8,10 @@ import vllm.envs as envs from vllm import LLM, SamplingParams from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import FUSED_OPS, FusionPass +from vllm.compilation.fusion import FUSED_OPS, RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym) @@ -58,11 +59,12 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, vllm_config.compilation_config = CompilationConfig( pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True)) noop_pass = 
NoOpEliminationPass(vllm_config) - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config) - passes = [noop_pass, fusion_pass, act_quant_fusion_pass - ] if do_fusion else [noop_pass] + passes = [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass + ] if do_fusion else [noop_pass, cleanup_pass] func_pass = FixFunctionalizationPass(vllm_config) backend_func = TestBackend(*passes, func_pass) backend_no_func = TestBackend(*passes) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index eedb9bdcd5299..3d8897d3f18b8 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -4,11 +4,11 @@ import pytest import torch -import vllm.envs as envs import vllm.plugins from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, - FusionPass) + RMSNormQuantFusionPass) from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, VllmConfig) from vllm.model_executor.layers.layernorm import RMSNorm @@ -79,15 +79,15 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) -@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) +@pytest.mark.parametrize("hidden_size", [64]) +@pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("static", [True, False]) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. 
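[Editor's sketch] The same three-pass arrangement now recurs throughout these tests: noop elimination first, the fusion pass under test, then PostCleanupPass, followed by a `matched_count` assertion. A condensed template of that recipe; `TestModel`, the shapes, and the expected count of 2 come from the test_fusion.py diff, and the dtype/device are illustrative:

```python
import torch

from tests.compile.backend import TestBackend
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import (CompilationConfig, PassConfig, VllmConfig,
                         set_current_vllm_config)

vllm_config = VllmConfig(compilation_config=CompilationConfig(
    pass_config=PassConfig(enable_fusion=True, enable_noop=True)))

with set_current_vllm_config(vllm_config):
    noop_pass = NoOpEliminationPass(vllm_config)
    fusion_pass = RMSNormQuantFusionPass(vllm_config)
    # cleanup runs last: it topologically sorts the graph and removes
    # dead nodes the pattern matcher leaves behind
    cleanup_pass = PostCleanupPass(vllm_config)

    backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)

    model = TestModel(64, 1e-5, True, False)  # test-local module
    compiled = torch.compile(model, backend=backend)
    compiled(torch.rand(257, 64, device="cuda", dtype=torch.float16))

    # the test model contains two fusable rms_norm+quant sites
    assert fusion_pass.matched_count == 2
```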
@pytest.mark.parametrize("cuda_force_torch", [True, False] if cutlass_fp8_supported() else [True]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], +@pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, cuda_force_torch): @@ -104,9 +104,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work noop_pass = NoOpEliminationPass(vllm_config) - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - backend = TestBackend(noop_pass, fusion_pass) + backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) model = TestModel(hidden_size, eps, static, cuda_force_torch) # First dimension dynamic @@ -128,6 +129,8 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) + assert fusion_pass.matched_count == 2 + # In pre-nodes, fp8 quant should be there and fused kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index dd31e0db1f59f..60f32c863208d 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AllReduceFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig, ModelConfig, PassConfig, VllmConfig) from vllm.distributed import tensor_model_parallel_all_reduce @@ -215,8 +216,10 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, all_reduce_fusion_pass = AllReduceFusionPass(vllm_config) noop_pass = NoOpEliminationPass(vllm_config) func_pass = FixFunctionalizationPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass) + backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass, + cleanup_pass) token_num = batch_size * seq_len model = test_model_cls(hidden_size, token_num) @@ -227,6 +230,7 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states, residual) + assert all_reduce_fusion_pass.matched_count == 1 backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) backend.check_after_ops(model.ops_in_model_after()) del all_reduce_fusion_pass diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index c3f1c7481d1b3..c4cac95531926 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -6,18 +6,19 @@ from typing import Optional import pytest import torch._dynamo -from tests.compile.backend import TestBackend +from tests.compile.backend import LazyInitPass, TestBackend from tests.models.utils import check_outputs_equal from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata) from vllm import LLM, SamplingParams from vllm._custom_ops import cutlass_scaled_fp4_mm, 
scaled_fp4_quant -from vllm.attention import Attention +from vllm.attention import Attention, AttentionMetadata from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, ModelConfig, PassConfig, SchedulerConfig, VllmConfig, set_current_vllm_config) @@ -104,7 +105,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str, # AttnFusionPass needs attention layers to be registered in config upon init # so we initialize it during compilation. - attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw) + attn_pass = LazyInitPass(AttnFusionPass, vllm_config) backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass) llm2 = LLM(model, enforce_eager=True, @@ -197,7 +198,8 @@ class AttentionQuantPatternModel(torch.nn.Module): device=self.device, ) - def build_attn_metadata(self, batch_size: int, use_hnd: bool): + def build_attn_metadata(self, batch_size: int, use_hnd: bool) \ + -> AttentionMetadata: """Initialize attention metadata.""" # Create common attn metadata @@ -447,9 +449,10 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # Create test backend with fusion passes enabled noop_pass = NoOpEliminationPass(vllm_config) - attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw - ) - test_backend = TestBackend(noop_pass, attn_pass) + attn_pass = LazyInitPass(AttnFusionPass, vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) + + test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass) # Compile model with fusion enabled model_compiled = torch.compile(model_fused, @@ -485,6 +488,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True) + # access the underlying `AttnFusionPass` on the `LazyInitPass` + assert attn_pass.pass_.matched_count == sum(attn_fusion_supported) + # Check attention ops in the graph before and after fusion attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass)) attn_nodes_post = list(find_op_nodes(ATTN_OP, diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index fb9f9dde22799..b2734e915bbbf 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -6,10 +6,12 @@ import torch import vllm.envs as envs from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import FusionPass +from vllm.compilation.fusion import RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.sequence_parallelism import SequenceParallelismPass +from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, PassConfig, VllmConfig) from vllm.distributed import tensor_model_parallel_all_reduce @@ -104,7 +106,7 @@ class TestQuantModel(torch.nn.Module): # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) - 
self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False) + self.fp8_linear = Fp8LinearOp(act_quant_static=True) self.scale = torch.rand(1, dtype=torch.float32) # Create a weight that is compatible with torch._scaled_mm, @@ -137,8 +139,7 @@ class TestQuantModel(torch.nn.Module): # layer normalization norm_output, residual_output = self.norm(all_reduce, residual) - # for static input quantization - # self.fp8_linear is initialized with use_per_token_if_dynamic=False + # scaled_mm with static input quantization fp8_linear_result = self.fp8_linear.apply(norm_output, self.w, self.wscale, @@ -253,16 +254,20 @@ def sequence_parallelism_pass_on_test_model( dtype=dtype, seed=42) - sequence_parallelism_pass = SequenceParallelismPass(vllm_config) noop_pass = NoOpEliminationPass(vllm_config) + sequence_parallelism_pass = SequenceParallelismPass(vllm_config) func_pass = FixFunctionalizationPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - passes_for_backend = [noop_pass, sequence_parallelism_pass] + passes_for_backend: list[VllmInductorPass] = \ + [noop_pass, sequence_parallelism_pass] if enable_fusion: - fusion_pass = FusionPass.instance(vllm_config) + fusion_pass = RMSNormQuantFusionPass(vllm_config) passes_for_backend.append(fusion_pass) + passes_for_backend.append(cleanup_pass) + backend_no_func = TestBackend(*passes_for_backend) backend_func = TestBackend(*passes_for_backend, func_pass) @@ -279,6 +284,8 @@ def sequence_parallelism_pass_on_test_model( compiled_model_func = torch.compile(model, backend=backend_func) compiled_model_func(hidden_states, residual) + assert sequence_parallelism_pass.matched_count == 1 + # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not backend_no_func.check_before_ops(model.ops_in_model_before()) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index ae190d25cad62..c445f4dde2cc4 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -15,6 +15,7 @@ from vllm.compilation.activation_quant_fusion import ( # yapf: enable from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -69,6 +70,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs): super().__init__() + from vllm.compilation.activation_quant_fusion import ( + silu_and_mul_nvfp4_quant_supported) + assert silu_and_mul_nvfp4_quant_supported + self.silu_and_mul = SiluAndMul() # create nvfp4 weight @@ -127,7 +132,11 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class, pass_config=PassConfig(enable_fusion=True, enable_noop=True)) fusion_pass = ActivationQuantFusionPass(config) - backend = TestBackend(NoOpEliminationPass(config), fusion_pass) + passes = [ + NoOpEliminationPass(config), fusion_pass, + PostCleanupPass(config) + ] + backend = TestBackend(*passes) model = model_class(hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x) @@ -151,6 +160,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class, atol=atol, rtol=rtol) + assert fusion_pass.matched_count == 1 + # In pre-nodes, quant op should be present and fused 
kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index f2fbb1200eecc..74462fb37ca97 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -17,7 +17,7 @@ from vllm.platforms import current_platform from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -152,7 +152,7 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern): register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) -class ActivationQuantFusionPass(VllmInductorPass): +class ActivationQuantFusionPass(VllmPatternMatcherPass): """ This pass fuses a pre-defined set of custom ops into fused ops. It uses the torch pattern matcher to find the patterns and replace them. @@ -176,16 +176,12 @@ class ActivationQuantFusionPass(VllmInductorPass): pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() pattern_silu_mul_nvfp4.register(self.patterns) + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): - self.begin() - self.dump_graph(graph, "before_act_quant_fusion") - - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns in ActivationQuantFusionPass", - count) - - self.dump_graph(graph, "after_act_quant_fusion") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) def uuid(self): return VllmInductorPass.hash_source(self, ActivationQuantPattern, diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 0658b59a2e215..331cd8a873929 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -20,7 +20,7 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass FP8_DTYPE = current_platform.fp8_dtype() @@ -348,7 +348,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern): pm.fwd_only, pm_pass) -class AsyncTPPass(VllmInductorPass): +class AsyncTPPass(VllmPatternMatcherPass): @enable_fake_mode def __init__(self, config: VllmConfig): @@ -378,18 +378,17 @@ class AsyncTPPass(VllmInductorPass): AllGatherCutlassScaledMMPattern( self.model_dtype, self.device).register(self.patterns) + self.dump_patterns(config, self.patterns) + def is_applicable_for_shape(self, shape: Optional[int]) -> bool: # only do replace for specific shapes tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_async_tp_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns with async TP pass.", count) - self.dump_graph(graph, "after_async_tp_pass") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) if flashinfer_comm is not None: @@ -1068,7 +1067,7 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern): pm.fwd_only, pm_pass) -class 
AllReduceFusionPass(VllmInductorPass): +class AllReduceFusionPass(VllmPatternMatcherPass): def __init__(self, config: VllmConfig): super().__init__(config) @@ -1124,6 +1123,7 @@ class AllReduceFusionPass(VllmInductorPass): fuse_rms_quant=config.compilation_config.pass_config.enable_fusion) self.register_patterns() + self.dump_patterns(config, self.patterns) @enable_fake_mode def register_patterns(self): @@ -1172,15 +1172,14 @@ class AllReduceFusionPass(VllmInductorPass): self.disabled = False + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): if self.disabled: + logger.debug("AllReduceFusionPass disabled") return - self.begin() - self.dump_graph(graph, "before_all_reduce_fusion_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) - self.dump_graph(graph, "after_all_reduce_fusion_pass") - self.end_and_log() + + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) def __del__(self): if getattr(self, "disabled", True): diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 6bc721eec3d45..54403c1f7ca3d 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -26,6 +26,7 @@ class FixFunctionalizationPass(VllmInductorPass): To add new nodes to defunctionalize, add to the if-elif chain in __call__. """ + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): # XPU does not support auto-functionalization yet. # Will enable this when switch to vllm-xpu-kernels. @@ -34,9 +35,6 @@ class FixFunctionalizationPass(VllmInductorPass): "pass currently.") return - self.begin() - self.dump_graph(graph, "before_fix_functionalization") - self.nodes_to_remove: list[torch.fx.Node] = [] count = 0 for node in graph.nodes: @@ -111,7 +109,7 @@ class FixFunctionalizationPass(VllmInductorPass): count += 1 - self.dump_graph(graph, "before_fix_functionalization_cleanup") + self.dump_graph(graph, "before_cleanup") # Remove the nodes all at once count_removed = len(self.nodes_to_remove) @@ -120,8 +118,7 @@ class FixFunctionalizationPass(VllmInductorPass): logger.debug("De-functionalized %s nodes, removed %s nodes", count, count_removed) - self.dump_graph(graph, "after_fix_functionalization") - self.end_and_log() + self.nodes_to_remove.clear() def _remove(self, node_or_nodes: Union[torch.fx.Node, Iterable[torch.fx.Node]]): diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index afa739c966a5b..3034b6eaeaca1 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable, NamedTuple, Optional +from typing import Any, NamedTuple import torch import torch._inductor.pattern_matcher as pm @@ -16,10 +16,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale) from vllm.platforms import current_platform -from .fx_utils import find_getitem_maybe from .inductor_pass import enable_fake_mode -from .multi_output_match import MultiOutputMatch -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) FP8_DTYPE = current_platform.fp8_dtype() @@ -50,8 +48,7 @@ QUANT_OPS: dict[QuantKey, OpOverload] = { torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 } 
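[Editor's sketch] With the MultiOutputMatch machinery deleted below, every fusion now goes through Inductor's stock pattern matcher via the `pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)` calls visible in these registrations. A toy end-to-end sketch of that API; the relu example is invented purely for illustration and is unrelated to the actual vLLM patterns:

```python
import torch
import torch._inductor.pattern_matcher as pm
from torch._inductor.pattern_matcher import PatternMatcherPass
from torch.fx.experimental.proxy_tensor import make_fx

pm_pass = PatternMatcherPass(pass_name="toy_fusion_pass")


def pattern(x: torch.Tensor):
    return torch.relu(x) + torch.relu(x)


def replacement(x: torch.Tensor):
    return torch.relu(x) * 2


# fwd_only traces both functions with the example inputs and registers
# the rewrite on pm_pass
pm.register_replacement(pattern, replacement, [torch.empty(8)],
                        pm.fwd_only, pm_pass)

# trace a graph containing the pattern, then apply the pass to it
graph_module = make_fx(lambda x: torch.relu(x) + torch.relu(x))(
    torch.empty(8))
matched = pm_pass.apply(graph_module.graph)
print(matched)  # expected: 1 replaced site
```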
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): - QUANT_OPS[ - kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default # noqa: E501 + QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default class FusedRMSQuantKey(NamedTuple): @@ -80,68 +77,6 @@ FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { } -class QuantMultiOutputMatch(MultiOutputMatch): - - def __init__(self, match: pm.Match, quant_op, fused_op): - super().__init__(match) - assert isinstance(quant_op, OpOverload) - assert isinstance(fused_op, OpOverload) - self.QUANT_OP = quant_op # in-place quant op - self.FUSED_OP = fused_op # in-place fused quant op - - def insert_fused_node(self, fused_return_mapping: dict[int, tuple[fx.Node, - int]], - **kwargs): - """ - This utility function inserts an auto-functionalized node for FUSED_OP. - It also correctly sets its meta value and rebinds the users of the - unfused nodes to use the fused node instead. - - :param fused_return_mapping: A dictionary, mapping from getitem indices - of the fused node result to a tuple of the old node and a getitem index. - :param kwargs: kwargs that get directly forwarded to the auto_fn node - - Example: - If we want to replace this graph: - _, x1, x2 = auto_fn(op1) - _, y1, y2 = auto_fn(op2) - - with - _, x1, y2, x2 = auto_fn(FUSED_OP) - - we would call: - insert_fused_node({1: (op1_node, 1), 2: (op2_node, 2), 3: (op1_node, 2)} - - Note that the 0th element is None for auto-functionalized in-place ops. - Hence, others appear 1-indexed. - """ - fused_node = self.insert_auto_fn(self.FUSED_OP, kwargs) - indices = fused_return_mapping.keys() - getitem_nodes = self.insert_getitems(fused_node, indices) - - # Prepare the meta value, use a list so it's mutable - meta_val = [None] * (max(indices) + 1) - - # Iterate through elements of the tuple produced by fused_node - for idx, getitem_node in zip(indices, getitem_nodes): - old_node, old_idx = fused_return_mapping[idx] - - # If the old value was never used, the old_getitem might not exist - old_getitem = find_getitem_maybe(old_node, old_idx) - if old_getitem is not None: - # Rebind the users of match getitem nodes to use the new nodes. - # The old nodes will be removed by DCE at the end of the pass. 
- old_getitem.replace_all_uses_with(getitem_node) - getitem_node.meta["val"] = old_getitem.meta["val"] - - # Extract the appropriate meta value - # It is present even if the getitem node does not exist - meta_val[idx] = old_node.meta["val"][old_idx] - - # Fix the meta value on the new fused node - fused_node.meta["val"] = tuple(meta_val) - - class RMSNormQuantPattern: def __init__(self, epsilon: float, key: FusedRMSQuantKey): @@ -224,8 +159,7 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, @@ -271,36 +205,7 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_ADD_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 2 - assert len(quant_node.users) == 1 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract the result and residual. - # The auto_fn node returns a tuple of (None, result, residual). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa - # result_node_new = at[1] - # residual_node_new = at[2] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - - # 0 is always None - fused_return_mapping = {1: (quant_node, 1), 2: (rms_node, 2)} - self.insert_fused_node(fused_return_mapping, - **kwargs, - epsilon=rms_node.kwargs["epsilon"]) + ) class RMSNormDynamicQuantPattern(RMSNormQuantPattern): @@ -317,8 +222,7 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, result_rms: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, @@ -366,39 +270,7 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 1 - assert len(quant_node.users) == 2 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract the result and scale. - # The auto_fn node returns a tuple of (None, result, scale). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) 
# noqa - # result_node_new = at[1] - # scale_node_new = at[2] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - del kwargs["result_rms"] # not used in the fused op - - fused_return_mapping = {1: (quant_node, 1), 2: (quant_node, 2)} - self.insert_fused_node( - fused_return_mapping, - epsilon=rms_node.kwargs["epsilon"], - scale_ub=None, # not used but required - residual=None, # not used but required - **kwargs) + ) class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): @@ -415,8 +287,7 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): symmetric=symmetric)) super().__init__(epsilon, key) - def register(self, pm_pass: PatternMatcherPass, - record_match: Callable[[MultiOutputMatch], bool]): + def register(self, pm_pass: PatternMatcherPass): def pattern(result: torch.Tensor, input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, @@ -464,137 +335,49 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): inputs, pm.fwd_only, pm_pass, - extra_check=lambda m: record_match( - self.Match(m, self.QUANT_OP, self.FUSED_OP))) - - class Match(QuantMultiOutputMatch): - - def process(self): - # Find the nodes in the match that we need to rebind - rms_node = self.find_auto_fn(RMS_ADD_OP) - quant_node = self.find_auto_fn(self.QUANT_OP) - - assert len(rms_node.users) == 2 - assert len(quant_node.users) == 2 - - # First, insert a new auto_functionalized node for the fused op, - # as well as getitem nodes to extract result, scale, and residual. - # The auto_fn node returns a tuple (None, result, scale, residual). - # - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) # noqa - # result_node_new = at[1] - # scale_node_new = at[2] - # residual_node_new = at[3] - with self.inserting_after_match(): - # Missing epsilon, scalars cannot be inputs to the pattern - kwargs = self.match.kwargs.copy() - - fused_return_mapping = { - 1: (quant_node, 1), # result - 2: (quant_node, 2), # scale - 3: (rms_node, 2), # residual - } - self.insert_fused_node( - fused_return_mapping, - epsilon=rms_node.kwargs["epsilon"], - scale_ub=None, # not used but required - **kwargs) + ) -class FusionPass(VllmInductorPass): +class RMSNormQuantFusionPass(VllmPatternMatcherPass): """ - This pass fuses a pre-defined set of custom ops into fused ops. - It uses the torch pattern matcher to find the patterns and replace them. - It also manually processes multi-output matches, as those are broken in - the torch pattern matcher. - - Because patterns can only be registered once, the pass is a singleton. - This will be addressed in a future version of PyTorch: - https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. + It also supports fused_add_rms_norm. """ - _instance: 'Optional[FusionPass]' = None - - @classmethod - def instance(cls, config: VllmConfig): - """ - Get the singleton instance of the FusionPass. - If the instance exists, the config is updated but - initialization is not repeated. 
- """ - if cls._instance is None: - cls._instance = FusionPass(config) - else: - cls._instance.pass_config = config.compilation_config.pass_config - return cls._instance - @enable_fake_mode def __init__(self, config: VllmConfig): - assert self.__class__._instance is None, \ - "FusionPass singleton instance already exists" super().__init__(config) - self.matches: list[MultiOutputMatch] = [] self.patterns: PatternMatcherPass = PatternMatcherPass( - pass_name="fusion_pass") + pass_name="rmsnorm_quant_fusion_pass") for epsilon in [1e-5, 1e-6]: # Fuse rms_norm + static fp8 quant RMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(self.patterns) - # Matches for patterns below have 2 or more outputs, - # so we need to process them manually (see process_matches) - - # Fuse rms_norm + static fp8 quant + # Fuse fused_add_rms_norm + static fp8 quant FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + self.patterns) # Fuse rms_norm + dynamic per-token fp8 quant - RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + RMSNormDynamicQuantPattern(epsilon, + FP8_DTYPE).register(self.patterns) # Fuse fused_add_rms_norm + dynamic per-token fp8 quant FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( - self.patterns, self.record_match) + self.patterns) - # WARNING: This is a hack to clear the pattern matcher cache - # and allow multiple values of epsilon. - torch._inductor.pattern_matcher._seen_patterns.clear() - - def record_match(self, match: MultiOutputMatch) -> bool: - # Hijack the extra_check to record the match and - # save it for post-processing. - self.matches.append(match) - - # Return False to prevent automatic replacement. - return False - - def process_matches(self, graph: fx.Graph): - """ - Manually process multi-output matches and replace them with fused nodes. - See MultiOutputMatch for more details. 
- """ - for match in self.matches: - match.process() - - # Finally, remove matched nodes - graph.eliminate_dead_code() - assert all(node not in graph.nodes for match in self.matches - for node in match.match.nodes) + self.dump_patterns(config, self.patterns) + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_fusion") + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) - self.dump_graph(graph, "after_pattern_match") - - # Manually process multi-output matches (and run DCE) - self.process_matches(graph) - logger.debug("Post-processed %s matches", len(self.matches)) - self.dump_graph(graph, "after_fusion") - self.matches.clear() - self.end_and_log() + def uuid(self) -> Any: + return self.hash_source(self, RMSNormQuantPattern, + RMSNormStaticQuantPattern, + RMSNormDynamicQuantPattern, + FusedAddRMSNormStaticQuantPattern, + FusedAddRMSNormDynamicQuantPattern) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index e3677b3dd62d8..2c6cf8f12fdc1 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -18,7 +18,7 @@ from vllm.utils import round_up from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -245,7 +245,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): pm_pass) -class AttnFusionPass(VllmInductorPass): +class AttnFusionPass(VllmPatternMatcherPass): """ This pass fuses post-attention quantization onto attention if supported. @@ -282,20 +282,12 @@ class AttnFusionPass(VllmInductorPass): "were found in CompilationConfig.static_forward_context " "so no fusion patterns were registered.") + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.graph.Graph) -> None: - self.begin() - self.dump_graph(graph, "before_attn_fusion") - - count = self.patterns.apply(graph) - - # TODO: Move this to pass_manager.py after the fx graph broken issue - # has been resolved. 
- # see https://github.com/vllm-project/vllm/issues/23091 - graph.eliminate_dead_code() - - logger.debug("Fused quantization onto %s attention nodes", count) - self.dump_graph(graph, "after_attn_fusion") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Fused quant onto %s attention nodes", self.matched_count) def uuid(self): return VllmInductorPass.hash_source(self, AttentionQuantPattern, diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py deleted file mode 100644 index 6d1893777cec6..0000000000000 --- a/vllm/compilation/multi_output_match.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import abc -import operator -from abc import abstractmethod -from collections.abc import Iterable - -from torch import fx -from torch._higher_order_ops.auto_functionalize import auto_functionalized -from torch._inductor import pattern_matcher as pm -from torch._ops import OpOverload -from torch.fx import Node - -from vllm.compilation.fx_utils import find_auto_fn - - -class MultiOutputMatch(abc.ABC): - """ - This class provides utilities to process multi-output matches and - manually insert replacements. - - This is necessary because the automatic replacement for multi-output - matches is broken: https://github.com/pytorch/pytorch/issues/137280 - """ - - def __init__(self, match: pm.Match): - self.match = match - - @abstractmethod - def process(self): - """ - Process a multi-output match and manually insert the replacement. - - This method should: - 1. Insert the replacement nodes after the last node in the match. - 2. Rebind the users of nodes in the match to use the new nodes. - 3. Set meta["val"] for de-functionalization. - - The result of an auto-functionalized node is a tuple of tensors. - The first element is the return value of the function, usually None. - The remaining elements are the mutated args of the function. - - All auto-functionalized nodes must contain a proper meta["val"], - as it is used by de-functionalization. meta["val"] has to contain the - value of the node (tuple of tensors) that would be returned by the - functionalized node during tracing. - - Existing nodes in the graph all have this property set, but we have - to set it manually for new nodes we insert. - - Example: - # op schema: foo(a: Tensor!, b: Tensor, c: Tensor!) -> None - at = auto_functionalized(torch.ops._C.foo.default, a, b, c) - # at.meta["val"] = (None, a, c) - """ - raise NotImplementedError - - @property - def nodes(self) -> list[fx.Node]: - return self.match.nodes - - @property - def graph(self) -> fx.Graph: - return self.match.graph - - def find_auto_fn(self, op) -> fx.Node: - """ - Find the first auto_functionalized node with the given op in the match. - """ - return find_auto_fn(self.nodes, op) - - def inserting_after_match(self): - """ - Insert nodes after the last node in the match. - This is done to avoid use-before-definition errors after inserting - replacement nodes. - """ - - # match.nodes is not guaranteed to be sorted. - # Find the last node in the match. 
- for last_node_in_match in reversed(self.graph.nodes): - if last_node_in_match in self.match.nodes: - break - else: - raise ValueError("No nodes in graph") - - return self.graph.inserting_after(last_node_in_match) - - def insert_getitems(self, tuple_node: fx.Node, - indices: Iterable[int]) -> tuple[fx.Node, ...]: - """ - Insert operator.getitem nodes to extract elements from a tuple node. - - :param tuple_node: The tuple node to extract elements from. - :param indices: The indices of the elements to extract. - :return: Tuple of the new getitem nodes, corresponding to the indices. - """ - with self.graph.inserting_after(tuple_node): - return tuple( - self.graph.call_function(operator.getitem, (tuple_node, idx)) - for idx in indices) - - def insert_auto_fn(self, op: OpOverload, kwargs) -> Node: - """ - Insert an auto_functionalized node with the given op and kwargs. - """ - return self.graph.call_function(auto_functionalized, (op, ), - kwargs=kwargs) diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 17e85e70218da..2c453daf873d2 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -64,9 +64,8 @@ class NoOpEliminationPass(VllmInductorPass): out: "f16[s0, 4096]" = at[1] """ + @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): - self.begin() - self.dump_graph(graph, "before_noop_elimination") count = 0 # Remove no-op reshapes/views: for node in graph.nodes: @@ -121,8 +120,6 @@ class NoOpEliminationPass(VllmInductorPass): count += 1 logger.debug("Removed %s no-op reshapes and slices", count) - self.dump_graph(graph, "after_noop_elimination") - self.end_and_log() # ---------------------- Reshape helpers ---------------------- def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node], diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 1b1cbe4fa12c2..e323fa1f77349 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,15 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools from torch import fx as fx +from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import set_env_var + +from .post_cleanup import PostCleanupPass +from .vllm_inductor_pass import VllmInductorPass if current_platform.is_cuda_alike(): from .activation_quant_fusion import ActivationQuantFusionPass - from .fusion import FusionPass + from .fusion import RMSNormQuantFusionPass from .fusion_attn import AttnFusionPass if current_platform.is_cuda(): @@ -19,11 +25,28 @@ from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass from .sequence_parallelism import SequenceParallelismPass -from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) +def with_pattern_match_debug(fn): + """ + Function decorator that turns on inductor pattern match debug + for the duration of the call. + Used to avoid logging builtin Inductor pattern matching. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + if (debug_val := envs.VLLM_PATTERN_MATCH_DEBUG) is not None: + # optionally check rank here + with set_env_var("TORCHINDUCTOR_PATTERN_MATCH_DEBUG", debug_val): + return fn(*args, **kwargs) + return fn(*args, **kwargs) + + return wrapper + + class PostGradPassManager(CustomGraphPass): """ The pass manager for post-grad passes. @@ -40,16 +63,26 @@ class PostGradPassManager(CustomGraphPass): """ def __init__(self): - self.passes: list[VllmInductorPass] = [] + self.passes: list[InductorPass] = [] + @with_pattern_match_debug def __call__(self, graph: fx.Graph): + VllmInductorPass.dump_prefix = 0 # reset dump index + shape = get_pass_context().runtime_shape for pass_ in self.passes: if pass_.is_applicable_for_shape(shape): pass_(graph) + VllmInductorPass.dump_prefix += 1 + + # post-cleanup goes before fix_functionalization + # because it requires a functional graph + self.post_cleanup(graph) + VllmInductorPass.dump_prefix += 1 # always run fix_functionalization last self.fix_functionalization(graph) + VllmInductorPass.dump_prefix = None # Cleanup index def configure(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config @@ -61,14 +94,18 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.enable_async_tp: self.passes += [AsyncTPPass(config)] + if self.pass_config.enable_fi_allreduce_fusion: + self.passes += [AllReduceFusionPass(config)] + if self.pass_config.enable_fusion: - self.passes += [FusionPass.instance(config)] + self.passes += [RMSNormQuantFusionPass(config)] self.passes += [ActivationQuantFusionPass(config)] if self.pass_config.enable_attn_fusion: self.passes += [AttnFusionPass(config)] - if self.pass_config.enable_fi_allreduce_fusion: - self.passes += [AllReduceFusionPass(config)] + + # needs a functional graph + self.post_cleanup = PostCleanupPass(config) self.fix_functionalization = FixFunctionalizationPass(config) def add(self, pass_: InductorPass): diff --git a/vllm/compilation/post_cleanup.py b/vllm/compilation/post_cleanup.py new file mode 100644 index 0000000000000..6a31f3935da7c --- /dev/null +++ b/vllm/compilation/post_cleanup.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from torch import fx + +from vllm.compilation.vllm_inductor_pass import VllmInductorPass + + +class PostCleanupPass(VllmInductorPass): + """ + This pass performs cleanup after custom passes. + It topologically sorts the graph and removes unused nodes. + This is needed because the pattern matcher does not guarantee producing + a topologically sorted graph, and there may be unused nodes left around. 
+ """ + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph) -> None: + from torch._inductor.pattern_matcher import stable_topological_sort + stable_topological_sort(graph) + graph.eliminate_dead_code() diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 1758ed4c86d27..a6ca50c925a2a 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from .inductor_pass import enable_fake_mode -from .vllm_inductor_pass import VllmInductorPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) @@ -417,7 +417,7 @@ class LastAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): pm.fwd_only, pm_pass) -class SequenceParallelismPass(VllmInductorPass): +class SequenceParallelismPass(VllmPatternMatcherPass): """ This pass enables sequence parallelism for models. It identifies patterns where an AllReduce operation is followed by @@ -466,19 +466,13 @@ class SequenceParallelismPass(VllmInductorPass): LastAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) - - # WARNING: This is a hack to clear the pattern matcher cache - # and allow multiple values of epsilon. - torch._inductor.pattern_matcher._seen_patterns.clear() + self.dump_patterns(config, self.patterns) def is_applicable_for_shape(self, shape: Optional[int]) -> bool: tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): - self.begin() - self.dump_graph(graph, "before_sequence_parallelism_pass") - count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns with sequence parallelism", count) - self.dump_graph(graph, "after_sequence_parallelism_pass") - self.end_and_log() + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index b822b05b0f1ec..837770d181993 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import functools +import operator import time +from pathlib import Path +from typing import ClassVar, Optional +import regex as re import torch from torch._dynamo.utils import lazy_format_graph_code +from torch._inductor.pattern_matcher import (PatternMatcherPass, + PatternPrettyPrinter) from vllm.config import VllmConfig from vllm.logger import init_logger @@ -19,6 +25,8 @@ class VllmInductorPass(InductorPass): An inductor pass with access to vLLM PassConfig. It provides timing, logging, and dumping utilities. 
""" + dump_prefix: ClassVar[Optional[int]] = None + """Keep track of pass index for debug dump ordering.""" def __init__(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config @@ -28,8 +36,24 @@ class VllmInductorPass(InductorPass): else None self.pass_name = self.__class__.__name__ + @staticmethod + def time_and_log(call_fn): + + @functools.wraps(call_fn) + def wrapped(self: VllmInductorPass, graph: torch.fx.Graph): + self.begin() + self.dump_graph(graph, "before") + call_fn(self, graph) + self.dump_graph(graph, "after") + self.end_and_log() + + return wrapped + def dump_graph(self, graph: torch.fx.Graph, stage: str): - lazy_format_graph_code(stage, graph.owning_module) + i = VllmInductorPass.dump_prefix + i_str = "" if i is None else f".{i}" + lazy_format_graph_code(f"post_grad{i_str}.{self.pass_name}.{stage}", + graph.owning_module) def begin(self): self._start_time = time.perf_counter_ns() @@ -40,6 +64,88 @@ class VllmInductorPass(InductorPass): logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms) +class VllmPatternMatcherPass(VllmInductorPass): + """ + A VllmInductorPass that uses the Inductor pattern matcher. + Its main use is providing the dump_patterns utility that dumps the + Inductor pattern matcher patterns into a file, which greatly aids debugging. + + TODO(luka) move more utilities to this pass. + """ + matched_count: int = 0 + """The number of matched patterns in the pass.""" + + _OP_OVERLOAD_PATTERN: ClassVar[re.Pattern] = re.compile( + r"<OpOverload\(op='([^']*)', overload='([^']*)'\)>") + + def _replace_op_overloads(self, string: str) -> str: + """Replace <OpOverload(..., ...)> with nicer formulations""" + return self._OP_OVERLOAD_PATTERN.sub( + lambda m: f"torch.ops.{m.group(1)}.{m.group(2)}", + string, + ) + + def dump_patterns(self, config: VllmConfig, pm_pass: PatternMatcherPass): + """ + If debug dumping is enabled, dump the Inductor pattern-matcher patterns + into the debug_dump_path folder next to the dumped fx graphs. + + This method does its best to print something that looks like Python code + for easier debugging and potentially navigation. If any errors appear in + the output, please add to this method. + + TODO(luka): use pattern object to manually produce pattern graph + """ + debug_dump_path = config.compilation_config.debug_dump_path + if not debug_dump_path: + return + + rank = config.parallel_config.rank + debug_dump_path = Path(debug_dump_path) / f"rank_{rank}" + debug_dump_path.mkdir(parents=True, exist_ok=True) + + from vllm.utils import unique_filepath + file_path = unique_filepath( + lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py") + + with file_path.open("w") as f: + print( + f'# This file was produced by VllmPatternMatcherPass.' 
+ f'dump_patterns for {self.pass_name}.\n' + f'# It does its best to produce valid-Python-looking code but' + f' please add to dump_patterns if there are any errors.\n\n' + f'from torch._higher_order_ops.auto_functionalize import ' + f'auto_functionalized as auto_functionalized\n' + f'from torch._inductor.pattern_matcher import *', + file=f) + + for node, patterns in pm_pass.patterns.items(): + # fix the operator.getitem repr + if node[1] == operator.getitem: + node_repr = f"({repr(node[0])}, operator.getitem)" + else: + node_repr = repr(node) + + node_repr = self._replace_op_overloads(node_repr) + + print(f"\n\n# Patterns for op: {node_repr}", file=f) + for i, pattern in enumerate(patterns): + # reserve auto_functionalized ahead of time + pp = PatternPrettyPrinter() + pp.namespace.create_name("auto_functionalized", None) + + # Assemble pattern + out_node = pp.pretty_print(pattern.pattern) + pattern_repr = "\n".join([f"def pattern_{i}():"] + [ + f"{pp.memoized_objs_names[key]} = " + f"{pp.memoized_objs_pp[key]}" + for key in pp.memoized_objs_names + ] + [f"return {out_node}"]).replace("\n", "\n ") + + pattern_repr = self._replace_op_overloads(pattern_repr) + print(f"{pattern_repr}\n", file=f) + + class PrinterInductorPass(VllmInductorPass): def __init__(self, name: str, config: VllmConfig): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 9bb8087a511d5..92fc68f8927ca 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -905,10 +905,9 @@ def set_current_vllm_config(vllm_config: VllmConfig, except Exception: raise else: - logger.debug("enabled custom ops: %s", - vllm_config.compilation_config.enabled_custom_ops) - logger.debug("disabled custom ops: %s", - vllm_config.compilation_config.disabled_custom_ops) + if check_compile: + vllm_config.compilation_config.custom_op_log_check() + if check_compile and \ vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 22b38daf46c39..34fa7fcfe7e87 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -487,6 +487,12 @@ class CompilationConfig: "supported with torch>=2.9.0.dev. Set " "use_inductor_graph_partition=False instead.") + for op in self.custom_ops: + if op[0] not in {'+', '-'} and op not in {'all', 'none'}: + raise ValueError(f"Invalid syntax '{op}' for custom op, " + "must be 'all', 'none', '+op' or '-op' " + "(where 'op' is the registered op name)") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -532,8 +538,8 @@ class CompilationConfig: for x in self.compile_sizes: if isinstance(x, str): assert x == "cudagraph_capture_sizes", \ - "Unrecognized size type in compile_sizes, " \ - f"expect 'cudagraph_capture_sizes', got {x}" + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" computed_compile_sizes.extend(self.cudagraph_capture_sizes) else: assert isinstance(x, int) @@ -628,3 +634,41 @@ class CompilationConfig: return use_fx_graph_piecewise_compilation or \ use_inductor_piecewise_compilation + + def custom_op_log_check(self): + """ + This method logs the enabled/disabled custom ops and checks that the + passed custom_ops field only contains relevant ops. + It is called at the end of set_current_vllm_config, + after the custom ops have been instantiated. 
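+        Entries that match no op in the model only produce a warning,
+        since they have no effect.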
+ """ + + if len(self.enabled_custom_ops) + len(self.disabled_custom_ops) == 0: + logger.debug("No custom ops found in model.") + return + + logger.debug("enabled custom ops: %s", self.enabled_custom_ops) + logger.debug("disabled custom ops: %s", self.disabled_custom_ops) + + all_ops_in_model = (self.enabled_custom_ops | self.disabled_custom_ops) + for op in self.custom_ops: + if op in {"all", "none"}: + continue + + assert op[0] in {'+', '-'}, "Invalid custom op syntax " \ + "(should be checked during init)" + + # check if op name exists in model + op_name = op[1:] + if op_name not in all_ops_in_model: + from vllm.model_executor.custom_op import CustomOp + + # Does op exist at all or is it just not present in this model? + # Note: Only imported op classes appear in the registry. + missing_str = "doesn't exist (or wasn't imported/registered)" \ + if op_name not in CustomOp.op_registry \ + else "not present in model" + + enable_str = "enabling" if op[0] == '+' else "disabling" + logger.warning_once("Op '%s' %s, %s with '%s' has no effect", + op_name, missing_str, enable_str, op) diff --git a/vllm/envs.py b/vllm/envs.py index 8941be125ed08..eaee2f6cc7716 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -190,6 +190,7 @@ if TYPE_CHECKING: VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None def get_default_cache_root(): @@ -442,6 +443,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_STANDALONE_COMPILE": lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "0") == "1", + # Debug pattern matching inside custom passes. + # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3'). + "VLLM_PATTERN_MATCH_DEBUG": + lambda: os.environ.get("VLLM_PATTERN_MATCH_DEBUG", None), + # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 022e35a399c53..3399d00fbabbd 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3392,7 +3392,7 @@ def length_from_prompt_token_ids_or_embeds( prompt_token_ids: Optional[list[int]], prompt_embeds: Optional[torch.Tensor], ) -> int: - """Calculate the request length (in number of tokens) give either + """Calculate the request length (in number of tokens) give either prompt_token_ids or prompt_embeds. 
""" prompt_token_len = None if prompt_token_ids is None else len( @@ -3413,3 +3413,16 @@ def length_from_prompt_token_ids_or_embeds( f" prompt_token_ids={prompt_token_len}" f" prompt_embeds={prompt_embeds_len}") return prompt_token_len + + +@contextlib.contextmanager +def set_env_var(key, value): + old = os.environ.get(key) + os.environ[key] = value + try: + yield + finally: + if old is None: + del os.environ[key] + else: + os.environ[key] = old From 8db29392899b12843787b0a03880f70a6e6ac88e Mon Sep 17 00:00:00 2001 From: Or Ozeri <oro@il.ibm.com> Date: Mon, 22 Sep 2025 22:30:36 +0300 Subject: [PATCH 558/584] [KV offload][5/N] Add `CPUOffloadingSpec` (#24251) Signed-off-by: Or Ozeri <oro@il.ibm.com> --- docs/features/disagg_prefill.md | 6 ++ .../{test_cpu.py => test_cpu_manager.py} | 0 tests/v1/kv_offload/test_cpu_offloading.py | 62 +++++++++++++++ vllm/v1/kv_offload/cpu.py | 75 +++++++++++++++++++ vllm/v1/kv_offload/factory.py | 3 + 5 files changed, 146 insertions(+) rename tests/v1/kv_offload/{test_cpu.py => test_cpu_manager.py} (100%) create mode 100644 tests/v1/kv_offload/test_cpu_offloading.py create mode 100644 vllm/v1/kv_offload/cpu.py diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 996ef00a6b960..69f70b8ff5ac3 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -31,6 +31,12 @@ Now supports 5 types of connectors: --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' ``` +- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker): + + ```bash + --kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}' + ``` + ## Benchmarks Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks. diff --git a/tests/v1/kv_offload/test_cpu.py b/tests/v1/kv_offload/test_cpu_manager.py similarity index 100% rename from tests/v1/kv_offload/test_cpu.py rename to tests/v1/kv_offload/test_cpu_manager.py diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py new file mode 100644 index 0000000000000..fc8ca09bea3de --- /dev/null +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time + +import pytest + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +CPU_BLOCK_SIZES = [16, 48] + + +@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) +def test_cpu_offloading(cpu_block_size: int) -> None: + """ + Tests OffloadingConnector with CPUOffloadingSpec. 
+ """ + + # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default) + kv_transfer_config = KVTransferConfig( + kv_connector="OffloadingConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "num_cpu_blocks": 100, + "block_size": cpu_block_size + }, + ) + + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_transfer_config=kv_transfer_config, + ) + + prompts = ["Hi " * 100] + sampling_params = SamplingParams(temperature=0, max_tokens=20) + + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + + # run generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + + # reset prefix cache to avoid GPU hit. + llm.reset_prefix_cache() + + # sleep for a sec to make sure CPU finished storing + time.sleep(1) + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + + print("Generation times:") + print(f" Cold: {cold_time * 1000:.2f}ms") + print(f" GPU hit: {gpu_hit_time * 1000:.2f}ms") + print(f" CPU hit: {cpu_hit_time * 1000:.2f}ms") diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py new file mode 100644 index 0000000000000..b85d375fe63e2 --- /dev/null +++ b/vllm/v1/kv_offload/cpu.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from typing import Optional + +import torch + +from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.platforms import current_platform +from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager +from vllm.v1.kv_offload.backends.cpu import CPUBackend +from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec +from vllm.v1.kv_offload.spec import OffloadingSpec +from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler +from vllm.v1.kv_offload.worker.worker import OffloadingHandler + + +class CPUOffloadingSpec(OffloadingSpec): + + def __init__(self, vllm_config: VllmConfig): + super().__init__(vllm_config) + + num_cpu_blocks = self.extra_config.get("num_cpu_blocks") + if not num_cpu_blocks: + raise Exception("num_cpu_blocks must be specified " + "in kv_connector_extra_config") + self.num_cpu_blocks: int = num_cpu_blocks + + # scheduler-side + self._manager: Optional[OffloadingManager] = None + + # worker-side + self._handler: Optional[OffloadingHandler] = None + + def get_manager(self) -> OffloadingManager: + if not self._manager: + kv_events_config = self.vllm_config.kv_events_config + enable_events = (kv_events_config is not None + and kv_events_config.enable_kv_cache_events) + self._manager = LRUOffloadingManager(CPUBackend( + block_size=self.offloaded_block_size, + num_blocks=self.num_cpu_blocks), + enable_events=enable_events) + return self._manager + + def get_handlers( + self, kv_caches: dict[str, torch.Tensor] + ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], + OffloadingHandler]]: + if not self._handler: + if not current_platform.is_cuda(): + raise Exception("CPU Offloading is 
currently only supported" + " on CUDA GPUs") + + layer_names = list(kv_caches.keys()) + layers = get_layers_from_vllm_config(self.vllm_config, + AttentionLayerBase, + layer_names) + attn_backends = { + layer_name: layers[layer_name].get_attn_backend() + for layer_name in layer_names + } + + self._handler = CpuGpuOffloadingHandler( + attn_backends=attn_backends, + gpu_block_size=self.gpu_block_size, + cpu_block_size=self.offloaded_block_size, + num_cpu_blocks=self.num_cpu_blocks, + gpu_caches=kv_caches) + + assert self._handler is not None + yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler + yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py index 6365ab4a6db75..f9bef6cea9038 100644 --- a/vllm/v1/kv_offload/factory.py +++ b/vllm/v1/kv_offload/factory.py @@ -51,3 +51,6 @@ class OffloadingSpecFactory: # Register various specs here. +OffloadingSpecFactory.register_spec("CPUOffloadingSpec", + "vllm.v1.kv_offload.cpu", + "CPUOffloadingSpec") From f552d5e578077574276aa9d83139b91e1d5ae163 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 23 Sep 2025 04:18:24 +0800 Subject: [PATCH 559/584] [CI/Build] Skip Qwen3-VL initialization tests until models are actually released (#25394) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- tests/models/registry.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 29b6980aaa42a..6047a7a3e98dc 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -562,10 +562,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct", # noqa: E501 max_model_len=4096, - min_transformers_version="4.57"), # noqa: E501 + min_transformers_version="4.57", + is_available_online=False), "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", # noqa: E501 - max_model_len=4096, - min_transformers_version="4.57"), + max_model_len=4096, + min_transformers_version="4.57", + is_available_online=False), "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", From 8bed1791091510012c33f563ee8ff02e13bf6297 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:14:44 -0700 Subject: [PATCH 560/584] [TPU] update torch_xla dependency for PyPI compatibility (#25278) Signed-off-by: Johnny Yang <johnnyyang@google.com> Co-authored-by: Chengji Yao <chengjiyao@google.com> --- requirements/tpu.txt | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 7ea239b48ea26..4241cbb2b0333 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -14,14 +14,4 @@ nixl==0.3.0 tpu_info==0.4.0 # Install torch_xla ---pre ---extra-index-url https://download.pytorch.org/whl/nightly/cpu ---find-links https://storage.googleapis.com/libtpu-wheels/index.html ---find-links https://storage.googleapis.com/libtpu-releases/index.html ---find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html ---find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.9.0.dev20250730 -torchvision==0.24.0.dev20250730 -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" - +torch_xla[tpu, pallas]==2.8.0 \ No newline at end of file From 45d7d852d362b055b7c02eb967c4ea00d6f9e130 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Mon, 22 Sep 2025 19:38:19 -0400 Subject: [PATCH 561/584] [Frontend] Responses API MCP tools for built in tools and to pass through headers (#24628) Signed-off-by: Alec Solder <alecs@fb.com> Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com> Co-authored-by: Alec Solder <alecs@fb.com> Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com> --- .../openai/test_response_api_mcp_tools.py | 106 +++++++++ .../openai/test_response_api_with_harmony.py | 9 +- tests/test_envs.py | 216 ++++++++++++++++++ vllm/entrypoints/context.py | 37 ++- vllm/entrypoints/harmony_utils.py | 4 +- vllm/entrypoints/openai/serving_responses.py | 25 +- vllm/entrypoints/tool_server.py | 29 ++- vllm/envs.py | 66 +++++- 8 files changed, 463 insertions(+), 29 deletions(-) create mode 100644 tests/entrypoints/openai/test_response_api_mcp_tools.py create mode 100644 tests/test_envs.py diff --git a/tests/entrypoints/openai/test_response_api_mcp_tools.py b/tests/entrypoints/openai/test_response_api_mcp_tools.py new file mode 100644 index 0000000000000..b0eb84712c199 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_mcp_tools.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import pytest_asyncio +from openai import OpenAI + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "openai/gpt-oss-20b" + + +@pytest.fixture(scope="module") +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module") +def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch): + args = ["--enforce-eager", "--tool-server", "demo"] + + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="function") +def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch): + args = ["--enforce-eager", "--tool-server", "demo"] + + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + m.setenv("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", + "code_interpreter,container") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def mcp_disabled_client(mcp_disabled_server): + async with mcp_disabled_server.get_async_client() as async_client: + yield async_client + + +@pytest_asyncio.fixture +async def mcp_enabled_client(mcp_enabled_server): + async with mcp_enabled_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") +async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, + model_name: str): 
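+    # "code_interpreter" is allowlisted via GPT_OSS_SYSTEM_TOOL_MCP_LABELS
+    # in the fixture, so this MCP tool maps onto the built-in python tool
+    # and its output tokens should show up in the usage stats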
+ response = await mcp_enabled_client.responses.create( + model=model_name, + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. The python interpreter is not stateful " + "and you must print to see the output."), + tools=[{ + "type": "mcp", + "server_label": "code_interpreter", + # URL unused for DemoToolServer + "server_url": "http://localhost:8888" + }], + ) + assert response is not None + assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.") +async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, + model_name: str): + response = await mcp_disabled_client.responses.create( + model=model_name, + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. The python interpreter is not stateful " + "and you must print to see the output."), + tools=[{ + "type": "mcp", + "server_label": "code_interpreter", + # URL unused for DemoToolServer + "server_url": "http://localhost:8888" + }], + ) + assert response is not None + assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens == 0 diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index f3c3148577b85..23d8373d97809 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -454,7 +454,13 @@ async def test_web_search(client: OpenAI, model_name: str): async def test_code_interpreter(client: OpenAI, model_name: str): response = await client.responses.create( model=model_name, - input="Multiply 64548*15151 using builtin python interpreter.", + # TODO: Ideally should be able to set max tool calls + # to prevent multi-turn, but it is not currently supported + # would speed up the test + input=("What's the first 4 digits after the decimal point of " + "cube root of `19910212 * 20250910`? " + "Show only the digits. 
The python interpreter is not stateful " + "and you must print to see the output."), tools=[{ "type": "code_interpreter", "container": { @@ -464,6 +470,7 @@ async def test_code_interpreter(client: OpenAI, model_name: str): ) assert response is not None assert response.status == "completed" + assert response.usage.output_tokens_details.tool_output_tokens > 0 def get_weather(latitude, longitude): diff --git a/tests/test_envs.py b/tests/test_envs.py new file mode 100644 index 0000000000000..f81a6e2e415cd --- /dev/null +++ b/tests/test_envs.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from unittest.mock import patch + +import pytest + +from vllm.envs import env_list_with_choices, env_with_choices + + +class TestEnvWithChoices: + """Test cases for env_with_choices function.""" + + def test_default_value_returned_when_env_not_set(self): + """Test default is returned when env var is not set.""" + env_func = env_with_choices("NONEXISTENT_ENV", "default", + ["option1", "option2"]) + assert env_func() == "default" + + def test_none_default_returned_when_env_not_set(self): + """Test that None is returned when env not set and default is None.""" + env_func = env_with_choices("NONEXISTENT_ENV", None, + ["option1", "option2"]) + assert env_func() is None + + def test_valid_value_returned_case_sensitive(self): + """Test that valid value is returned in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + assert env_func() == "option1" + + def test_valid_lowercase_value_returned_case_insensitive(self): + """Test that lowercase value is accepted in case insensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["OPTION1", "OPTION2"], + case_sensitive=False) + assert env_func() == "option1" + + def test_valid_uppercase_value_returned_case_insensitive(self): + """Test that uppercase value is accepted in case insensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=False) + assert env_func() == "OPTION1" + + def test_invalid_value_raises_error_case_sensitive(self): + """Test that invalid value raises ValueError in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + def test_case_mismatch_raises_error_case_sensitive(self): + """Test that case mismatch raises ValueError in case sensitive mode.""" + with patch.dict(os.environ, {"TEST_ENV": "OPTION1"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'OPTION1' for TEST_ENV"): + env_func() + + def test_invalid_value_raises_error_case_insensitive(self): + """Test that invalid value raises ValueError when case insensitive.""" + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", + "default", ["option1", "option2"], + case_sensitive=False) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + def 
test_callable_choices_resolved_correctly(self): + """Test that callable choices are resolved correctly.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1"}): + env_func = env_with_choices("TEST_ENV", "default", get_choices) + assert env_func() == "dynamic1" + + def test_callable_choices_with_invalid_value(self): + """Test that callable choices raise error for invalid values.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "invalid"}): + env_func = env_with_choices("TEST_ENV", "default", get_choices) + with pytest.raises(ValueError, + match="Invalid value 'invalid' for TEST_ENV"): + env_func() + + +class TestEnvListWithChoices: + """Test cases for env_list_with_choices function.""" + + def test_default_list_returned_when_env_not_set(self): + """Test that default list is returned when env var is not set.""" + env_func = env_list_with_choices("NONEXISTENT_ENV", + ["default1", "default2"], + ["option1", "option2"]) + assert env_func() == ["default1", "default2"] + + def test_empty_default_list_returned_when_env_not_set(self): + """Test that empty default list is returned when env not set.""" + env_func = env_list_with_choices("NONEXISTENT_ENV", [], + ["option1", "option2"]) + assert env_func() == [] + + def test_single_valid_value_parsed_correctly(self): + """Test that single valid value is parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1"] + + def test_multiple_valid_values_parsed_correctly(self): + """Test that multiple valid values are parsed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_values_with_whitespace_trimmed(self): + """Test that values with whitespace are trimmed correctly.""" + with patch.dict(os.environ, {"TEST_ENV": " option1 , option2 "}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_empty_values_filtered_out(self): + """Test that empty values are filtered out.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,,option2,"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option2"] + + def test_empty_string_returns_default(self): + """Test that empty string returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ""}): + env_func = env_list_with_choices("TEST_ENV", ["default"], + ["option1", "option2"]) + assert env_func() == ["default"] + + def test_only_commas_returns_default(self): + """Test that string with only commas returns default.""" + with patch.dict(os.environ, {"TEST_ENV": ",,,"}): + env_func = env_list_with_choices("TEST_ENV", ["default"], + ["option1", "option2"]) + assert env_func() == ["default"] + + def test_case_sensitive_validation(self): + """Test case sensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,OPTION2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"], + case_sensitive=True) + with pytest.raises(ValueError, + match="Invalid value 'OPTION2' in TEST_ENV"): + env_func() + + def test_case_insensitive_validation(self): + """Test case insensitive validation.""" + with patch.dict(os.environ, {"TEST_ENV": 
"OPTION1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"], + case_sensitive=False) + assert env_func() == ["OPTION1", "option2"] + + def test_invalid_value_in_list_raises_error(self): + """Test that invalid value in list raises ValueError.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,invalid,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + with pytest.raises(ValueError, + match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_callable_choices_resolved_correctly(self): + """Test that callable choices are resolved correctly.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,dynamic2"}): + env_func = env_list_with_choices("TEST_ENV", [], get_choices) + assert env_func() == ["dynamic1", "dynamic2"] + + def test_callable_choices_with_invalid_value(self): + """Test that callable choices raise error for invalid values.""" + + def get_choices(): + return ["dynamic1", "dynamic2"] + + with patch.dict(os.environ, {"TEST_ENV": "dynamic1,invalid"}): + env_func = env_list_with_choices("TEST_ENV", [], get_choices) + with pytest.raises(ValueError, + match="Invalid value 'invalid' in TEST_ENV"): + env_func() + + def test_duplicate_values_preserved(self): + """Test that duplicate values in the list are preserved.""" + with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}): + env_func = env_list_with_choices("TEST_ENV", [], + ["option1", "option2"]) + assert env_func() == ["option1", "option1", "option2"] diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 8619452f2445f..ea81fdbcd825e 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -8,6 +8,7 @@ from abc import ABC, abstractmethod from contextlib import AsyncExitStack from typing import TYPE_CHECKING, Optional, Union +from openai.types.responses.tool import Mcp from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( @@ -21,6 +22,24 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) +# This is currently needed as the tool type doesn't 1:1 match the +# tool namespace, which is what is used to look up the +# connection to the tool server +_TOOL_NAME_TO_TYPE_MAP = { + "browser": "web_search_preview", + "python": "code_interpreter", + "container": "container", +} + + +def _map_tool_name_to_tool_type(tool_name: str) -> str: + if tool_name not in _TOOL_NAME_TO_TYPE_MAP: + available_tools = ', '.join(_TOOL_NAME_TO_TYPE_MAP.keys()) + raise ValueError( + f"Built-in tool name '{tool_name}' not defined in mapping. 
" + f"Available tools: {available_tools}") + return _TOOL_NAME_TO_TYPE_MAP[tool_name] + class TurnTokens: """Tracks token counts for a single conversation turn.""" @@ -59,8 +78,8 @@ class ConversationContext(ABC): @abstractmethod async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]) -> None: pass @abstractmethod @@ -96,8 +115,8 @@ class SimpleContext(ConversationContext): raise NotImplementedError("Should not be called.") async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]) -> None: pass async def cleanup_session(self) -> None: @@ -318,13 +337,17 @@ class HarmonyContext(ConversationContext): ] async def init_tool_sessions(self, tool_server: Optional[ToolServer], - exit_stack: AsyncExitStack, - request_id: str) -> None: + exit_stack: AsyncExitStack, request_id: str, + mcp_tools: dict[str, Mcp]): if tool_server: for tool_name in self.available_tools: if tool_name not in self._tool_sessions: + tool_type = _map_tool_name_to_tool_type(tool_name) + headers = mcp_tools[ + tool_type].headers if tool_type in mcp_tools else None tool_session = await exit_stack.enter_async_context( - tool_server.new_session(tool_name, request_id)) + tool_server.new_session(tool_name, request_id, + headers)) self._tool_sessions[tool_name] = tool_session exit_stack.push_async_exit(self.cleanup_session) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 1364a41be950d..57e4bb1e1da52 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -126,8 +126,10 @@ def get_developer_message( function_tools: list[Union[Tool, ChatCompletionToolsParam]] = [] for tool in tools: if tool.type in ("web_search_preview", "code_interpreter", - "container"): + "container", "mcp"): # These are built-in tools that are added to the system message. + # Adding in MCP for now until we support MCP tools executed + # server side pass elif tool.type == "function": diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 6e243671af242..99bb464db1d13 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -460,8 +460,12 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: try: + mcp_tools = { + tool.server_label: tool + for tool in request.tools if tool.type == "mcp" + } await context.init_tool_sessions(self.tool_server, exit_stack, - request.request_id) + request.request_id, mcp_tools) async for _ in result_generator: pass except asyncio.CancelledError: @@ -748,11 +752,16 @@ class OpenAIServingResponses(OpenAIServing): # New conversation. 
reasoning_effort = (request.reasoning.effort if request.reasoning else None) - # Temporary: OpenAI types doesn't have container tool - # so we used MCP to cover that, up for change tool_types = [tool.type for tool in request.tools] - if envs.VLLM_GPT_OSS_USE_CONTAINER_TOOL: - tool_types.append("container") + + # Allow the MCP Tool type to enable built in tools if the + # server_label is allowlisted in + # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS + if envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS: + for tool in request.tools: + if (tool.type == "mcp" and tool.server_label + in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS): + tool_types.append(tool.server_label) enable_browser = ("web_search_preview" in tool_types and self.tool_server is not None and self.tool_server.has_tool("browser")) @@ -1653,8 +1662,12 @@ class OpenAIServingResponses(OpenAIServing): async with AsyncExitStack() as exit_stack: processer = None if self.use_harmony: + mcp_tools = { + tool.server_label: tool + for tool in request.tools if tool.type == "mcp" + } await context.init_tool_sessions(self.tool_server, exit_stack, - request.request_id) + request.request_id, mcp_tools) processer = self._process_harmony_streaming_events else: processer = self._process_simple_streaming_events diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 056a571fb2fd1..4c627b865ef92 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -18,7 +18,6 @@ if TYPE_CHECKING: async def list_server_and_tools(server_url: str): from mcp import ClientSession from mcp.client.sse import sse_client - async with sse_client(url=server_url) as streams, ClientSession( *streams) as session: initialize_response = await session.initialize() @@ -86,8 +85,12 @@ class ToolServer(ABC): pass @abstractmethod - def new_session(self, tool_name: str, - session_id: str) -> AbstractAsyncContextManager[Any]: + def new_session( + self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None + ) -> AbstractAsyncContextManager[Any]: """ Create a session for the tool. 
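+        Implementations may forward the optional headers to the tool
+        backend (MCPToolServer merges them into the SSE request headers).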
""" @@ -144,16 +147,21 @@ class MCPToolServer(ToolServer): return self.harmony_tool_descriptions.get(tool_name) @asynccontextmanager - async def new_session(self, tool_name: str, session_id: str): + async def new_session(self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None): from mcp import ClientSession from mcp.client.sse import sse_client url = self.urls.get(tool_name) - headers = {"x-session-id": session_id} + request_headers = {"x-session-id": session_id} + if headers is not None: + request_headers.update(headers) if not url: raise KeyError(f"Tool '{tool_name}' is not supported") - async with sse_client(url=url, - headers=headers) as streams, ClientSession( - *streams) as session: + async with sse_client( + url=url, headers=request_headers) as streams, ClientSession( + *streams) as session: await session.initialize() yield session @@ -189,7 +197,10 @@ class DemoToolServer(ToolServer): raise ValueError(f"Unknown tool {tool_name}") @asynccontextmanager - async def new_session(self, tool_name: str, session_id: str): + async def new_session(self, + tool_name: str, + session_id: str, + headers: Optional[dict[str, str]] = None): if tool_name not in self.tools: raise KeyError(f"Tool '{tool_name}' is not supported") yield self.tools[tool_name] diff --git a/vllm/envs.py b/vllm/envs.py index eaee2f6cc7716..ee5efff8bcd92 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -185,11 +185,11 @@ if TYPE_CHECKING: VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False - VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = [] VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None @@ -261,6 +261,58 @@ def env_with_choices( return _get_validated_env +def env_list_with_choices( + env_name: str, + default: list[str], + choices: Union[list[str], Callable[[], list[str]]], + case_sensitive: bool = True) -> Callable[[], list[str]]: + """ + Create a lambda that validates environment variable + containing comma-separated values against allowed choices + + Args: + env_name: Name of the environment variable + default: Default list of values if not set + choices: List of valid string options or callable that returns list + case_sensitive: Whether validation should be case sensitive + + Returns: + Lambda function for environment_variables + dict that returns list of strings + """ + + def _get_validated_env_list() -> list[str]: + value = os.getenv(env_name) + if value is None: + return default + + # Split comma-separated values and strip whitespace + values = [v.strip() for v in value.split(",") if v.strip()] + + if not values: + return default + + # Resolve choices if it's a callable (for lazy loading) + actual_choices = choices() if callable(choices) else choices + + # Validate each value + for val in values: + if not case_sensitive: + check_value = val.lower() + check_choices = [choice.lower() for choice in actual_choices] + else: + check_value = val + check_choices = actual_choices + + if check_value not in check_choices: + raise ValueError(f"Invalid value '{val}' in {env_name}. 
" + f"Valid options: {actual_choices}.") + + return values + + return _get_validated_env_list + + def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. @@ -1320,10 +1372,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), - # Allows vllm use container tool - "VLLM_GPT_OSS_USE_CONTAINER_TOOL": - lambda: bool(int(os.getenv("VLLM_GPT_OSS_USE_CONTAINER_TOOL", "0"))), - # Allows harmony instructions to be injected on system messages "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool( @@ -1343,6 +1391,14 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", "VLLM_OBJECT_STORAGE_SHM_BUFFER"), + + # Valid values are container,code_interpreter,web_search_preview + # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter + "GPT_OSS_SYSTEM_TOOL_MCP_LABELS": + env_list_with_choices("GPT_OSS_SYSTEM_TOOL_MCP_LABELS", [], + ["container", + "code_interpreter", + "web_search_preview"]), } # --8<-- [end:env-vars-definition] From d588cd24061011f76da721c89f9e2171a2b2c4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= <ProExpertProg@users.noreply.github.com> Date: Mon, 22 Sep 2025 20:07:43 -0400 Subject: [PATCH 562/584] [Bugfix] fix custom op test (#25429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič <lgovedic@redhat.com> --- .../model_executor/test_enabled_custom_ops.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 86139d598582d..92ce10a9efc0b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional import pytest import torch @@ -34,15 +35,15 @@ class Relu3(ReLUSquaredActivation): [ # Default values based on compile level # - All by default (no Inductor compilation) - ("", 0, False, [True] * 4, True), - ("", 1, True, [True] * 4, True), - ("", 2, False, [True] * 4, True), + (None, 0, False, [True] * 4, True), + (None, 1, True, [True] * 4, True), + (None, 2, False, [True] * 4, True), # - None by default (with Inductor) - ("", 3, True, [False] * 4, False), - ("", 4, True, [False] * 4, False), + (None, 3, True, [False] * 4, False), + (None, 4, True, [False] * 4, False), # - All by default (without Inductor) - ("", 3, False, [True] * 4, True), - ("", 4, False, [True] * 4, True), + (None, 3, False, [True] * 4, True), + (None, 4, False, [True] * 4, True), # Explicitly enabling/disabling # # Default: all @@ -54,7 +55,7 @@ class Relu3(ReLUSquaredActivation): # All but SiluAndMul ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True), # All but ReLU3 (even if ReLU2 is on) - ("-relu3,relu2", 3, False, [1, 1, 1, 0], True), + ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True), # RMSNorm and SiluAndMul ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False), # All but RMSNorm @@ -67,12 +68,13 @@ class Relu3(ReLUSquaredActivation): # All but RMSNorm ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), ]) -def test_enabled_ops(env: str, torch_level: int, use_inductor: bool, +def test_enabled_ops(env: Optional[str], torch_level: int, use_inductor: 
bool, ops_enabled: list[int], default_on: bool): + custom_ops = env.split(',') if env else [] vllm_config = VllmConfig( compilation_config=CompilationConfig(use_inductor=bool(use_inductor), level=torch_level, - custom_ops=env.split(","))) + custom_ops=custom_ops)) with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on From f31ff874607c311dda7aebe52f73c20f5cad923d Mon Sep 17 00:00:00 2001 From: Russell Bryant <rbryant@redhat.com> Date: Mon, 22 Sep 2025 20:09:52 -0400 Subject: [PATCH 563/584] [Core] Drop overly aggressive whisper assertion (#25408) Signed-off-by: Russell Bryant <rbryant@redhat.com> --- vllm/v1/core/sched/scheduler.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ef77d9e2d3ffd..7fc4776b02611 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -463,10 +463,6 @@ class Scheduler(SchedulerInterface): # always padded to the maximum length. If we support other # encoder-decoder models, this will need to be updated if we # want to only allocate what is needed. - assert ("whisper" - in self.vllm_config.model_config.model.lower()), ( - "Whisper is the only supported " - "encoder-decoder model.") num_encoder_tokens =\ self.scheduler_config.max_num_encoder_input_tokens else: From 090197034faf3b193c4467cedeb9281e3078892d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com> Date: Tue, 23 Sep 2025 02:10:59 +0200 Subject: [PATCH 564/584] [Bugfix] Fix missing `clear_connector_metadata` (#25397) Signed-off-by: NickLucche <nlucches@redhat.com> --- .../unit/test_kv_connector_lifecyle.py | 59 +++++++++++++++++++ .../worker/kv_connector_model_runner_mixin.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py new file mode 100644 index 0000000000000..fe6296cf12ea0 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa: E501 + SharedStorageConnectorMetadata) +from vllm.distributed.kv_transfer.kv_transfer_state import ( + ensure_kv_transfer_initialized, get_kv_transfer_group) +from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput +from vllm.v1.worker.kv_connector_model_runner_mixin import ( + KVConnectorModelRunnerMixin) + +# Importing utils registers TestSharedStorageConnector with the factory +from .utils import create_vllm_config + + +def _make_empty_scheduler_output(): + return SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=CachedRequestData.make_empty(), + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[], + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None, + kv_connector_metadata=SharedStorageConnectorMetadata(), + ) + + +def test_kv_connector_mixin_clears_metadata(): + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector" + vllm_config.kv_transfer_config.kv_role = "kv_both" + 
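+    # TestSharedStorageConnector (registered when .utils is imported)
+    # wraps the real connector and records method calls for the
+    # assertions below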
vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = ("unit") + + # Initialize the global connector instance + ensure_kv_transfer_initialized(vllm_config) + + try: + # Minimal scheduler output with empty metadata; mixin should still + # bind/clear metadata even if no loads happen + scheduler_output = _make_empty_scheduler_output() + + # Invoke the no-forward path which uses the mixin context manager + KVConnectorModelRunnerMixin.kv_connector_no_forward( + scheduler_output, vllm_config) + + # Verify clear_connector_metadata was called on the connector + connector = get_kv_transfer_group() + assert connector._connector_metadata is None + # Test connector wrapper records method calls + assert connector.call_record.get("bind_connector_metadata", 0) == 1 + assert connector.call_record.get("clear_connector_metadata", 0) == 1 + finally: + # Ensure we clean up the global connector between tests + KVConnectorModelRunnerMixin.ensure_kv_transfer_shutdown() diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 016a90c196ba3..7eaff924ecc1f 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -123,6 +123,7 @@ class KVConnectorModelRunnerMixin: output.kv_connector_stats = KVConnectorModelRunnerMixin.\ get_kv_connector_stats() + kv_connector.clear_connector_metadata() @staticmethod def get_kv_connector_stats() -> Optional[KVConnectorStats]: From ac0048c0aea977e520aa19f3b9155fc19d696df7 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni@redhat.com> Date: Mon, 22 Sep 2025 20:26:17 -0400 Subject: [PATCH 565/584] [BugFix] [DP/EP] Fix slow execution when BS <= DP (#25407) Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Chris Bamford <chrisbam4d@gmail.com> --- vllm/v1/worker/gpu_model_runner.py | 7 ++++--- vllm/v1/worker/gpu_worker.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b0cd0f4133079..89b9a3c34f2ac 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -55,7 +55,7 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, check_use_alibi, get_dtype_size, + GiB_bytes, cdiv, check_use_alibi, get_dtype_size, is_pin_memory_available, length_from_prompt_token_ids_or_embeds, round_up, supports_dynamo) @@ -2913,12 +2913,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Note: Overriding max_query_len to be the prefill tokens max_query_len = num_prefill_tokens elif uniform_decode: - num_reqs = num_tokens // max_query_len + assert not create_mixed_batch + num_reqs = cdiv(num_tokens, max_query_len) assert num_reqs <= max_num_reqs, \ "Do not capture num_reqs > max_num_reqs for uniform batch" num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: - num_scheduled_tokens_list[-1] += num_tokens % max_query_len + num_scheduled_tokens_list[-1] = num_tokens % max_query_len else: num_reqs = min(num_tokens, max_num_reqs) min_tokens_per_req = num_tokens // num_reqs diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 
8b1e1bb8f45ca..ca8734d28b459 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -487,7 +487,7 @@ class Worker(WorkerBase): sort_by="self_cuda_time_total")) def execute_dummy_batch(self) -> None: - self.model_runner._dummy_run(1) + self.model_runner._dummy_run(1, uniform_decode=True) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) From 0b7bed9c386d41945b990d3ac5311ac7d9c8b499 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Mon, 22 Sep 2025 21:20:53 -0400 Subject: [PATCH 566/584] [Performance] Remove input pads in cutlass_mla and optimize v_proj output handling (#25184) Signed-off-by: Alexander Matveev <amatveev@redhat.com> --- vllm/v1/attention/backends/mla/common.py | 45 ++++++++++++++++--- vllm/v1/attention/backends/mla/cutlass_mla.py | 30 +++++++------ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 5b307810de930..9bca87c81d179 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -942,6 +942,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): qk_head_dim: int, v_head_dim: int, kv_b_proj: ColumnParallelLinear, + q_pad_num_heads: Optional[int] = None, ) -> None: if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported for MLA") @@ -959,6 +960,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.qk_head_dim = qk_head_dim self.v_head_dim = v_head_dim self.kv_b_proj = kv_b_proj + self.q_pad_num_heads = q_pad_num_heads if use_flashinfer_prefill(): logger.debug_once("Using FlashInfer prefill for MLA") @@ -1134,7 +1136,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): True, #Indicates actual_seq_lens are on GPU or CPU. 
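The batch-splitting bug fixed in the gpu_model_runner.py hunk above (#25407) is easiest to see standalone: with floor division, the remainder used to be added onto the last request, yielding a request longer than `max_query_len`. A minimal sketch in plain Python (`split_uniform_batch` is an illustrative name, not vLLM API):

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division, matching vllm.utils.cdiv."""
    return -(-a // b)


def split_uniform_batch(num_tokens: int, max_query_len: int) -> list[int]:
    num_reqs = cdiv(num_tokens, max_query_len)   # was: num_tokens // max_query_len
    tokens = [max_query_len] * num_reqs
    if num_tokens % max_query_len != 0:
        tokens[-1] = num_tokens % max_query_len  # was: +=, which overflowed
    return tokens


# 10 tokens with max_query_len=4: the old code produced [4, 6]; fixed: [4, 4, 2].
assert split_uniform_batch(10, 4) == [4, 4, 2]
```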
) - def _v_up_proj(self, x): + def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) if is_rocm_aiter_fp8bmm_enabled(): @@ -1146,12 +1148,23 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): transpose_bm=True) # Convert from (B, N, V) to (B, N * V) x = x.reshape(-1, self.num_heads * self.v_head_dim) + # Copy result + out.copy_(x) else: + # Convert from (B, N * V) to (N, B, V) + out = out.view(-1, self.num_heads, self.v_head_dim).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) + torch.bmm(x, self.W_UV, out=out) # Reuse "out" to make it "hot" + # Convert from (N, B, V) to (B, N * V) - x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - return x + out_new = out.transpose(0, 1).reshape( + -1, self.num_heads * self.v_head_dim) + + # Adjust output buffer shape back to the original (B, N * V) + N, B, V = out.shape + out.resize_((B, N * V)) + out.copy_(out_new) # Copy result def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -1559,6 +1572,15 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): # Convert from (B, N, P) to (N, B, P) decode_q_nope = decode_q_nope.transpose(0, 1) + # Pads the head_dim if necessary (for the underlying kernel) + if self.q_pad_num_heads is not None: + B, N, L = decode_q_pe.shape + decode_pe_padded = decode_q_pe.new_empty( + (B, self.q_pad_num_heads, L)) + decode_pe_padded.resize_((B, N, L)) + decode_pe_padded.copy_(decode_q_pe) + decode_q_pe = decode_pe_padded + if is_rocm_aiter_fp8bmm_enabled(): # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) decode_ql_nope = aiter_triton_fp8_bmm(decode_q_nope, @@ -1567,8 +1589,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): group_size=128, transpose_bm=True) else: + # Pads the head_dim if necessary (for the underlying kernel) + N, B, P = decode_q_nope.shape + _, _, L = self.W_UK_T.shape + if self.q_pad_num_heads is not None: + decode_ql_nope = decode_q_nope.new_empty( + (self.q_pad_num_heads, B, L)) + decode_ql_nope.resize_((N, B, L)) + + else: + decode_ql_nope = decode_q_nope.new_empty((N, B, L)) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + torch.bmm(decode_q_nope, self.W_UK_T, out=decode_ql_nope) # Convert from (N, B, L) to (B, N, L) decode_ql_nope = decode_ql_nope.transpose(0, 1) @@ -1603,5 +1636,5 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group()) # v_up projection - output[:num_decode_tokens] = self._v_up_proj(attn_out) + self._v_up_proj(attn_out, out=output[:num_decode_tokens]) return output_padded diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index ae534f3207b51..d44e20f2cb6be 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -74,6 +74,8 @@ class SM100Workspace: g_sm100_workspace = SM100Workspace(128 * 1024 * 1024) # 128MB +MAX_HEADS = 128 + class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): can_return_lse_for_decode: bool = True @@ -92,10 +94,18 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - logits_soft_cap, attn_type, - 
kv_sharing_target_layer_name, **mla_args) + super().__init__(num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + q_pad_num_heads=MAX_HEADS, + **mla_args) unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): @@ -157,14 +167,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): MAX_HEADS = 128 assert H <= MAX_HEADS, f"H must be <= {MAX_HEADS}, but got {H}" - if H < MAX_HEADS: - q_nope_padded = q_nope.new_empty((B_q, MAX_HEADS, D_q_nope)) - q_nope_padded[:, :H] = q_nope - q_nope = q_nope_padded - - q_pe_padded = q_pe.new_empty((B_q, MAX_HEADS, D_q_pe)) - q_pe_padded[:, :H] = q_pe - q_pe = q_pe_padded assert len(page_table.shape) == 2 B_block_table, block_num = page_table.shape @@ -206,9 +208,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) if H < MAX_HEADS: + # Extract the subsets of the outputs + lse = lse[:, :H] if self.need_to_return_lse_for_decode else lse out = out[:, :H] - if self.need_to_return_lse_for_decode: - lse = lse[:, :H].contiguous() return out, lse From 9949aa2ef1cc77721dea9077e0f76b9deb6fd066 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 22 Sep 2025 21:42:45 -0400 Subject: [PATCH 567/584] [Perf] Apply torch.compile for `per_block_cast_to_fp8` (#24611) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/utils/deep_gemm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 38d92f01192b1..4083193d7650e 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -135,7 +135,7 @@ DEFAULT_BLOCK_SIZE = [128, 128] # Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38 -# TODO(wentao): optimize this function, using triton or cuda kernel +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def per_block_cast_to_fp8( x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, @@ -187,4 +187,4 @@ __all__ = [ "is_deep_gemm_e8m0_used", "is_deep_gemm_supported", "should_use_deepgemm_for_fp8_linear", -] +] \ No newline at end of file From 6fa78d8f23b88da47ce6e13815e5e82226f2af00 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Tue, 23 Sep 2025 09:48:12 +0800 Subject: [PATCH 568/584] [V0 deprecation] Remove platform v1 controling interface (#25410) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/v1/test_async_llm_dp.py | 5 ----- vllm/engine/arg_utils.py | 27 --------------------------- vllm/platforms/cpu.py | 17 ----------------- vllm/platforms/cuda.py | 4 ---- vllm/platforms/interface.py | 14 -------------- vllm/platforms/rocm.py | 5 ----- vllm/platforms/tpu.py | 5 ----- vllm/platforms/xpu.py | 4 ---- 8 files changed, 81 deletions(-) diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 32da58011be98..cef0f362cff86 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -13,7 +13,6 @@ from vllm import SamplingParams from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType -from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import DPAsyncMPClient @@ -29,10 +28,6 @@ engine_args = AsyncEngineArgs( 
data_parallel_size=DP_SIZE, ) -if not current_platform.supports_v1(engine_args.create_model_config()): - pytest.skip(reason="Requires V1-supporting platform.", - allow_module_level=True) - async def generate( engine: AsyncLLM, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d4d801b155e17..17df82c081aeb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1502,12 +1502,6 @@ class EngineArgs: _raise_or_fallback(feature_name=name, recommend_to_remove=True) return False - # Platforms must decide if they can support v1 for this model - if not current_platform.supports_v1(model_config=model_config): - _raise_or_fallback( - feature_name=f"device type={current_platform.device_type}", - recommend_to_remove=False) - return False ############################################################# # Experimental Features - allow users to opt in. @@ -1524,12 +1518,6 @@ class EngineArgs: recommend_to_remove=False) return False - # The platform may be supported on V1, but off by default for now. - if not current_platform.default_v1( # noqa: SIM103 - model_config=model_config) and _warn_or_fallback( - current_platform.device_name): - return False - if (current_platform.is_cpu() and model_config.get_sliding_window() is not None): _raise_or_fallback(feature_name="sliding window (CPU backend)", @@ -1796,21 +1784,6 @@ def _raise_or_fallback(feature_name: str, recommend_to_remove: bool): logger.warning(msg) -def _warn_or_fallback(feature_name: str) -> bool: - if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - logger.warning( - "Detected VLLM_USE_V1=1 with %s. Usage should " - "be considered experimental. Please report any " - "issues on Github.", feature_name) - should_exit = False - else: - logger.info( - "%s is experimental on VLLM_USE_V1=1. " - "Falling back to V0 Engine.", feature_name) - should_exit = True - return should_exit - - def human_readable_int(value): """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cd41832bc2ea4..1e15dc6a91aa0 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -328,23 +328,6 @@ class CpuPlatform(Platform): def supports_structured_output(cls) -> bool: return True - @classmethod - def supports_v1(cls, model_config) -> bool: - """Returns whether the current platform can support v1 for the supplied - model configuration. - """ - return True - - @classmethod - def default_v1(cls, model_config) -> bool: - """Returns whether the current platform can use v1 by default for the - supplied model configuration. 
- """ - arch = cls.get_cpu_architecture() - return (cls.supports_v1(model_config) - and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC, - CpuArchEnum.ARM, CpuArchEnum.S390X)) - @classmethod def opaque_attention_op(cls) -> bool: return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b10bc03ee16c6..d5f3599acb1cc 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -384,10 +384,6 @@ class CudaPlatformBase(Platform): def supports_fp8(cls) -> bool: return cls.has_device_capability(89) - @classmethod - def supports_v1(cls, model_config: "ModelConfig") -> bool: - return True - @classmethod def use_custom_allreduce(cls) -> bool: return True diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index cad04ea14c01c..3f13ae72fe4d7 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -482,20 +482,6 @@ class Platform: or parallel_config.distributed_executor_backend == "external_launcher") - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - """Returns whether the current platform can support v1 for the supplied - model configuration. - """ - return False - - @classmethod - def default_v1(cls, model_config: ModelConfig) -> bool: - """ - Returns whether the current platform supports v1 by default. - """ - return cls.supports_v1(model_config) - @classmethod def use_custom_allreduce(cls) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 6a49bd4a33866..878718489fa88 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -396,11 +396,6 @@ class RocmPlatform(Platform): else: return torch.float8_e4m3fn - @classmethod - def supports_v1(cls, model_config: "ModelConfig") -> bool: - # V1 support on AMD gpus is experimental - return True - @classmethod def use_custom_allreduce(cls) -> bool: # We only enable custom allreduce for MI300 series diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 9852d948bc4b8..e4c73b1bae6fb 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -174,11 +174,6 @@ class TpuPlatform(Platform): def use_all_gather(cls) -> bool: return True - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - # V1 support on TPU is experimental - return True - @classmethod def validate_request( cls, diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index eb591ae4454e4..034e039006c48 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -194,10 +194,6 @@ class XPUPlatform(Platform): def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa - @classmethod - def supports_v1(cls, model_config: ModelConfig) -> bool: - return True - @classmethod def device_count(cls) -> int: return torch.xpu.device_count() From c625f9043c5beb5921e94c6a5b3ac18372bab4db Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Tue, 23 Sep 2025 09:52:09 +0800 Subject: [PATCH 569/584] [V0 deprecation] Remove `_set_default_args_v0` function (#25409) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/engine/arg_utils.py | 83 ++++++---------------------------------- 1 file changed, 11 insertions(+), 72 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 17df82c081aeb..8c7a1b413cdb7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1147,20 +1147,15 @@ class EngineArgs: else: envs.set_vllm_use_v1(use_v1) - # Set default arguments for V0 or V1 Engine. 
- if use_v1: - self._set_default_args_v1(usage_context, model_config) - # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 - if current_platform.is_cpu( - ) and current_platform.get_cpu_architecture() in ( - CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM): - logger.info( - "Chunked prefill is not supported for ARM and POWER " - "and S390X CPUs; " - "disabling it for V1 backend.") - self.enable_chunked_prefill = False - else: - self._set_default_args_v0(model_config) + # Set default arguments for V1 Engine. + self._set_default_args(usage_context, model_config) + # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture( + ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM): + logger.info("Chunked prefill is not supported for ARM and POWER " + "and S390X CPUs; " + "disabling it for V1 backend.") + self.enable_chunked_prefill = False assert self.enable_chunked_prefill is not None sliding_window: Optional[int] = None @@ -1528,64 +1523,8 @@ class EngineArgs: return True - def _set_default_args_v0(self, model_config: ModelConfig) -> None: - """Set Default Arguments for V0 Engine.""" - - max_model_len = model_config.max_model_len - use_long_context = max_model_len > 32768 - if self.enable_chunked_prefill is None: - # Chunked prefill not supported for Multimodal or MLA in V0. - if model_config.is_multimodal_model or model_config.use_mla: - self.enable_chunked_prefill = False - - # Enable chunked prefill by default for long context (> 32K) - # models to avoid OOM errors in initial memory profiling phase. - elif use_long_context: - is_gpu = current_platform.is_cuda() - use_sliding_window = (model_config.get_sliding_window() - is not None) - use_spec_decode = self.speculative_config is not None - - if (is_gpu and not use_sliding_window and not use_spec_decode - and not self.enable_lora): - self.enable_chunked_prefill = True - logger.warning( - "Chunked prefill is enabled by default for models " - "with max_model_len > 32K. Chunked prefill might " - "not work with some features or models. If you " - "encounter any issues, please disable by launching " - "with --enable-chunked-prefill=False.") - - if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = False - - if not self.enable_chunked_prefill and use_long_context: - logger.warning( - "The model has a long context length (%s). This may cause" - "OOM during the initial memory profiling phase, or result " - "in low performance due to small KV cache size. Consider " - "setting --max-model-len to a smaller value.", max_model_len) - - # Disable prefix caching for multimodal models for VLLM_V0. - if self.enable_prefix_caching and model_config.is_multimodal_model: - logger.warning( - "--enable-prefix-caching is not supported for multimodal " - "models in V0 and has been disabled.") - self.enable_prefix_caching = False - - if self.enable_prompt_embeds: - logger.warning( - "--enable-prompt-embeds and --enable-prefix-caching " - "are not supported together in V0. Prefix caching has " - "been disabled.") - self.enable_prefix_caching = False - - # Set max_num_seqs to 256 for VLLM_V0. 
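With the V0 branch removed, the default-argument logic collapses to the V1 path plus the CPU carve-out retained above. A condensed sketch of the surviving rule (the enum is a stand-in for illustration, not a vLLM import):

```python
from enum import Enum, auto


class CpuArch(Enum):
    X86 = auto()
    POWERPC = auto()
    ARM = auto()
    S390X = auto()


def chunked_prefill_default(is_cpu: bool, arch: CpuArch) -> bool:
    # V1 always uses chunked prefill, except on CPU architectures
    # where it is unsupported (POWER, ARM, s390x).
    return not (is_cpu and arch in (CpuArch.POWERPC, CpuArch.S390X, CpuArch.ARM))


assert chunked_prefill_default(is_cpu=False, arch=CpuArch.X86)
assert not chunked_prefill_default(is_cpu=True, arch=CpuArch.ARM)
```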
- if self.max_num_seqs is None: - self.max_num_seqs = 256 - - def _set_default_args_v1(self, usage_context: UsageContext, - model_config: ModelConfig) -> None: + def _set_default_args(self, usage_context: UsageContext, + model_config: ModelConfig) -> None: """Set Default Arguments for V1 Engine.""" # V1 always uses chunked prefills and prefix caching From 4741239db7b75949d577f99eb1c3a9e463dc77f3 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 22 Sep 2025 22:04:15 -0400 Subject: [PATCH 570/584] [Bug] Fix Long Context OOM Issue (#25290) Signed-off-by: yewentao256 <zhyanwentao@126.com> --- vllm/v1/attention/backends/mla/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 9bca87c81d179..a177117a50bd1 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -481,7 +481,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): # which would result in up-projected context being # 2*(192*128)*(64*1024) = 3gb # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) + 64 * 1024) assert self.chunked_prefill_workspace_size >= \ scheduler_config.max_num_seqs * cache_config.block_size if self.dcp_world_size > 1: From fc97733da801f1d6464e706f41ebd5180f2abb51 Mon Sep 17 00:00:00 2001 From: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:04:47 +0800 Subject: [PATCH 571/584] [feat] Support MRoPE + YaRN (#25384) Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com> Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com> --- .../layers/rotary_embedding/__init__.py | 22 ++++++++++--- .../layers/rotary_embedding/mrope.py | 31 +++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index c9653aa9e4405..3576368981c7c 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -153,11 +153,23 @@ def get_rope( if k in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow") } - rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim, - original_max_position, - base, is_neox_style, - scaling_factor, dtype, - **extra_kwargs) + if "mrope_section" in rope_scaling: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_scaling["mrope_section"], + mrope_interleaved=rope_scaling.get("mrope_interleaved", + False), + scaling_factor=scaling_factor, + **extra_kwargs) + else: + rotary_emb = YaRNScalingRotaryEmbedding( + head_size, rotary_dim, original_max_position, base, + is_neox_style, scaling_factor, dtype, **extra_kwargs) elif scaling_type == "deepseek_yarn": scaling_factor = rope_scaling["factor"] original_max_position = rope_scaling[ diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 17d04a1ad715c..9bf0d6bd15e74 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -12,6 +12,7 @@ from vllm.triton_utils import tl, triton from .base import RotaryEmbedding from .common import apply_rotary_emb_dispatch +from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale @triton.jit @@ -213,7 +214,27 @@ class 
MRotaryEmbedding(RotaryEmbedding): dtype: torch.dtype, mrope_section: Optional[list[int]] = None, mrope_interleaved: bool = False, + # YaRN parameters. + *, + scaling_factor: Optional[float] = None, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, ) -> None: + + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + if self.scaling_factor is not None: + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float( + yarn_get_mscale(self.scaling_factor) * attn_factor) + else: + self.mscale = 1.0 + # In Qwen2.5-VL, the maximum index value is related to the duration of # the input video. We enlarge max_position_embeddings to 4 times to get # a larger the cos and sin cache. @@ -226,6 +247,16 @@ class MRotaryEmbedding(RotaryEmbedding): if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 + def _compute_inv_freq(self, base: float) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_inv_freq(base) + return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_cos_sin_cache() + return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self) + def forward_native( self, positions: torch.Tensor, From f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 Mon Sep 17 00:00:00 2001 From: Kunshang Ji <kunshang.ji@intel.com> Date: Tue, 23 Sep 2025 11:09:00 +0800 Subject: [PATCH 572/584] [XPU] Fix `compile_size` is `None` case. (#25433) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com> --- vllm/platforms/xpu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 034e039006c48..af61db5e312a4 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -113,6 +113,8 @@ class XPUPlatform(Platform): # lazy import to avoid circular import from vllm.config import CompilationLevel, CUDAGraphMode compilation_config = vllm_config.compilation_config + if compilation_config.compile_sizes is None: + compilation_config.compile_sizes = [] assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \ "CUDA graph mode should be NONE on XPU" From eea178398997b94b6df5560b597e2b7895b2ae3a Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 22 Sep 2025 20:21:48 -0700 Subject: [PATCH 573/584] [benchmarks]allow skip ready check for bench serve (#25420) Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com> --- vllm/benchmarks/serve.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 7382782f11655..2a042802d0d54 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -531,18 +531,22 @@ async def benchmark( extra_body=extra_body, ) - test_output = await wait_for_endpoint( - request_func, - test_input, - session, - timeout_seconds=ready_check_timeout_sec, - ) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. 
Error: {test_output.error}") + if ready_check_timeout_sec > 0: + test_output = await wait_for_endpoint( + request_func, + test_input, + session, + timeout_seconds=ready_check_timeout_sec, + ) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark " + "arguments are correctly specified. " + f"Error: {test_output.error}") + else: + print("Initial test run completed. Starting main benchmark run...") else: - print("Initial test run completed. Starting main benchmark run...") + print("Skipping endpoint ready check.") if lora_modules: # For each input request, choose a LoRA module at random. @@ -1151,7 +1155,8 @@ def add_cli_args(parser: argparse.ArgumentParser): type=int, default=600, help="Maximum time to wait for the endpoint to become ready " - "in seconds (default: 600 seconds / 10 minutes).", + "in seconds (default: 600 seconds / 10 minutes). If set to 0, " + "the ready check will be skipped." ) From 78237e43bf3ac70f31b27684dc27d7eca0fa3677 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Mon, 22 Sep 2025 23:26:32 -0400 Subject: [PATCH 574/584] [Bugfix] Remove contiguous output req for context parallel MLA (#25414) Signed-off-by: Michael Goin <mgoin64@gmail.com> --- vllm/attention/ops/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 189b57e8e8b82..6253e1e56b0f1 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -134,6 +134,5 @@ def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor, cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) - assert out.is_contiguous() out = cp_group.reduce_scatter(out, dim=1) return out From fafbe11af4f211abe02ef874f113bc28232eb0e8 Mon Sep 17 00:00:00 2001 From: Michael Yao <haifeng.yao@daocloud.io> Date: Tue, 23 Sep 2025 11:42:58 +0800 Subject: [PATCH 575/584] [Docs] Fix griffe warnings in vllm/lora/ops (#25369) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> --- vllm/lora/ops/triton_ops/lora_kernel_metadata.py | 6 +++--- vllm/lora/ops/xla_ops/lora_ops.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py index 39e647b9b88a4..e27604728ed06 100644 --- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py +++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py @@ -83,8 +83,8 @@ class LoRAKernelMeta: Prepare kernel metadata tensors for the current forward pass. Args: - token_lora_tensor (torch.Tensor): Tensor containing lora indices - for each input token. + token_lora_mapping (torch.Tensor): Tensor containing lora indices + for each input token. """ self._reset() @@ -136,7 +136,7 @@ class LoRAKernelMeta: Args: token_nums (int): Number of input tokens in the current forward - pass. + pass of the kernel. """ return ( self.token_lora_mapping[:token_nums], diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index 9118f3351ef0a..29bfd5753a588 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -93,7 +93,6 @@ def bgmv_shrink( inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - output_tensor (torch.Tensor): (Unused) output tensor (placeholder). 
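The ready-check change in #25420 above reduces to one rule: a non-positive timeout skips the endpoint probe entirely. A self-contained asyncio sketch of that control flow (`probe` is a stand-in coroutine; no aiohttp involved):

```python
import asyncio


async def maybe_ready_check(probe, timeout_seconds: int) -> None:
    if timeout_seconds <= 0:
        print("Skipping endpoint ready check.")
        return
    ok = await asyncio.wait_for(probe(), timeout=timeout_seconds)
    if not ok:
        raise ValueError("Initial test run failed - Please make sure "
                         "benchmark arguments are correctly specified.")
    print("Initial test run completed. Starting main benchmark run...")


async def _demo() -> None:

    async def probe() -> bool:
        return True

    await maybe_ready_check(probe, timeout_seconds=0)    # skipped
    await maybe_ready_check(probe, timeout_seconds=600)  # probed


asyncio.run(_demo())
```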
lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. scaling (float, optional): Scalar multiplier applied to the output. From e8db44f8834161259b25ebb13300bd502d37af9f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath <varunsundar08@gmail.com> Date: Tue, 23 Sep 2025 00:01:09 -0400 Subject: [PATCH 576/584] [DP/EP][GPTOSS] Use triton matmul-ogs kernels for GPTOSS DP/EP (#24588) Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com> Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com> --- .../model_executor/layers/fused_moe/config.py | 22 ++- .../fused_moe/deepep_ht_prepare_finalize.py | 18 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 176 ++++++++++++++---- vllm/model_executor/layers/fused_moe/layer.py | 82 +++++--- .../layers/fused_moe/modular_kernel.py | 2 +- .../layers/quantization/mxfp4.py | 51 +++-- 6 files changed, 275 insertions(+), 76 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b14bc06e913cf..3052bdb4dc1ec 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -288,7 +288,11 @@ class FusedMoEQuantConfig: @property def use_mxfp4_w4a4(self) -> bool: - return self.quant_dtype == "mxfp4" + return (self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4") + + @property + def use_mxfp4_w4a16(self) -> bool: + return (self._a1.dtype is None and self._w1.dtype == "mxfp4") @property def use_nvfp4_w4a4(self) -> bool: @@ -453,6 +457,22 @@ def int8_w8a8_moe_quant_config( ) +def mxfp4_w4a16_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> FusedMoEQuantConfig: + """ + Construct a quant config for unquantized activations and mxfp4 weights. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias), + _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias), + ) + + def mxfp4_w4a4_moe_quant_config( w1_scale: Union[torch.Tensor, "PrecisionConfig"], w2_scale: Union[torch.Tensor, "PrecisionConfig"], diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index f390f0a25875e..a250a6218715e 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceContiguous, TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) +from vllm.utils import round_up class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): @@ -18,6 +19,23 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): Prepare/Finalize using DeepEP High-Throughput kernels. """ + @staticmethod + def maybe_roundup_layer_hidden_size(hidden_size: int, + dtype: torch.dtype) -> int: + # Round up hidden size so it is compatible with DeepEP High Throughput + # kernels. + # DeepEP intranode kernels make copies in units of, + # 32(warp-size) int4 elements. Round up hidden size to respect this. 
+ # For example, an input hidden size of 2880 with dtype torch.bfloat16 + # will be rounded up to 3072. + hidden_size_bytes = hidden_size * dtype.itemsize + xfer_atom_size = 512 # 32 * 16 (size(int4)) + if hidden_size_bytes % xfer_atom_size == 0: + return hidden_size + + hidden_size_bytes = round_up(hidden_size_bytes, xfer_atom_size) + return hidden_size_bytes // dtype.itemsize + def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int, dp_size: int, rank_expert_offset: int): super().__init__() diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index f12d3807517ff..0e84a9241e905 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -9,7 +9,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) + TopKWeightAndReduceNoOP) +from vllm.triton_utils import tl, triton from vllm.utils import has_triton_kernels logger = init_logger(__name__) @@ -19,13 +20,55 @@ if has_triton_kernels(): import triton_kernels.swiglu from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) - from triton_kernels.routing import routing + from triton_kernels.routing import (RoutingData, routing, + routing_from_bitmatrix) + from triton_kernels.tensor import Bitmatrix except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " "version is compatible. Error: %s", e) +@triton.jit +def pack_bitmatrix( + bitmatrix, + topk_ids, + n_rows, # n_rows in bitmatrix / topk_ids + bm_cols: tl.constexpr, # n int32_t bitpacks in bitmatrix + n_expts_act, # num_topk + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + """ + Packs topk_ids into a bitmatrix. + code reference: + https://github.com/triton-lang/triton/blob/dd1bbc52b34d202dfe5ffea1e04fb16166c5c04e/python/triton_kernels/bench/distributed.py#L264 + """ + pid_m = tl.program_id(0) + offsets_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offsets_k = tl.arange(0, BLOCK_SIZE_K) + offsets = offsets_m[:, None] * n_expts_act + offsets_k[None, :] + mask = (offsets_m < n_rows)[:, None] & (offsets_k < n_expts_act)[None, :] + indices = tl.load(topk_ids + offsets, mask=mask, other=-1) + div = indices // 32 + rem = indices % 32 + one = tl.cast(1, tl.uint32) + + # Iterate through all the relevant bitmatrix columns. + for i in range(bm_cols): + # When BLOCK_SIZE_K=32, offs is just the column index. + offs = tl.arange(0, BLOCK_SIZE_K // 32) + i * (BLOCK_SIZE_K // 32) + # All topks that need to go into this column has the correct bit set. + # Other bits are 0. x is a 2D tensor. + x = tl.where(div[:, :, None] == offs[None, None, :], + (one << rem)[:, :, None], 0) + # Reduce x to get a single int32_t bitpack. 
+ y = tl.reduce_or(x, axis=1) + bitmatrix_ptrs = bitmatrix + offsets_m[:, + None] * bm_cols + offs[None, :] + tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows) + + def triton_kernel_moe_forward( hidden_states: torch.Tensor, w1, # Tensor or triton_kernels.Tensor @@ -124,34 +167,88 @@ def triton_kernel_fused_experts( return intermediate_cache3 -class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): +def make_routing_data( + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + num_local_experts: int, +) -> tuple["RoutingData", torch.Tensor, torch.Tensor]: - def __init__( - self, - max_num_tokens: int, - num_dispatchers: int, - quant_config: FusedMoEQuantConfig, - ): + topk_ids = topk_ids.to(torch.int16) + topk_weights = topk_weights.to(torch.bfloat16) + + n_rows, num_topk = topk_ids.size() + + BLOCK_SIZE_M = 512 + BLOCK_SIZE_K = 32 + + bm_cols = triton.cdiv(num_local_experts, BLOCK_SIZE_K) # n_bitpacks + bitmatrix = torch.zeros((n_rows, bm_cols), + dtype=torch.uint32, + device=topk_ids.device) + + grid = (triton.cdiv(n_rows, BLOCK_SIZE_M), ) + pack_bitmatrix[grid]( + bitmatrix, + topk_ids, + n_rows, + bm_cols, + num_topk, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_K=BLOCK_SIZE_K, + ) + + bitmatrix_shape = [n_rows, bm_cols * 32] + bitmatrix_shape_max = [n_rows, None] + bitmatrix = Bitmatrix(bitmatrix, + shape=bitmatrix_shape, + shape_max=bitmatrix_shape_max, + scratchpad=None) + + # matmul_ogs expects invalid topk_weights to be -1s + topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights) + routing_data, gather_indx, scatter_indx = routing_from_bitmatrix( + bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk) + + return routing_data, gather_indx, scatter_indx + + +class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, quant_config: FusedMoEQuantConfig): + super().__init__(quant_config) + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Weight application and reduction happens in the fused_experts kernel. + return TopKWeightAndReduceNoOP() + + def _make_routing_data( + self, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + num_local_experts: int, + ) -> tuple["RoutingData", torch.Tensor, torch.Tensor]: + return make_routing_data(topk_ids, topk_weights, num_local_experts) + + +class OAITritonExperts(BaseOAITritonExperts): + + def __init__(self, quant_config: FusedMoEQuantConfig): + # TODO (varun) : Enable activation quantization + assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16" super().__init__(quant_config) - self.max_num_tokens = max_num_tokens - self.num_dispatchers = num_dispatchers @property def activation_formats( self ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: - return (mk.FusedMoEActivationFormat.BatchedExperts, - mk.FusedMoEActivationFormat.BatchedExperts) + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) def supports_chunking(self) -> bool: - return False - - def supports_expert_map(self) -> bool: - return False - - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. 
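For intuition, the bitmatrix built by `pack_bitmatrix` above can be reproduced in pure NumPy: row `r` has bit `e` set iff expert `e` appears in token `r`'s top-k, with `-1` padding ignored. A reference sketch for checking the kernel's semantics, not a replacement for it:

```python
import numpy as np


def pack_bitmatrix_ref(topk_ids: np.ndarray, num_experts: int) -> np.ndarray:
    n_rows = topk_ids.shape[0]
    bm_cols = (num_experts + 31) // 32              # uint32 bitpacks per row
    out = np.zeros((n_rows, bm_cols), dtype=np.uint32)
    for r, row in enumerate(topk_ids):
        for e in row:
            if e >= 0:                              # -1 marks an invalid slot
                out[r, e // 32] |= np.uint32(1 << (e % 32))
    return out


ids = np.array([[0, 33], [2, -1]], dtype=np.int16)
bm = pack_bitmatrix_ref(ids, num_experts=64)
assert bm[0, 0] == 1 and bm[0, 1] == 2   # experts 0 and 33
assert bm[1, 0] == 4 and bm[1, 1] == 0   # expert 2 only
```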
- return TopKWeightAndReduceDelegate() + return True def workspace_shapes( self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int, @@ -159,13 +256,10 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_tokens_meta: Optional[mk.ExpertTokensMetadata] ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: # workspace are allocated inside the kernel - assert a.dim() == 2 - num_dp = self.num_dispatchers - num_experts = local_num_experts - max_num_tokens = self.max_num_tokens - workspace2 = (0, 0, 0) - output = (num_experts, max_num_tokens * num_dp, N) - return (output, workspace2, output, a.dtype) + workspace1 = (M, K) + workspace2 = (0, 0) + output = (M, K) + return (workspace1, workspace2, output, a.dtype) def apply( self, @@ -185,17 +279,29 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, ): - return triton_kernel_fused_experts( - output, + if expert_map is not None: + topk_ids = expert_map[topk_ids] + + local_num_experts = w1.size(0) + if global_num_experts == -1: + global_num_experts = local_num_experts + + routing_data, gather_indx, scatter_indx = self._make_routing_data( + topk_ids, topk_weights, local_num_experts) + + experts_output = triton_kernel_fused_experts( + None, hidden_states, w1, w2, - routing_data=None, - gather_indx=None, - scatter_indx=None, + routing_data, + gather_indx, + scatter_indx, activation=activation, quant_config=self.quant_config, apply_router_weight_on_input=False, - global_num_experts=global_num_experts, - expert_map=expert_map, + global_num_experts=local_num_experts, + expert_map=None, # applied already a1q_scale=a1q_scale) + + output.copy_(experts_output, non_blocking=True) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 17ad75584a3f6..1f80e972b7f06 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -800,6 +800,49 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str: for local_index, global_index in zip(local_indices, global_indices)) +def maybe_roundup_hidden_size( + hidden_size: int, act_dtype: torch.dtype, + quant_config: Optional[QuantizationConfig], + moe_parallel_config: FusedMoEParallelConfig) -> int: + """ + Given layer hidden size and MoE configurations, round up hidden_size + if necessary. + + Args: + hidden_size(int): Layer hidden-size + act_dtype: Data type of the layer activations. + quant_config(FusedMoEQuantConfig): Fused MoE quantization configuration. + moe_parallel_config(FusedMoEParallelConfig): Fused MoE parallelization + strategy configuration. + + Return: + Rounded up hidden_size if rounding up is required based on the configs. + Original hidden size otherwise. 
+ """ + + if (moe_parallel_config.use_deepep_ht_kernels): + hidden_size = ( + DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size( + hidden_size, act_dtype)) + + # we are padding globally so EP buffer allocation works + if quant_config and quant_config.get_name() == "mxfp4": + + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Backend, get_mxfp4_backend) + current_mxfp4_backend = get_mxfp4_backend() + if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS): + hidden_size = round_up(hidden_size, 128) + elif (current_platform.is_rocm() or current_mxfp4_backend + == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) + + return hidden_size + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -856,6 +899,18 @@ class FusedMoE(CustomOp): params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype + vllm_config = get_current_vllm_config() + + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + else: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. + moe_in_dtype = params_dtype + tp_size_ = (tp_size if tp_size is not None else get_tensor_model_parallel_world_size()) dp_size_ = (dp_size @@ -865,7 +920,6 @@ class FusedMoE(CustomOp): if self.is_sequence_parallel: self.sp_size = tp_size_ - vllm_config = get_current_vllm_config() self.moe_parallel_config: FusedMoEParallelConfig = ( FusedMoEParallelConfig.make( tp_size_=tp_size_, @@ -874,19 +928,10 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts - # we are padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4": - from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Backend, get_mxfp4_backend) - current_mxfp4_backend = get_mxfp4_backend() - if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - or current_mxfp4_backend - == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS): - hidden_size = round_up(hidden_size, 128) - elif (current_platform.is_rocm() or current_mxfp4_backend - == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or - current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): - hidden_size = round_up(hidden_size, 256) + # Round up hidden size if needed. + hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype, + quant_config, + self.moe_parallel_config) # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config @@ -967,20 +1012,13 @@ class FusedMoE(CustomOp): raise ValueError("Only softmax scoring function is supported for " "non-grouped topk.") - if vllm_config.model_config is not None: - model_dtype = vllm_config.model_config.dtype - else: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. 
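The DeepEP-HT rounding introduced in this patch is plain byte arithmetic; a worked sketch mirroring `maybe_roundup_layer_hidden_size` (requires only torch):

```python
import torch


def roundup_for_deepep_ht(hidden_size: int, dtype: torch.dtype) -> int:
    xfer_atom_size = 512                      # 32 (warp size) * 16 B (int4)
    nbytes = hidden_size * dtype.itemsize
    if nbytes % xfer_atom_size == 0:
        return hidden_size
    nbytes = (nbytes + xfer_atom_size - 1) // xfer_atom_size * xfer_atom_size
    return nbytes // dtype.itemsize


assert roundup_for_deepep_ht(2880, torch.bfloat16) == 3072   # 5760 B -> 6144 B
assert roundup_for_deepep_ht(4096, torch.bfloat16) == 4096   # already aligned
```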
- model_dtype = params_dtype - moe = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, num_local_experts=self.local_num_experts, moe_parallel_config=self.moe_parallel_config, - in_dtype=model_dtype, + in_dtype=moe_in_dtype, max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, has_bias=has_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a16c254fadf66..5fce24018e647 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -76,7 +76,7 @@ def _moe_problem_size( """ assert w1.dim() == 3 and w2.dim() == 3 E, N, _ = w1.size() - K = w2.size(1) + K = a1.size(-1) if a1.dim() == 2: # Make sure we are using the correct a1 (pre-permute). diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5c3f8a891276b..a71c8d32a22c7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -13,7 +13,10 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.fused_moe import modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config) + FusedMoEQuantConfig, mxfp4_w4a4_moe_quant_config, + mxfp4_w4a16_moe_quant_config) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) @@ -578,9 +581,14 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w13_bias = Parameter(w13_bias, requires_grad=False) layer.w2_bias = Parameter(w2_bias, requires_grad=False) - # FIXME warp need to be adjusted based on batch size - # only apply to batched mode - if self.moe.use_ep: + # Ideally we'd use FusedMoEModularKernel.prepare_finalize object + # (stored in self.fused_experts) to determine if the MoE has a + # batched activation format. As self.fused_experts is not + # initialized at this point, we resort to checking the MoE config + # directly. 
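On naming: `use_mxfp4_w4a16` means 4-bit mxfp4 weights with unquantized (16-bit) activations, while `use_mxfp4_w4a4` quantizes both. The two predicates added to `FusedMoEQuantConfig` earlier in this patch, reduced to a sketch (dataclass stand-ins, not the real config types):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class QuantDesc:
    dtype: Optional[str] = None


def use_mxfp4_w4a4(a1: QuantDesc, w1: QuantDesc) -> bool:
    return a1.dtype == "mxfp4" and w1.dtype == "mxfp4"


def use_mxfp4_w4a16(a1: QuantDesc, w1: QuantDesc) -> bool:
    return a1.dtype is None and w1.dtype == "mxfp4"


assert use_mxfp4_w4a16(QuantDesc(), QuantDesc("mxfp4"))
assert not use_mxfp4_w4a4(QuantDesc(), QuantDesc("mxfp4"))
```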
+ is_batched_moe = (self.moe.use_pplx_kernels + or self.moe.use_deepep_ll_kernels) + if is_batched_moe: num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 else: num_warps = 8 @@ -640,16 +648,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if self.mxfp4_backend == Mxfp4Backend.TRITON: w1_scale = self.w13_precision_config w2_scale = self.w2_precision_config + return mxfp4_w4a16_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) else: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale - - return mxfp4_w4a4_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=w1_scale, - w2_scale=w2_scale, - ) + return mxfp4_w4a4_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) def select_gemm_impl( self, @@ -661,6 +674,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError( "Mxfp4 does not support batched experts format for EP") else: + assert self.moe_quant_config is not None if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16): # B200 code-path @@ -671,13 +685,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): # TODO(bnell): part of quant_config "max_capture_size": self.max_capture_size, } - assert self.moe_quant_config is not None return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs) else: - # Use matmul_ogs from triton_kernels here! - raise NotImplementedError( - "Mxfp4 does not support non-batched experts format for EP") + return OAITritonExperts(self.moe_quant_config) def _route_and_experts( self, @@ -722,10 +733,16 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): logical_to_physical_map=logical_to_physical_map, logical_replica_count=logical_replica_count) + w13_weight = (self.w13_weight_triton_tensor + if layer.w13_weight is None else layer.w13_weight) + w2_weight = (self.w2_weight_triton_tensor + if layer.w2_weight is None else layer.w2_weight) + assert all([w is not None for w in [w13_weight, w2_weight]]) + return self.fused_experts( hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, + w1=w13_weight, + w2=w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, From 5774b0a1da93492e60cc446e6e783a43da89e105 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" <chendi.xue@intel.com> Date: Mon, 22 Sep 2025 23:17:42 -0500 Subject: [PATCH 577/584] [NIXL][OOT platform] support nixl_connector with oot platform and other nixl_backend (#25121) Signed-off-by: Chendi Xue <Chendi.Xue@intel.com> --- docs/features/disagg_prefill.md | 6 +++ docs/serving/expert_parallel_deployment.md | 2 +- .../kv_connector/unit/test_nixl_connector.py | 52 ++++++++++++++++++- .../kv_connector/v1/nixl_connector.py | 33 +++++++++--- vllm/platforms/interface.py | 15 ++++++ 5 files changed, 99 insertions(+), 9 deletions(-) diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 69f70b8ff5ac3..cb62213cc7af0 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -31,6 +31,12 @@ Now supports 5 types of connectors: --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' ``` +For NixlConnector, you may also specify one or multiple NIXL_Backend. 
Such as: + + ```bash + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}' + ``` + - **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker): ```bash diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 7489fc2609831..f823d33df80ea 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -193,7 +193,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok 1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. -2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}` +2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`. Noted, you may also specify one or multiple NIXL_Backend. Such as: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}'` 3. **Client Orchestration**: Use the client-side script below to coordinate prefill/decode operations. We are actively working on routing solutions. diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 6e58d158c3f4b..fa698a2eabd91 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -27,6 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, NixlConnectorWorker, NixlKVConnectorStats) from vllm.forward_context import ForwardContext +from vllm.platforms.interface import Platform from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput @@ -56,7 +57,7 @@ class FakeNixlWrapper: def get_reg_descs(self, caches_data, memory_type: str) -> list: return [str(uuid.uuid4()) for _ in caches_data] - def register_memory(self, descs) -> None: + def register_memory(self, descs, backends) -> None: pass def get_xfer_descs(self, blocks_data, memory_type: str) -> list: @@ -855,3 +856,52 @@ def test_register_kv_caches(dist_init): assert block_len == expected_block_len, \ f"Block entry {i}: Expected block len {expected_block_len}, " \ f"got {block_len}" + + +class FakePlatform(Platform): + device_type: str = "oot" + + @classmethod + def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]: + """ + Returns a mapping from device_type to a tuple of supported + kv_buffer_device for nixl. + """ + return {'oot': ('oot', )} + + @classmethod + def get_nixl_memory_type(cls) -> Optional[str]: + """ + Returns the nixl memory type for the current platform. 
+ """ + return 'VRAM' + + +@pytest.mark.parametrize("kv_buffer_device, nixl_memory_type", [ + ("oot", "VRAM"), +]) +def test_kv_buffer_to_nixl_memory_types(dist_init, kv_buffer_device, + nixl_memory_type): + """ + Test that register_kv_caches() passes the correct memory types from the + config to the nixl_wrapper. + """ + vllm_config = create_vllm_config() + # Override the default memory types in the config + vllm_config.kv_transfer_config.kv_buffer_device = kv_buffer_device + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( + _NIXL_SUPPORTED_DEVICE) + _NIXL_SUPPORTED_DEVICE.update(FakePlatform.get_nixl_supported_devices()) + + with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.current_platform", FakePlatform), \ + patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector._NIXL_SUPPORTED_DEVICE", _NIXL_SUPPORTED_DEVICE): # noqa: E501 + + # Create connector and replace its worker with a fake one for isolation + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + + # Verify get_reg_descs was called with the correct memory_type + assert connector.connector_worker.kv_buffer_device == kv_buffer_device + assert connector.connector_worker.nixl_memory_type == nixl_memory_type diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index d3a08af088c11..82b483447e33d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -58,6 +58,12 @@ except ImportError: logger.warning("NIXL is not available") NixlWrapper = None +try: + from nixl._api import nixl_agent_config +except ImportError: + nixl_agent_config = None + logger.warning("NIXL agent config is not available") + # Supported platforms and types of kv transfer buffer. # {device: tuple of supported kv buffer types} _NIXL_SUPPORTED_DEVICE = { @@ -65,6 +71,8 @@ _NIXL_SUPPORTED_DEVICE = { "tpu": ("cpu", ), "xpu": ("cpu", ), } +# support for oot platform by providing mapping in current_platform +_NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices()) class NixlAgentMetadata( @@ -448,8 +456,15 @@ class NixlConnectorWorker: self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size + self.nixl_backends = \ + vllm_config.kv_transfer_config.get_from_extra_config( + "backends", ["UCX"]) # Agent. - self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None) + non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"] + config = nixl_agent_config(backends=self.nixl_backends) if len( + non_ucx_backends) > 0 and nixl_agent_config is not None else None + + self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config) # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}. 
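Taken together, the two Platform hooks added in this patch let an out-of-tree device opt into NixlConnector exactly as the FakePlatform above does. A hedged sketch (the class and device name are invented for illustration):

```python
from typing import Optional

from vllm.platforms.interface import Platform


class MyAcceleratorPlatform(Platform):
    device_type: str = "myaccel"

    @classmethod
    def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]:
        # Maps device_type to the kv_buffer_device values NIXL may register.
        return {"myaccel": ("myaccel", )}

    @classmethod
    def get_nixl_memory_type(cls) -> Optional[str]:
        # NIXL memory space used when registering buffers on this device.
        return "VRAM"
```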
self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict) @@ -486,11 +501,15 @@ class NixlConnectorWorker: # used when device memory can not be registered under nixl self.host_xfer_buffers: dict[str, torch.Tensor] = {} self.use_host_buffer = self.kv_buffer_device == "cpu" - if self.kv_buffer_device == "cuda": - self.nixl_memory_type = "VRAM" - elif self.kv_buffer_device == "cpu": - self.nixl_memory_type = "DRAM" - else: + # support for oot platform which can't register nixl memory + # type based on kv_buffer_device + self.nixl_memory_type = current_platform.get_nixl_memory_type() + if self.nixl_memory_type is None: + if self.kv_buffer_device == "cuda": + self.nixl_memory_type = "VRAM" + elif self.kv_buffer_device == "cpu": + self.nixl_memory_type = "DRAM" + if self.nixl_memory_type is None: raise RuntimeError( f"{self.device_type} with {self.kv_buffer_device} kv_buffer " "is not supported.") @@ -766,7 +785,7 @@ class NixlConnectorWorker: descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type) logger.debug("Registering descs: %s", caches_data) - self.nixl_wrapper.register_memory(descs) + self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends) logger.debug("Done registering descs") self._registered_descs.append(descs) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3f13ae72fe4d7..7dd935d2eb31c 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -604,6 +604,21 @@ class Platform: return _synced_weight_loader + @classmethod + def get_nixl_supported_devices(cls) -> dict[str, tuple[str, ...]]: + """ + Returns a mapping from device_type to a tuple of supported + kv_buffer_device for nixl. + """ + return {} + + @classmethod + def get_nixl_memory_type(cls) -> Optional[str]: + """ + Returns the nixl memory type for the current platform. 
+ """ + return None + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED From c98be0a232764fb68353e2c9e26d3495a979044d Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 23 Sep 2025 13:17:10 +0800 Subject: [PATCH 578/584] [Model] Enable DP for ViT in Qwen2-VL (#25445) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/model_executor/models/qwen2_vl.py | 78 +++++++++++++++++++------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b3c42c2572566..88813490c0fbc 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -66,6 +66,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -217,17 +218,20 @@ class Qwen2VisionMLP(nn.Module): act_layer: type[nn.Module] = QuickGELU, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.fc1 = ColumnParallelLinear(in_features, hidden_features, quant_config=quant_config, - prefix=f"{prefix}.fc1") + prefix=f"{prefix}.fc1", + disable_tp=use_data_parallel) self.act = act_layer() self.fc2 = RowParallelLinear(hidden_features, in_features, quant_config=quant_config, - prefix=f"{prefix}.fc2") + prefix=f"{prefix}.fc2", + disable_tp=use_data_parallel) def forward(self, x: torch.Tensor) -> torch.Tensor: x_parallel, _ = self.fc1(x) @@ -293,25 +297,28 @@ class Qwen2VisionAttention(nn.Module): projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() # Per attention head and per partition values. - world_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_size = world_size + self.tp_size = (1 if use_data_parallel else + parallel_state.get_tensor_model_parallel_world_size()) self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( - num_heads, world_size) + num_heads, self.tp_size) self.qkv = ColumnParallelLinear(input_size=embed_dim, output_size=3 * projection_size, quant_config=quant_config, - prefix=f"{prefix}.qkv") + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel) self.proj = RowParallelLinear(input_size=projection_size, output_size=embed_dim, quant_config=quant_config, - prefix=f"{prefix}.proj") + prefix=f"{prefix}.proj", + disable_tp=use_data_parallel) # Detect attention implementation. 
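
The `use_data_parallel` switch threaded through these vision layers changes how attention heads are partitioned: forcing the effective `tp_size` to 1 means each rank keeps every head and parallelism comes from sharding the image batch instead. A small self-contained illustration of that arithmetic (assumed values; `divide` stands in for vLLM's `dist_utils.divide`):

```python
def divide(numerator: int, denominator: int) -> int:
    # Stand-in for dist_utils.divide: exact integer division.
    assert numerator % denominator == 0
    return numerator // denominator


def heads_per_rank(num_heads: int, world_size: int,
                   use_data_parallel: bool) -> int:
    tp_size = 1 if use_data_parallel else world_size
    return divide(num_heads, tp_size)


# 16 heads on 4 GPUs: TP shards heads, DP keeps them all on each rank.
assert heads_per_rank(16, 4, use_data_parallel=False) == 4
assert heads_per_rank(16, 4, use_data_parallel=True) == 16
```
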
self.attn_backend = get_vit_attn_backend( @@ -453,6 +460,7 @@ class Qwen2VisionBlock(nn.Module): norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() if norm_layer is None: @@ -465,12 +473,14 @@ class Qwen2VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = Qwen2VisionMLP(dim, mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) def forward( self, @@ -531,6 +541,7 @@ class Qwen2VisionPatchMerger(nn.Module): spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) @@ -542,13 +553,15 @@ class Qwen2VisionPatchMerger(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp.0"), + prefix=f"{prefix}.mlp.0", + disable_tp=use_data_parallel), nn.GELU(), RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp.2"), + prefix=f"{prefix}.mlp.2", + disable_tp=use_data_parallel), ]) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -600,6 +613,7 @@ class Qwen2VisionTransformer(nn.Module): norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -613,6 +627,9 @@ class Qwen2VisionTransformer(nn.Module): num_heads = vision_config.num_heads mlp_ratio = vision_config.mlp_ratio + self.use_data_parallel = use_data_parallel + self.out_hidden_size = vision_config.hidden_size + self.spatial_merge_size = spatial_merge_size self.num_heads = num_heads self.embed_dim = embed_dim @@ -634,7 +651,8 @@ class Qwen2VisionTransformer(nn.Module): mlp_ratio=mlp_ratio, norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(depth) ]) self.merger = Qwen2VisionPatchMerger( @@ -643,6 +661,7 @@ class Qwen2VisionTransformer(nn.Module): norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype()) @@ -659,8 +678,9 @@ class Qwen2VisionTransformer(nn.Module): def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: pos_ids = [] + max_grid_size = 0 for t, h, w in grid_thw: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) @@ -678,8 +698,8 @@ class Qwen2VisionTransformer(nn.Module): ).permute(0, 2, 1, 3).flatten() pos_ids.append( torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + max_grid_size = max(max_grid_size, h, w) pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -698,7 +718,7 @@ class Qwen2VisionTransformer(nn.Module): def forward( self, x: 
torch.Tensor, - grid_thw: torch.Tensor, + grid_thw: list[list[int]], ) -> torch.Tensor: # patchify x = x.to(device=self.device, dtype=self.dtype) @@ -708,8 +728,9 @@ class Qwen2VisionTransformer(nn.Module): rotary_pos_emb = self.rot_pos_emb(grid_thw) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, 0]).cumsum( + grid_thw_ = torch.tensor(grid_thw) + cu_seqlens = torch.repeat_interleave(grid_thw_[:, 1] * grid_thw_[:, 2], + grid_thw_[:, 0]).cumsum( dim=0, dtype=torch.int32) cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) @@ -1112,6 +1133,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + supports_encoder_tp_data = True + def get_mrope_input_positions( self, input_tokens: list[int], @@ -1239,6 +1262,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config @@ -1249,6 +1273,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) else: self.visual = None @@ -1357,7 +1382,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = image_input["image_embeds"] else: pixel_values = image_input["pixel_values"] - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -1377,7 +1410,14 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_embeds = video_input["video_embeds"] else: pixel_values_videos = video_input["pixel_values_videos"] - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) # Split concatenated embeddings for each video item. 
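
The `cu_seqlens` change above is easiest to see with concrete numbers. The following standalone snippet (an illustration with assumed grid values, not the diff's code) reproduces the computation for a Python-list `grid_thw`: each `(t, h, w)` grid contributes `t` blocks of `h*w` patch tokens, and the running sum marks per-item sequence boundaries for attention.

```python
import torch
import torch.nn.functional as F

grid_thw = [[1, 4, 6], [2, 2, 2]]  # e.g. one image and one short video
grid = torch.tensor(grid_thw)
cu_seqlens = torch.repeat_interleave(grid[:, 1] * grid[:, 2],
                                     grid[:, 0]).cumsum(dim=0,
                                                        dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
print(cu_seqlens)  # tensor([ 0, 24, 28, 32], dtype=torch.int32)
```
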
merge_size = self.visual.spatial_merge_size

From ba8d2165b655793ff24cf074d041e0bddfc5a0cc Mon Sep 17 00:00:00 2001
From: Ming Yang <yming@meta.com>
Date: Tue, 23 Sep 2025 00:56:00 -0700
Subject: [PATCH 579/584] Handle triton kernel import exception (#25319)

Signed-off-by: Ming Yang <minos.future@gmail.com>
---
 vllm/model_executor/layers/fused_moe/config.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 3052bdb4dc1ec..34bfe1c16aac7 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -14,11 +14,16 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.utils import cdiv, has_triton_kernels
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
-if has_triton_kernels():
-    from triton_kernels.matmul_ogs import PrecisionConfig
-
 logger = init_logger(__name__)
 
+if has_triton_kernels():
+    try:
+        from triton_kernels.matmul_ogs import PrecisionConfig
+    except ImportError:
+        logger.error(
+            "Failed to import Triton kernels. Please make sure your triton "
+            "version is compatible.")
+
 
 def _get_config_dtype_str(
     dtype: torch.dtype,

From 9383cd6f10c55875de7f932e1bf7b0c43f01fc67 Mon Sep 17 00:00:00 2001
From: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com>
Date: Tue, 23 Sep 2025 16:07:27 +0800
Subject: [PATCH 580/584] [Frontend] Add a new xml-based tool parser for qwen3-coder (#25028)

Signed-off-by: Zhikaiiii <1658973216@qq.com>
---
 docs/features/tool_calling.md                 |    9 +
 tests/tool_use/test_qwen3coder_tool_parser.py |  112 +-
 .../openai/tool_parsers/__init__.py           |    2 +
 .../tool_parsers/qwen3xml_tool_parser.py      | 1137 +++++++++++++++++
 4 files changed, 1238 insertions(+), 22 deletions(-)
 create mode 100644 vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py

diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 2a48596571d1d..291c313cd57af 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -319,6 +319,15 @@ Supported models:
 
 Flags: `--tool-call-parser glm45`
 
+### Qwen3-Coder Models (`qwen3_xml`)
+
+Supported models:
+
+* `Qwen/Qwen3-Coder-480B-A35B-Instruct`
+* `Qwen/Qwen3-Coder-30B-A3B-Instruct`
+
+Flags: `--tool-call-parser qwen3_xml`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
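
Returning to the `qwen3_xml` parser documented above, here is a hedged end-to-end sketch of how it surfaces to clients. It assumes a server launched as `vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct --enable-auto-tool-choice --tool-call-parser qwen3_xml`; the tool schema, endpoint URL, and prompt are illustrative, and the point is that the model's XML tool calls come back as standard OpenAI-style `tool_calls`.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
        },
    },
}]
resp = client.chat.completions.create(
    model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
    messages=[{"role": "user", "content": "Weather in Dallas?"}],
    tools=tools,
)
# The XML emitted by the model is parsed server-side into JSON arguments.
print(resp.choices[0].message.tool_calls)
```
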
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index f06fb2b9f2f04..57eaf84d36f23 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -13,6 +13,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ToolCall) from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser) +from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import ( + Qwen3XMLToolParser) from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer @@ -29,6 +31,21 @@ def qwen3_tool_parser(qwen3_tokenizer): return Qwen3CoderToolParser(qwen3_tokenizer) +@pytest.fixture +def qwen3_xml_tool_parser(qwen3_tokenizer): + return Qwen3XMLToolParser(qwen3_tokenizer) + + +@pytest.fixture(params=["original", "xml"]) +def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, + request): + """Parameterized fixture that provides both parser types for testing""" + if request.param == "original": + return qwen3_tool_parser + else: + return qwen3_xml_tool_parser + + @pytest.fixture def sample_tools(): return [ @@ -95,7 +112,7 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall], def stream_delta_message_generator( - qwen3_tool_parser: Qwen3CoderToolParser, + qwen3_tool_parser, qwen3_tokenizer: AnyTokenizer, model_output: str, request: Optional[ChatCompletionRequest] = None @@ -144,9 +161,9 @@ def stream_delta_message_generator( read_offset = new_read_offset -def test_extract_tool_calls_no_tools(qwen3_tool_parser): +def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized): model_output = "This is a test response without any tool calls" - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=None) # type: ignore[arg-type] assert not extracted_tool_calls.tools_called assert extracted_tool_calls.tool_calls == [] @@ -294,12 +311,13 @@ circle ], "Let me calculate that area for you."), ], ) -def test_extract_tool_calls(qwen3_tool_parser, sample_tools, model_output, - expected_tool_calls, expected_content): +def test_extract_tool_calls(qwen3_tool_parser_parametrized, sample_tools, + model_output, expected_tool_calls, + expected_content): request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) assert extracted_tool_calls.tools_called @@ -308,7 +326,8 @@ def test_extract_tool_calls(qwen3_tool_parser, sample_tools, model_output, assert extracted_tool_calls.content == expected_content -def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser, sample_tools): +def test_extract_tool_calls_fallback_no_tags(qwen3_tool_parser_parametrized, + sample_tools): """Test fallback parsing when XML tags are missing""" model_output = '''<function=get_current_weather> <parameter=city> @@ -322,7 +341,7 @@ TX request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) assert extracted_tool_calls.tools_called @@ -331,7 +350,7 @@ TX "get_current_weather") -def 
test_extract_tool_calls_type_conversion(qwen3_tool_parser): +def test_extract_tool_calls_type_conversion(qwen3_tool_parser_parametrized): """Test parameter type conversion based on tool schema""" tools = [ ChatCompletionToolsParam(type="function", @@ -381,7 +400,7 @@ hello world </tool_call>''' request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) @@ -536,9 +555,10 @@ circle ], "Let me calculate that area for you."), ], ) -def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, - sample_tools, model_output, - expected_tool_calls, expected_content): +def test_extract_tool_calls_streaming(qwen3_tool_parser_parametrized, + qwen3_tokenizer, sample_tools, + model_output, expected_tool_calls, + expected_content): """Test incremental streaming behavior including typed parameters""" request = ChatCompletionRequest(model=MODEL, messages=[], @@ -548,7 +568,8 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, tool_states = {} # Track state per tool index for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): # role should never be streamed from tool parser assert not delta_message.role @@ -609,7 +630,7 @@ def test_extract_tool_calls_streaming(qwen3_tool_parser, qwen3_tokenizer, def test_extract_tool_calls_missing_closing_parameter_tag( - qwen3_tool_parser, sample_tools): + qwen3_tool_parser_parametrized, sample_tools): """Test handling of missing closing </parameter> tag""" # Using get_current_weather from sample_tools but with malformed XML model_output = '''Let me check the weather for you: @@ -629,7 +650,7 @@ fahrenheit request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) - extracted_tool_calls = qwen3_tool_parser.extract_tool_calls( + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( model_output, request=request) # The parser should handle the malformed XML gracefully @@ -652,7 +673,7 @@ fahrenheit def test_extract_tool_calls_streaming_missing_closing_tag( - qwen3_tool_parser, qwen3_tokenizer, sample_tools): + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools): """Test streaming with missing closing </parameter> tag""" # Using get_current_weather from sample_tools but with malformed XML model_output = '''Let me check the weather for you: @@ -677,7 +698,8 @@ fahrenheit tool_states = {} for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): if delta_message.content: other_content += delta_message.content @@ -727,9 +749,8 @@ fahrenheit assert args["unit"] == "fahrenheit" -def test_extract_tool_calls_streaming_incremental(qwen3_tool_parser, - qwen3_tokenizer, - sample_tools): +def test_extract_tool_calls_streaming_incremental( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools): """Test that streaming is truly incremental""" model_output = '''I'll check the weather.<tool_call> <function=get_current_weather> @@ -748,7 +769,8 @@ TX chunks = [] for delta_message in stream_delta_message_generator( - qwen3_tool_parser, qwen3_tokenizer, model_output, request): + 
qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, + request): chunks.append(delta_message) # Should have multiple chunks @@ -784,3 +806,49 @@ TX parsed_args = json.loads(full_args) assert parsed_args["city"] == "Dallas" assert parsed_args["state"] == "TX" + + +def test_extract_tool_calls_complex_type_with_single_quote( + qwen3_tool_parser_parametrized): + """Test parameter type conversion based on tool schema""" + tools = [ + ChatCompletionToolsParam(type="function", + function={ + "name": "test_types", + "parameters": { + "type": "object", + "properties": { + "int_param": { + "type": "integer" + }, + "float_param": { + "type": "float" + }, + "bool_param": { + "type": "boolean" + }, + "str_param": { + "type": "string" + }, + "obj_param": { + "type": "object" + } + } + } + }) + ] + + model_output = '''<tool_call> +<function=test_types> +<parameter=obj_param> +{'key': 'value'} +</parameter> +</function> +</tool_call>''' + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( + model_output, request=request) + + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["obj_param"] == {"key": "value"} diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 35096b0461361..5e77c406b8d92 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -20,6 +20,7 @@ from .openai_tool_parser import OpenAIToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser from .pythonic_tool_parser import PythonicToolParser from .qwen3coder_tool_parser import Qwen3CoderToolParser +from .qwen3xml_tool_parser import Qwen3XMLToolParser from .seed_oss_tool_parser import SeedOssToolParser from .step3_tool_parser import Step3ToolParser from .xlam_tool_parser import xLAMToolParser @@ -45,6 +46,7 @@ __all__ = [ "HunyuanA13BToolParser", "Glm4MoeModelToolParser", "Qwen3CoderToolParser", + "Qwen3XMLToolParser", "SeedOssToolParser", "Step3ToolParser", "OpenAIToolParser", diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py new file mode 100644 index 0000000000000..4ab67dfea104c --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -0,0 +1,1137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import json +import uuid +from collections.abc import Sequence +from typing import Any, Optional, Union +from xml.parsers.expat import ParserCreate + +import regex as re + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class StreamingXMLToolCallParser: + """ + Simplified streaming XML tool call parser + Supports streaming input, parsing, and output + """ + + def __init__(self): + self.reset_streaming_state() + + # Tool configuration information + self.tools: Union[list[ChatCompletionToolsParam], None] = None + self.tool_call_start_token: str = '<tool_call>' + self.tool_call_end_token: str = 
'</tool_call>' + self.function_start_token: str = '<function=' + self.function_end_token: str = '</function>' + self.parameter_start_token: str = '<parameter=' + self.parameter_end_token: str = '</parameter>' + + def reset_streaming_state(self): + """Reset streaming parsing state""" + + self.deltas = [] + # state for streaming + self.tool_call_index = 0 + self.current_call_id = None + self.last_completed_call_id = None + self.current_function_name = None + self.current_function_open = False + self.parameters = {} + self.current_param_name = None + self.current_param_value = '' + self.current_param_value_converted = '' + self.current_param_is_first = False + self.should_emit_end_newline = False + self.start_quote_emitted = False + + self.streaming_buffer = '' + self.last_processed_pos = 0 + + self.text_content_buffer = '' + + # state for preprocessing and deferred parsing + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + self.defer_current_parameter = False + self.deferred_param_raw_value = "" + + # recreate parser + self.parser = ParserCreate() + self.setup_parser() + + def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage: + """ + Parse single streaming XML chunk and return Delta response + This is the actual streaming interface that receives chunks + one by one and maintains internal state + + Args: + xml_chunk: Single XML chunk string + Returns: + DeltaMessage: Contains delta information generated by this chunk, + returns empty response if no complete elements + """ + # Record delta count before processing + initial_delta_count = len(self.deltas) + + self.streaming_buffer += xml_chunk + + found_elements = self._process_complete_xml_elements() + + if found_elements: + # If complete elements found, check if end events were missed + # some tags may not have been triggered + try: + new_deltas = self.deltas[initial_delta_count:] + # If this chunk contains </function> + # but didn't generate '}', then complete it + if (self.current_call_id is not None + and self.function_end_token in xml_chunk): + + # - Added '}' (non-empty parameter ending) + # - Added '{}' (empty parameter function) + has_function_close = any((td.tool_calls and any( + (tc.function and tc.id == self.current_call_id + and isinstance(tc.function.arguments, str) and + (tc.function.arguments in ('}', '{}'))) + for tc in td.tool_calls)) for td in new_deltas) + if not has_function_close: + # Close potentially unclosed element + if self.current_param_name: + self._end_element('parameter') + if self.current_function_name: + self._end_element('function') + # If this chunk contains </tool_call> + # but didn't generate final empty delta, then complete it + if (self.current_call_id is not None + and self.tool_call_end_token in xml_chunk): + has_toolcall_close = any((td.tool_calls and any( + (tc.type == 'function' and tc.function and tc.function. 
+                         arguments == '' and tc.id == self.current_call_id)
+                        for tc in td.tool_calls)) for td in new_deltas)
+                    if not has_toolcall_close:
+                        # Close potentially unclosed element
+                        if self.current_param_name:
+                            self._end_element('parameter')
+                        if self.current_function_name:
+                            self._end_element('function')
+                        self._end_element('tool_call')
+            except Exception as e:
+                logger.warning("Error with fallback parsing: %s", e)
+            # Merge newly generated deltas into single response
+            result_delta = self._merge_new_deltas_to_single_response(
+                initial_delta_count)
+            return result_delta
+        else:
+            # No complete elements; check for text content not yet emitted
+            if self.text_content_buffer and self.tool_call_index == 0:
+                # Has text content but no tool_call yet, output text content
+                text_delta = DeltaMessage(content=self.text_content_buffer)
+                self._emit_delta(text_delta)
+                # Clear buffer to avoid duplicate output
+                self.text_content_buffer = ''
+                return text_delta
+
+            # If this chunk contains end tags but wasn't triggered by parser,
+            # manually complete end events
+            # Only execute when still on the same call as when entered,
+            # to prevent accidentally closing new calls
+            # in multi <tool_call> scenarios
+            if (self.current_call_id is not None
+                    and (self.function_end_token in xml_chunk
+                         or self.tool_call_end_token in xml_chunk)):
+                # Close potentially unclosed element
+                if self.current_param_name:
+                    self._end_element('parameter')
+                if self.function_end_token in xml_chunk and \
+                    self.current_function_name:
+                    self._end_element('function')
+                if self.tool_call_end_token in xml_chunk:
+                    self._end_element('tool_call')
+                # Return the merged delta result generated by this fallback
+                result_delta = self._merge_new_deltas_to_single_response(
+                    initial_delta_count)
+                return result_delta
+
+        # No complete elements, return empty response
+        return DeltaMessage(content=None)
+
+    def _escape_xml_special_chars(self, text: str) -> str:
+        """
+        Escape XML special characters
+        Args:
+            text: Original text
+        Returns:
+            Escaped text
+        """
+        xml_escapes = {
+            '&': '&amp;',
+            '<': '&lt;',
+            '>': '&gt;',
+            '"': '&quot;',
+            "'": '&#x27;'
+        }
+
+        for char, escape in xml_escapes.items():
+            text = text.replace(char, escape)
+
+        return text
+
+    def _process_complete_xml_elements(self) -> bool:
+        """
+        Process complete XML elements in buffer
+
+        Returns:
+            bool: Whether complete elements were found and processed
+        """
+        found_any = False
+
+        while self.last_processed_pos < len(self.streaming_buffer):
+            # Find next complete xml element
+            element, end_pos = self._find_next_complete_element(
+                self.last_processed_pos)
+            if element is None:
+                # No complete element found, wait for more data
+                break
+
+            # Check if this element should be skipped
+            if self._should_skip_element(element):
+                self.last_processed_pos = end_pos
+                continue
+
+            # Found complete XML element, process it
+            try:
+                preprocessed_element = self._preprocess_xml_chunk(element)
+                # Check if this is the first tool_call start
+                if ((preprocessed_element.strip().startswith('<tool_call>') or
+                     preprocessed_element.strip().startswith('<function name=')
+                     ) and self.tool_call_index
+                        == 0) and self.text_content_buffer:
+                    # First tool_call starts,
+                    # output previously collected text content first
+                    text_delta = DeltaMessage(content=self.text_content_buffer)
+                    self._emit_delta(text_delta)
+                    # Clear buffer for potential subsequent text content
+                    self.text_content_buffer = ''
+
+                # If a new tool_call starts and
+                # there are already completed tool_calls
+                if 
(preprocessed_element.strip().startswith('<tool_call>') + and self.tool_call_index > 0 and self.current_call_id): + # Reset parser state but preserve generated deltas + if self.current_param_name: + self._end_element('parameter') + if self.current_function_open or self.current_function_name: + self._end_element('function') + # Output final tool_call tail delta + final_delta = DeltaMessage( + role=None, + content=None, + reasoning_content=None, + tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments='')) + ]) + self._emit_delta(final_delta) + # Reset XML parser and current call state + self._reset_xml_parser_after_tool_call() + # Parse preprocessed element + self.parser.Parse(preprocessed_element, False) + found_any = True + + except Exception as e: + logger.warning("Error when parsing XML elements: %s", e) + + # Update processed position + self.last_processed_pos = end_pos + + return found_any + + def _should_skip_element(self, element: str) -> bool: + """ + Determine whether an element should be skipped + + Args: + element: Element to evaluate + + Returns: + bool: True means should skip, False means should process + """ + + # If it's a tool_call XML tag, don't skip + if element.startswith( + self.tool_call_start_token) or element.startswith( + self.function_start_token) or element.startswith( + self.parameter_start_token): + return False + + # If currently not parsing tool calls and not blank, + # collect this text instead of skipping + # Only process other XML elements after tool_call appears, + # otherwise treat as plain text + if self.current_call_id is None and element: + # Collect text content to buffer + self.text_content_buffer += element + return True # Still skip, but content has been collected + + # If currently parsing tool calls, + # this might be parameter value, don't skip + if self.current_call_id is not None: + return False + + # Skip blank content + return not element + + def _find_next_complete_element( + self, start_pos: int) -> tuple[Optional[str], int]: + """ + Find next complete XML element from specified position + + Args: + start_pos: Position to start searching + + Returns: + (Complete element string, element end position), + returns (None, start_pos) if no complete element found + """ + buffer = self.streaming_buffer[start_pos:] + + if not buffer: + return None, start_pos + + if buffer.startswith('<'): + # Need to ensure no new < appears, + # find the nearest one between < and > + tag_end = buffer.find('<', 1) + tag_end2 = buffer.find('>', 1) + if tag_end != -1 and tag_end2 != -1: + # Next nearest is < + if tag_end < tag_end2: + return buffer[:tag_end], start_pos + tag_end + # Next nearest is >, means found XML element + else: + return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1 + elif tag_end != -1: + return buffer[:tag_end], start_pos + tag_end + elif tag_end2 != -1: + return buffer[:tag_end2 + 1], start_pos + tag_end2 + 1 + else: + # If currently not parsing tool calls (entering a tool_call), + # check if starts with <tool_call> + if self.current_call_id is None: + # Check if might be start of <tool_call> + if buffer == '<tool_call>'[:len(buffer)]: + # Might be start of <tool_call>, wait for more data + return None, start_pos + else: + # Not start of <tool_call>, treat as text + return buffer, start_pos + len(buffer) + else: + # When parsing tool calls, + # wait for more data to get complete tag + return None, start_pos + else: + # Find text content 
(until next < or buffer end) + next_tag_pos = buffer.find('<') + if next_tag_pos != -1: + # Found text content + text_content = buffer[:next_tag_pos] + return text_content, start_pos + next_tag_pos + else: + # Buffer end is all text, process + # (no longer wait for more data) + remaining = buffer + return remaining, start_pos + len(remaining) + + def _merge_new_deltas_to_single_response( + self, initial_count: int) -> DeltaMessage: + """ + Merge newly generated deltas from this processing + into a single DeltaMessage + + Args: + initial_count: Delta count before processing + + Returns: + Merged DeltaMessage containing all newly generated delta information + """ + if len(self.deltas) <= initial_count: + return DeltaMessage(content=None) + + # Get newly generated deltas + new_deltas = self.deltas[initial_count:] + + if len(new_deltas) == 1: + # Only one new delta, return directly + return new_deltas[0] + + # Merge multiple new deltas + merged_tool_calls: list[DeltaToolCall] = [] + merged_content: str = '' + + for delta in new_deltas: + if delta.content: + merged_content += delta.content + if delta.tool_calls: + # For tool_calls, we need to intelligently merge arguments + for tool_call in delta.tool_calls: + # Find if there's already a tool_call with the same call_id + existing_call = None + for existing in merged_tool_calls: + if existing.id == tool_call.id: + existing_call = existing + break + + if existing_call and existing_call.function: + # Merge to existing tool_call + if tool_call.function and tool_call.function.name: + existing_call.function.name = \ + tool_call.function.name + if tool_call.function \ + and tool_call.function.arguments is not None: + if existing_call.function.arguments is None: + existing_call.function.arguments = '' + + # For streaming JSON parameters, + # simply concatenate in order + new_args = tool_call.function.arguments + existing_call.function.arguments += new_args + if tool_call.type: + existing_call.type = tool_call.type + else: + # Add new tool_call + merged_tool_calls.append(tool_call) + + return DeltaMessage(content=merged_content if merged_content else None, + tool_calls=merged_tool_calls) + + def _preprocess_xml_chunk(self, chunk: str) -> str: + """ + Preprocess XML chunk, handle non-standard formats, + and escape special characters + + Args: + chunk: Original XML chunk + + Returns: + Processed XML chunk + """ + + # Check if this is a tool_call related element + is_tool_call = False + if chunk.startswith(self.tool_call_start_token) or chunk.startswith( + self.tool_call_end_token): + is_tool_call = True + if chunk.startswith(self.function_start_token) or chunk.startswith( + self.function_end_token): + is_tool_call = True + if chunk.startswith(self.parameter_start_token) or chunk.startswith( + self.parameter_end_token): + is_tool_call = True + # Handle <function=name> format -> <function name="name"> + processed = re.sub(r'<function=([^>]+)>', r'<function name="\1">', + chunk) + # Handle <parameter=name> format -> <parameter name="name"> + processed = re.sub(r'<parameter=([^>]+)>', r'<parameter name="\1">', + processed) + + original_chunk = chunk + # If in parameter value accumulation mode + if self._pre_inside_parameter: + # Parameter end: output accumulated raw text + # safely then return </parameter> + if processed.startswith('</parameter>'): + body_text = self._pre_param_buffer + # Trigger deferred parsing mode + # literal_eval+json output in end_element + self.defer_current_parameter = True + self.deferred_param_raw_value = body_text + # Clean up 
state + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + safe_text = self._escape_xml_special_chars(body_text) + return f"{safe_text}</parameter>" + else: + # If this is the first block of content after entering parameter + # evaluate if deferred parsing is needed; + # If not needed, exit accumulation mode + # and pass through directly + if self._pre_param_buffer == "": + # Get current parameter type + param_type = self._get_param_type( + self._pre_current_param_name + ) if self._pre_current_param_name else 'string' + # Only these types need deferred parsing to + # handle Python literals containing single quotes + is_object_type = param_type in ["object"] + is_complex_type = (param_type + in ["array", "arr", "sequence"] + or param_type.startswith("dict") + or param_type.startswith("list")) + + # Only delay when contains container symbols + # and has single quotes and is complex type + has_container_hint = ('[' in original_chunk) or ( + '{' in original_chunk) or ('(' in original_chunk) + + # Determine if deferred parsing is needed + need_defer = False + if is_complex_type: + # Complex type, always need deferred parsing + need_defer = True + elif is_object_type and has_container_hint and ( + "'" in original_chunk): + # Object type with container symbols + # and single quotes, need deferred parsing + need_defer = True + + if not need_defer: + # No need for deferred parsing, + # exit parameter mode directly + self._pre_inside_parameter = False + return self._escape_xml_special_chars(original_chunk) + self._pre_param_buffer += original_chunk + return "" + + # Parameter start: enable accumulation + if processed.startswith('<parameter name='): + m = re.match(r'<parameter name="([^"]+)">', processed) + if m: + self._pre_current_param_name = m.group(1) + self._pre_inside_parameter = True + self._pre_param_buffer = "" + return processed + + # If processed doesn't contain special_token, escape processed + # This is because XML parsing encounters special characters + # and reports errors, so escaping is needed + if not is_tool_call: + processed = self._escape_xml_special_chars(processed) + return processed + + def _emit_delta(self, delta: DeltaMessage): + """Emit Delta response (streaming output)""" + self.deltas.append(delta) + + def _auto_close_open_parameter_if_needed(self, + incoming_tag: Optional[str] = None + ): + """Before starting to process new elements, + if there are unclosed tags from before, + automatically complete their endings to the parser. + - If there are unclosed parameters, + it's equivalent to feeding `</parameter>` + - When about to start a new function or tool_call, + if there are unclosed functions, complete `</function>`. + - When about to start a new tool_call, + if there are unclosed tool_calls, complete `</tool_call>`. 
+ """ + # First close unclosed parameters + if self.current_param_name: + self._end_element('parameter') + + # If about to start new function or tool_call, + # and there are unclosed functions, close function first + if incoming_tag in ('function', + 'tool_call') and self.current_function_name: + self._end_element('function') + + # If about to start new tool_call, + # and there are unclosed tool_calls, close tool_call first + if incoming_tag == 'tool_call' and self.current_call_id: + self._end_element('tool_call') + + def _start_element(self, name: str, attrs: dict[str, str]): + """Handle XML start element events""" + + if name == 'root': + return + + if name == 'tool_call': + # Before opening new tool_call, + # automatically complete previous unclosed tags + self._auto_close_open_parameter_if_needed('tool_call') + + self.parameters = {} + self.current_call_id = self._get_next_call_id() + self.current_param_is_first = True + self.tool_call_index += 1 + elif name.startswith('function') or (name == 'function'): + # If missing tool_call, manually complete + if not self.current_call_id: + self._start_element('tool_call', {}) + # Before opening new function, + # automatically complete previous unclosed tags (parameter/function) + self._auto_close_open_parameter_if_needed('function') + function_name = self._extract_function_name(name, attrs) + self.current_function_name = function_name + self.current_function_open = True + if function_name: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=function_name, arguments='')) + ]) + self._emit_delta(delta) + elif name.startswith('parameter') or (name == 'parameter'): + # If previous parameter hasn't ended normally, + # complete its end first, then start new parameter + self._auto_close_open_parameter_if_needed('parameter') + param_name = self._extract_parameter_name(name, attrs) + self.current_param_name = param_name + self.current_param_value = '' + self.current_param_value_converted = '' + self.start_quote_emitted = False # Reset start quote flag + + # Only output parameter name and colon, + # don't output quotes + # decide after parameter value type is determined + if param_name: + if not self.parameters: + # First parameter + # start JSON, only output parameter name and colon + json_start = f'{{"{param_name}": ' + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=json_start)) + ]) + self._emit_delta(delta) + self.current_param_is_first = True + else: + # Subsequent parameters + # add comma and parameter name, no quotes + json_continue = f', "{param_name}": ' + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=json_continue)) + ]) + self._emit_delta(delta) + self.current_param_is_first = False + + def _char_data(self, data: str): + """Handle XML character data events""" + if data and self.current_param_name: + # If preprocessing stage determines deferred parsing is needed, + # only cache character data, no streaming output + if self.defer_current_parameter: + original_data = data + if self.should_emit_end_newline: + original_data = '\n' + original_data + self.should_emit_end_newline = False + if original_data.endswith('\n'): + self.should_emit_end_newline = True + original_data 
= original_data[:-1] + self.current_param_value += original_data + return + + param_type = self._get_param_type(self.current_param_name) + + # Check if this is the first time receiving data for this parameter + # If this is the first packet of data and starts with \n, remove \n + if not self.current_param_value and data.startswith('\n'): + data = data[1:] + + # Output start quote for string type (if not already output) + if (param_type + in ['string', 'str', 'text', 'varchar', 'char', 'enum'] + and not self.start_quote_emitted): + quote_delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='"')) + ]) + self._emit_delta(quote_delta) + self.start_quote_emitted = True + + if not data: + return + + original_data = data + # Delay output of trailing newline + if self.should_emit_end_newline: + original_data = '\n' + original_data + self.should_emit_end_newline = False + if original_data.endswith('\n'): + self.should_emit_end_newline = True + original_data = original_data[:-1] + self.current_param_value += original_data + + # convert parameter value by param_type + converted_value = self._convert_param_value( + self.current_param_value, param_type) + output_data = self._convert_for_json_streaming( + converted_value, param_type) + + delta_data = output_data[len(self.current_param_value_converted):] + self.current_param_value_converted = output_data + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments=delta_data)) + ]) + self._emit_delta(delta) + + def _end_element(self, name: str): + """Handle XML end element events""" + + if name == 'root': + return + + # If function or tool_call ends and there are still unclosed parameters, + # complete parameter end first + if (name.startswith('function') or name == 'function' + or name == 'tool_call') and self.current_param_name: + self._auto_close_open_parameter_if_needed() + + if (name.startswith('parameter') + or name == 'parameter') and self.current_param_name: + # End current parameter + param_name = self.current_param_name + param_value = self.current_param_value + + # If in deferred parsing mode, + # perform overall parsing on raw content + # accumulated in preprocessing stage and output once + if self.defer_current_parameter: + raw_text = self.deferred_param_raw_value \ + if self.deferred_param_raw_value else param_value + parsed_value = None + output_arguments = None + try: + # If previously delayed trailing newline, + # add it back before parsing + if self.should_emit_end_newline: + raw_for_parse = raw_text + '\n' + else: + raw_for_parse = raw_text + parsed_value = ast.literal_eval(raw_for_parse) + output_arguments = json.dumps(parsed_value, + ensure_ascii=False) + except Exception: + # Fallback: output as string as-is + output_arguments = json.dumps(raw_text, ensure_ascii=False) + parsed_value = raw_text + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments=output_arguments)) + ]) + self._emit_delta(delta) + + # Clean up and store + self.should_emit_end_newline = False + self.parameters[param_name] = parsed_value + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False + 
self.defer_current_parameter = False + self.deferred_param_raw_value = "" + return + + param_type = self._get_param_type(param_name) + + # convert complete parameter value by param_type + converted_value = self._convert_param_value( + param_value, param_type) + + # Decide whether to add end quote based on parameter type + if param_type in [ + 'string', 'str', 'text', 'varchar', 'char', 'enum' + ]: + # For empty string parameters, need special handling + if not param_value and not self.start_quote_emitted: + # No start quote output, + # directly output complete empty string + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments='""')) + ]) + self._emit_delta(delta) + else: + # Non-empty parameter value, output end quote + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall( + name=None, arguments='"')) + ]) + self._emit_delta(delta) + + self.should_emit_end_newline = False + # Store converted value + self.parameters[param_name] = converted_value + self.current_param_name = None + self.current_param_value = '' + self.current_param_value_converted = '' + self.start_quote_emitted = False + + elif name.startswith('function') or name == 'function': + # if there are parameters, close JSON object + if self.parameters: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='}')) + ]) + self._emit_delta(delta) + # return empty object + else: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='{}')) + ]) + self._emit_delta(delta) + self.current_function_open = False + + elif name == 'tool_call': + # Before ending tool_call, + # ensure function is closed to complete missing right brace + if self.current_function_open: + # If there are still unclosed parameters, close them first + if self.current_param_name: + self._end_element('parameter') + # Close function, ensure output '}' or '{}' + self._end_element('function') + # Final Delta + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.tool_call_index - 1, + id=self.current_call_id, + type='function', + function=DeltaFunctionCall(name=None, + arguments='')) + ]) + self._emit_delta(delta) + + # Check if there's text content to output (between tool_calls) + if self.text_content_buffer.strip(): + text_delta = DeltaMessage(content=self.text_content_buffer) + self._emit_delta(text_delta) + + self._reset_xml_parser_after_tool_call() + + def setup_parser(self): + """Set up XML parser event handlers""" + self.parser.buffer_text = True + self.parser.StartElementHandler = self._start_element + self.parser.EndElementHandler = self._end_element + self.parser.CharacterDataHandler = self._char_data + + def set_tools(self, tools: Union[list[ChatCompletionToolsParam], None]): + """Set tool configuration information""" + self.tools = tools + + def _get_next_call_id(self): + """Generate unique call ID""" + return f'call_{uuid.uuid4().hex[:24]}' + + def _extract_function_name(self, name: str, + attrs: dict[str, str]) -> Optional[str]: + """Extract function name from various formats""" + if attrs and 'name' in attrs: + return attrs['name'] + + if '=' in name: + parts = 
name.split('=', 1)
+            if len(parts) == 2 and parts[0] == 'function':
+                return parts[1]
+
+        return None
+
+    def _extract_parameter_name(self, name: str,
+                                attrs: dict[str, str]) -> Optional[str]:
+        """Extract parameter name from various formats"""
+        if attrs and 'name' in attrs:
+            return attrs['name']
+
+        if '=' in name:
+            parts = name.split('=', 1)
+            if len(parts) == 2 and parts[0] == 'parameter':
+                return parts[1]
+
+        return None
+
+    def _get_param_type(self, param_name: str) -> str:
+        """Get parameter type based on tool configuration, defaults to string
+        Args:
+            param_name: Parameter name
+
+        Returns:
+            Parameter type
+        """
+        if not self.tools or not self.current_function_name:
+            return 'string'
+
+        for tool in self.tools:
+            if not hasattr(tool, 'type') or not (hasattr(
+                    tool, 'function') and hasattr(tool.function, 'name')):
+                continue
+            if tool.type == 'function' and \
+                tool.function.name == self.current_function_name:
+                if not hasattr(tool.function, 'parameters'):
+                    return 'string'
+                params = tool.function.parameters
+                if isinstance(params, dict) and 'properties' in params:
+                    properties = params['properties']
+                    if param_name in properties and isinstance(
+                            properties[param_name], dict):
+                        return self.repair_param_type(
+                            str(properties[param_name].get('type', 'string')))
+                elif isinstance(params, dict) and param_name in params:
+                    param_config = params[param_name]
+                    if isinstance(param_config, dict):
+                        return self.repair_param_type(
+                            str(param_config.get('type', 'string')))
+                break
+        return 'string'
+
+    def repair_param_type(self, param_type: str) -> str:
+        """Repair unknown parameter types by treating them as string
+        Args:
+            param_type: Parameter type
+
+        Returns:
+            Repaired parameter type
+        """
+        if param_type in [
+                'string', 'str', 'text', 'varchar', 'char', 'enum'
+        ] or param_type.startswith('int') or param_type.startswith(
+                'uint'
+        ) or param_type.startswith('long') or param_type.startswith(
+                'short'
+        ) or param_type.startswith('unsigned') or param_type.startswith(
+                'num') or param_type.startswith('float') or param_type in [
+                    'boolean', 'bool', 'binary'
+                ] or (param_type in ["object", "array", "arr", "sequence"]
+                      or param_type.startswith("dict")
+                      or param_type.startswith("list")):
+            return param_type
+        else:
+            return 'string'
+
+    def _convert_param_value(self, param_value: str, param_type: str) -> Any:
+        """Convert value based on parameter type
+        Args:
+            param_value: Parameter value
+            param_type: Parameter type
+
+        Returns:
+            Converted value
+        """
+        if param_value.lower() == 'null':
+            return None
+
+        param_type = param_type.strip().lower()
+        if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']:
+            return param_value
+        elif (param_type.startswith('int') or param_type.startswith('uint')
+              or param_type.startswith('long')
+              or param_type.startswith('short')
+              or param_type.startswith('unsigned')):
+            try:
+                return int(param_value)
+            except (ValueError, TypeError):
+                logger.warning(
+                    "Parsed value '%s' is not an integer, "
+                    "degenerating to string.", param_value)
+                return param_value
+        elif param_type.startswith('num') or param_type.startswith('float'):
+            try:
+                float_param_value: float = float(param_value)
+                return float_param_value if float_param_value - int(
+                    float_param_value) != 0 else int(float_param_value)
+            except (ValueError, TypeError):
+                logger.warning(
+                    "Parsed value '%s' is not a float, "
+                    "degenerating to string.", param_value)
+                return param_value
+        elif param_type in 
['boolean', 'bool', 'binary']:
+            param_value = param_value.lower()
+            return param_value == 'true'
+        else:
+            return param_value
+
+    def _convert_for_json_streaming(self, converted_value: Any,
+                                    param_type: str) -> str:
+        """Convert converted_value based on
+        whether it's empty and if type is string
+        Args:
+            converted_value: Converted value
+            param_type: Parameter type
+
+        Returns:
+            Converted string for streaming output
+        """
+        # Check if value is empty, but exclude numeric 0
+        if converted_value is None or converted_value == '':
+            return ''
+
+        if param_type in ['string', 'str', 'text', 'varchar', 'char', 'enum']:
+            # String type, remove double quotes
+            return json.dumps(converted_value, ensure_ascii=False)[1:-1]
+        else:
+            # Non-string type, return complete JSON string
+            if not isinstance(converted_value, str):
+                return json.dumps(converted_value, ensure_ascii=False)
+            else:
+                return converted_value
+
+    def _reset_xml_parser_after_tool_call(self):
+        """
+        Each tool_call is treated as a separate XML document,
+        so we need to reset the parser after each tool_call.
+        """
+
+        # recreate XML parser
+        self.parser = ParserCreate()
+        self.setup_parser()
+
+        # Reset current tool_call state
+        if self.current_call_id:
+            self.last_completed_call_id = self.current_call_id
+        self.current_call_id = None
+        self.current_function_name = None
+        self.current_function_open = False
+        self.parameters = {}
+        self.current_param_name = None
+        self.current_param_value = ''
+        self.current_param_value_converted = ''
+        self.current_param_is_first = False
+        self.should_emit_end_newline = False
+        self.start_quote_emitted = False
+        self.text_content_buffer = ''
+
+        # Reset preprocessing and deferred parsing state
+        self._pre_inside_parameter = False
+        self._pre_param_buffer = ""
+        self._pre_current_param_name = None
+        self.defer_current_parameter = False
+        self.deferred_param_raw_value = ""
+
+
+@ToolParserManager.register_module("qwen3_xml")
+class Qwen3XMLToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.parser = StreamingXMLToolCallParser()
+
+        logger.info("Successfully imported tool parser %s!",
+                    self.__class__.__name__)
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        self.parser.reset_streaming_state()
+        if request:
+            self.parser.set_tools(request.tools)
+        result = self.parser.parse_single_streaming_chunks(model_output)
+        if not result.tool_calls:
+            return ExtractedToolCallInformation(
+                tool_calls=[],
+                tools_called=False,
+                content=result.content,
+            )
+        else:
+            tool_calls = []
+            for tool_call in result.tool_calls:
+                if tool_call.function and tool_call.function.name:
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_call.id,
+                            type=tool_call.type,
+                            function=FunctionCall(
+                                name=tool_call.function.name,
+                                arguments=tool_call.function.arguments,
+                            ),
+                        ))
+            return ExtractedToolCallInformation(
+                tool_calls=tool_calls,
+                tools_called=len(tool_calls) > 0,
+                content=result.content,
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        if not previous_text:
+            self.parser.reset_streaming_state()
+            if request:
+                self.parser.set_tools(request.tools)
+
+        # The model sometimes splits its output so that delta_text is empty.
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        if not previous_text:
+            self.parser.reset_streaming_state()
+            if request:
+                self.parser.set_tools(request.tools)
+
+        # The model sometimes emits tokens that decode to an empty string,
+        # so delta_text can be empty. If tool calls were emitted earlier and
+        # all of them have now closed, return an empty tool_call delta so
+        # the outer streaming layer still emits the tool_call field.
+        if not delta_text and delta_token_ids:
+            open_calls = current_text.count(
+                self.parser.tool_call_start_token) - current_text.count(
+                    self.parser.tool_call_end_token)
+            if open_calls == 0 and self.parser.tool_call_index > 0:
+                # If current_call_id is None, use last_completed_call_id
+                call_id = self.parser.current_call_id or \
+                    self.parser.last_completed_call_id
+                return DeltaMessage(tool_calls=[
+                    DeltaToolCall(
+                        index=self.parser.tool_call_index - 1,
+                        id=call_id,
+                        function=DeltaFunctionCall(arguments=''),
+                        type='function',
+                    )
+                ])
+
+        return self.parser.parse_single_streaming_chunks(delta_text)

From babad6e5dde380b453be716c7ebc2b6928658c18 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 23 Sep 2025 17:20:52 +0800
Subject: [PATCH 581/584] [Misc] Move DP for ViT code inside model executor dir (#25459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/models/test_vision.py | 424 ++++++++++++++++-
 tests/multimodal/test_utils.py | 426 +-----------------
 vllm/model_executor/models/glm4_1v.py | 3 +-
 .../models/idefics2_vision_model.py | 3 +-
 vllm/model_executor/models/intern_vit.py | 3 +-
 vllm/model_executor/models/kimi_vl.py | 2 +-
 vllm/model_executor/models/mllama4.py | 2 +-
 vllm/model_executor/models/qwen2_5_vl.py | 3 +-
 vllm/model_executor/models/qwen2_vl.py | 3 +-
 vllm/model_executor/models/qwen3_vl.py | 6 +-
 vllm/model_executor/models/step3_vl.py | 2 +-
 vllm/model_executor/models/vision.py | 281 +++++-
 vllm/multimodal/utils.py | 293 +-----------
 13 files changed, 721 insertions(+), 730 deletions(-)

diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 310d3a3719b65..8744bcbd3a2a6 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -1,10 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+
 import pytest
 import torch
+import torch.multiprocessing as mp

-from vllm.model_executor.models.vision import resolve_visual_encoder_outputs
+from tests.utils import multi_gpu_test
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.model_executor.models.vision import (
+    get_load_balance_assignment, resolve_visual_encoder_outputs,
+    run_dp_sharded_mrope_vision_model, run_dp_sharded_vision_model)
+from vllm.platforms import current_platform
+from vllm.utils import get_open_port, update_environment_variables


@@ -33,3 +43,415 @@ def test_resolve_visual_encoder_outputs(feature_sample_layers,
         post_layer_norm=None,
         max_possible_layers=max_possible_layers)
     assert torch.equal(torch.tensor(expected_features), output_tensor)
+
+
+class SimpleLinearModel(torch.nn.Module):
+    """A simple linear vision model for testing."""
+
+    def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32):
+        super().__init__()
+        self.flatten = torch.nn.Flatten()
+        self.linear = torch.nn.Linear(input_dim, output_dim)
+
+    def forward(self, x: torch.Tensor):
+        # Flatten the input and apply linear transformation
+        x = self.flatten(x)
+        return self.linear(x)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "batch_size",
+    [
+        1,  # Single image
+        4,  # Small batch
+        5,  # Odd batch size (for testing padding)
], +) +def test_run_dp_sharded_vision_model(batch_size: int): + world_size = 2 + # Launch processes + mp.spawn( + run_dp_sharded_vision_model_vs_direct, + args=( + world_size, + batch_size, + get_open_port(), + ), + nprocs=world_size, + ) + + +def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int, + batch_size: int, master_port: int): + """ + Test that run_dp_sharded_vision_model produces the same results as + calling the model directly. + """ + + # Set random seed for reproducibility + current_platform.seed_everything(0) + + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create a test input tensor + image_input = torch.randn(batch_size, 3, 224, 224) + + # Create a simple linear model + vision_model = SimpleLinearModel() + + # Run the model directly on the full input + with torch.inference_mode(): + direct_output = vision_model(image_input) + + # Run the model through the sharded function + with torch.inference_mode(): + sharded_output = run_dp_sharded_vision_model(image_input, vision_model) + + # Check that the world size is set up correctly + assert get_tensor_model_parallel_world_size() == world_size + + # Check that the outputs have the same shape + assert direct_output.shape == sharded_output.shape + + # Check that the outputs are close (they should be identical) + assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5) + + +@pytest.mark.parametrize( + "sizes,num_gpus,expected_shuffle_indices,expected_gpu_sample_counts," + "expected_grouped_sizes_per_gpu,test_description", + [ + # Empty input + ([], 2, [], [0, 0], [0, 0], "empty input"), + + # Fewer samples than GPUs + ([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0 + ], "fewer samples than GPUs"), + + # Single GPU + ([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"), + + # Balanced assignment + ([100, 100, 100, 100 + ], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"), + + # Unbalanced sizes - this one is trickier since the algorithm is greedy + ([1000, 100, 200, 50], 2, [0, 2, 1, 3 + ], [1, 3], [1000, 350], "unbalanced sizes"), + ], +) +def test_get_load_balance_assignment_cases(sizes, num_gpus, + expected_shuffle_indices, + expected_gpu_sample_counts, + expected_grouped_sizes_per_gpu, + test_description): + """Test get_load_balance_assignment with various input cases.""" + result = get_load_balance_assignment(sizes, num_gpus=num_gpus) + (shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result + + # Common assertions for all cases + assert len(shuffle_indices) == len(sizes) + assert len(gpu_sample_counts) == num_gpus + assert len(grouped_sizes_per_gpu) == num_gpus + assert sum(gpu_sample_counts) == len(sizes) + + assert shuffle_indices == expected_shuffle_indices + + assert gpu_sample_counts == expected_gpu_sample_counts + assert grouped_sizes_per_gpu == expected_grouped_sizes_per_gpu + + +class SimpleMRopeVisionModel(torch.nn.Module): + """A simple vision model for testing mrope functionality.""" + + def __init__(self, spatial_merge_size: int = 2, out_hidden_size: int = 64): + super().__init__() + self.spatial_merge_size = spatial_merge_size + 
self.out_hidden_size = out_hidden_size + self.linear = torch.nn.Linear(768, out_hidden_size) + + def forward(self, pixel_values: torch.Tensor, + grid_thw_list: list[list[int]]): + """Simple forward pass that simulates spatial merging.""" + # Apply linear transformation + embeddings = self.linear(pixel_values) + + # Simulate spatial merging by reducing the number of patches + merge_factor = self.spatial_merge_size * self.spatial_merge_size + + # Group patches and merge spatially + merged_embeddings = [] + start_idx = 0 + + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + end_idx = start_idx + num_patches + + # Get patches for this image + image_patches = embeddings[start_idx:end_idx] + + # Simulate spatial merging by averaging groups of patches + merged_patches = num_patches // merge_factor + if merged_patches > 0: + # Reshape and average to simulate merging + reshaped = image_patches[:merged_patches * merge_factor].view( + merged_patches, merge_factor, -1) + merged = reshaped.mean(dim=1) + merged_embeddings.append(merged) + + start_idx = end_idx + + if merged_embeddings: + return torch.cat(merged_embeddings, dim=0) + else: + return torch.empty((0, self.out_hidden_size), + device=pixel_values.device, + dtype=pixel_values.dtype) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "batch_size", + [ + 1, # Single image + 3, # Small batch + 5, # Odd batch size (for testing padding) + ], +) +def test_run_dp_sharded_mrope_vision_model(batch_size: int): + world_size = 2 + # Launch processes + mp.spawn( + run_dp_sharded_mrope_vision_model_vs_direct, + args=( + world_size, + batch_size, + get_open_port(), + ), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, + world_size: int, + batch_size: int, + master_port: int): + """ + Test that run_dp_sharded_mrope_vision_model produces the same results as + calling the model directly. 
+ """ + # Set random seed for reproducibility + current_platform.seed_everything(0) + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create test data + grid_thw_list = [] + pixel_values_list = [] + + for i in range(batch_size): + # Varying image sizes for better testing + t, h, w = 1, 4 + i, 4 + i + grid_thw_list.append([t, h, w]) + + num_patches = t * h * w + # Create random pixel values for this image + image_pixels = torch.randn(num_patches, 768) + pixel_values_list.append(image_pixels) + + # Concatenate all pixel values + pixel_values = torch.cat(pixel_values_list, dim=0) + + # Create a simple mrope vision model + vision_model = SimpleMRopeVisionModel() + + # Run the model directly on the full input (only on rank 0) + if local_rank == 0: + with torch.inference_mode(): + direct_output = vision_model(pixel_values, grid_thw_list) + + # Run the model through the sharded function + with torch.inference_mode(): + sharded_output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + sharded_output = torch.cat(sharded_output, dim=0) + + # Check that the world size is set up correctly + assert get_tensor_model_parallel_world_size() == world_size + + # Compare outputs (only on rank 0) + if local_rank == 0: + # Check that the outputs have the same shape + assert direct_output.shape == sharded_output.shape + # Check that the outputs are close (they should be identical) + assert torch.allclose(direct_output, + sharded_output, + rtol=1e-5, + atol=1e-5) + + +@multi_gpu_test(num_gpus=2) +def test_run_dp_sharded_mrope_vision_model_empty_input(): + world_size = 2 + mp.spawn( + run_dp_sharded_mrope_vision_model_empty_input_worker, + args=(world_size, get_open_port()), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_empty_input_worker( + local_rank: int, world_size: int, master_port: int): + """Test run_dp_sharded_mrope_vision_model with empty input.""" + # Set up distributed environment + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create empty inputs + pixel_values = torch.empty((0, 768)) + grid_thw_list: list[list[int]] = [] + + vision_model = SimpleMRopeVisionModel() + + # Should handle empty input gracefully + with torch.inference_mode(): + output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + + assert len(output) == 0 + + +@multi_gpu_test(num_gpus=4) +def test_run_dp_sharded_mrope_vision_model_uneven_load(): + world_size = 4 + mp.spawn( + run_dp_sharded_mrope_vision_model_uneven_load_worker, + args=(world_size, get_open_port()), + nprocs=world_size, + ) + + +def run_dp_sharded_mrope_vision_model_uneven_load_worker( + local_rank: int, world_size: int, master_port: int): + 
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" + # Set up distributed environment + current_platform.seed_everything(123) + device = f"{current_platform.device_name}:{local_rank}" + current_platform.set_device(device) + torch.set_default_device(device) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': str(master_port), + }) + + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Create images with very different sizes + grid_thw_list = [ + [1, 2, 2], # Small: 4 patches + [1, 8, 8], # Large: 64 patches + [1, 3, 3], # Medium: 9 patches + ] + + pixel_values_list = [] + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + image_pixels = torch.randn(num_patches, 768) + pixel_values_list.append(image_pixels) + + pixel_values = torch.cat(pixel_values_list, dim=0) + vision_model = SimpleMRopeVisionModel() + + # Should handle uneven distribution without errors + with torch.inference_mode(): + output_tuple = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + + # Verify output shape is reasonable + merge_factor = vision_model.spatial_merge_size**2 + expected_output_patches = list( + math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list) + + for i, output in enumerate(output_tuple): + assert output.shape[0] == expected_output_patches[i] + assert output.shape[1] == vision_model.out_hidden_size + + +@pytest.mark.parametrize("spatial_merge_size", [2, 4]) +def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int): + """Test SimpleMRopeVisionModel with different spatial merge sizes.""" + device = current_platform.device_type + + grid_thw_list = [[1, 4, 4], [1, 6, 6]] # Two images + pixel_values_list = [] + + for grid_thw in grid_thw_list: + num_patches = math.prod(grid_thw) + image_pixels = torch.randn(num_patches, 768, device=device) + pixel_values_list.append(image_pixels) + + pixel_values = torch.cat(pixel_values_list, dim=0) + vision_model = SimpleMRopeVisionModel( + spatial_merge_size=spatial_merge_size).to(device) + + with torch.inference_mode(): + output = vision_model(pixel_values, grid_thw_list) + + # Verify output dimensions based on spatial merging + total_patches = sum(math.prod(grid_thw) for grid_thw in grid_thw_list) + merge_factor = spatial_merge_size**2 + expected_output_patches = total_patches // merge_factor + + assert output.shape[0] == expected_output_patches + assert output.shape[1] == vision_model.out_hidden_size diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index e1e8282dd66d4..f36d94ca01551 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -import math import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory @@ -10,22 +9,11 @@ from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest -import torch -import torch.multiprocessing as mp from PIL import Image, ImageChops -from tests.utils import multi_gpu_test -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange 
-from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, - get_load_balance_assignment, - run_dp_sharded_mrope_vision_model, - run_dp_sharded_vision_model) -from vllm.platforms import current_platform -from vllm.utils import get_open_port, update_environment_variables +from vllm.multimodal.utils import MediaConnector, argsort_mm_positions if TYPE_CHECKING: from vllm.multimodal.inputs import MultiModalPlaceholderDict @@ -404,415 +392,3 @@ def test_argsort_mm_positions(): modality_idxs = argsort_mm_positions(mm_positions) assert modality_idxs == expected_modality_idxs - - -class SimpleLinearModel(torch.nn.Module): - """A simple linear vision model for testing.""" - - def __init__(self, input_dim: int = 3 * 224 * 224, output_dim: int = 32): - super().__init__() - self.flatten = torch.nn.Flatten() - self.linear = torch.nn.Linear(input_dim, output_dim) - - def forward(self, x: torch.Tensor): - # Flatten the input and apply linear transformation - x = self.flatten(x) - return self.linear(x) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "batch_size", - [ - 1, # Single image - 4, # Small batch - 5, # Odd batch size (for testing padding) - ], -) -def test_run_dp_sharded_vision_model(batch_size: int): - world_size = 2 - # Launch processes - mp.spawn( - run_dp_sharded_vision_model_vs_direct, - args=( - world_size, - batch_size, - get_open_port(), - ), - nprocs=world_size, - ) - - -def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int, - batch_size: int, master_port: int): - """ - Test that run_dp_sharded_vision_model produces the same results as - calling the model directly. - """ - - # Set random seed for reproducibility - current_platform.seed_everything(0) - - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - # initialize distributed - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create a test input tensor - image_input = torch.randn(batch_size, 3, 224, 224) - - # Create a simple linear model - vision_model = SimpleLinearModel() - - # Run the model directly on the full input - with torch.inference_mode(): - direct_output = vision_model(image_input) - - # Run the model through the sharded function - with torch.inference_mode(): - sharded_output = run_dp_sharded_vision_model(image_input, vision_model) - - # Check that the world size is set up correctly - assert get_tensor_model_parallel_world_size() == world_size - - # Check that the outputs have the same shape - assert direct_output.shape == sharded_output.shape - - # Check that the outputs are close (they should be identical) - assert torch.allclose(direct_output, sharded_output, rtol=1e-5, atol=1e-5) - - -@pytest.mark.parametrize( - "sizes,num_gpus,expected_shuffle_indices,expected_gpu_sample_counts," - "expected_grouped_sizes_per_gpu,test_description", - [ - # Empty input - ([], 2, [], [0, 0], [0, 0], "empty input"), - - # Fewer samples than GPUs - ([100, 200], 4, [1, 0], [1, 1, 0, 0], [200, 100, 0, 0 - ], "fewer samples than GPUs"), - - # Single GPU - ([100, 200, 300], 1, [2, 1, 0], [3], [600], "single GPU"), - - # Balanced assignment - ([100, 100, 100, 100 - ], 2, [0, 2, 1, 3], [2, 2], [200, 200], "balanced assignment"), - - # Unbalanced sizes - this one is 
trickier since the algorithm is greedy - ([1000, 100, 200, 50], 2, [0, 2, 1, 3 - ], [1, 3], [1000, 350], "unbalanced sizes"), - ], -) -def test_get_load_balance_assignment_cases(sizes, num_gpus, - expected_shuffle_indices, - expected_gpu_sample_counts, - expected_grouped_sizes_per_gpu, - test_description): - """Test get_load_balance_assignment with various input cases.""" - result = get_load_balance_assignment(sizes, num_gpus=num_gpus) - (shuffle_indices, gpu_sample_counts, grouped_sizes_per_gpu) = result - - # Common assertions for all cases - assert len(shuffle_indices) == len(sizes) - assert len(gpu_sample_counts) == num_gpus - assert len(grouped_sizes_per_gpu) == num_gpus - assert sum(gpu_sample_counts) == len(sizes) - - assert shuffle_indices == expected_shuffle_indices - - assert gpu_sample_counts == expected_gpu_sample_counts - assert grouped_sizes_per_gpu == expected_grouped_sizes_per_gpu - - -class SimpleMRopeVisionModel(torch.nn.Module): - """A simple vision model for testing mrope functionality.""" - - def __init__(self, spatial_merge_size: int = 2, out_hidden_size: int = 64): - super().__init__() - self.spatial_merge_size = spatial_merge_size - self.out_hidden_size = out_hidden_size - self.linear = torch.nn.Linear(768, out_hidden_size) - - def forward(self, pixel_values: torch.Tensor, - grid_thw_list: list[list[int]]): - """Simple forward pass that simulates spatial merging.""" - # Apply linear transformation - embeddings = self.linear(pixel_values) - - # Simulate spatial merging by reducing the number of patches - merge_factor = self.spatial_merge_size * self.spatial_merge_size - - # Group patches and merge spatially - merged_embeddings = [] - start_idx = 0 - - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - end_idx = start_idx + num_patches - - # Get patches for this image - image_patches = embeddings[start_idx:end_idx] - - # Simulate spatial merging by averaging groups of patches - merged_patches = num_patches // merge_factor - if merged_patches > 0: - # Reshape and average to simulate merging - reshaped = image_patches[:merged_patches * merge_factor].view( - merged_patches, merge_factor, -1) - merged = reshaped.mean(dim=1) - merged_embeddings.append(merged) - - start_idx = end_idx - - if merged_embeddings: - return torch.cat(merged_embeddings, dim=0) - else: - return torch.empty((0, self.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "batch_size", - [ - 1, # Single image - 3, # Small batch - 5, # Odd batch size (for testing padding) - ], -) -def test_run_dp_sharded_mrope_vision_model(batch_size: int): - world_size = 2 - # Launch processes - mp.spawn( - run_dp_sharded_mrope_vision_model_vs_direct, - args=( - world_size, - batch_size, - get_open_port(), - ), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, - world_size: int, - batch_size: int, - master_port: int): - """ - Test that run_dp_sharded_mrope_vision_model produces the same results as - calling the model directly. 
- """ - # Set random seed for reproducibility - current_platform.seed_everything(0) - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - # initialize distributed - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create test data - grid_thw_list = [] - pixel_values_list = [] - - for i in range(batch_size): - # Varying image sizes for better testing - t, h, w = 1, 4 + i, 4 + i - grid_thw_list.append([t, h, w]) - - num_patches = t * h * w - # Create random pixel values for this image - image_pixels = torch.randn(num_patches, 768) - pixel_values_list.append(image_pixels) - - # Concatenate all pixel values - pixel_values = torch.cat(pixel_values_list, dim=0) - - # Create a simple mrope vision model - vision_model = SimpleMRopeVisionModel() - - # Run the model directly on the full input (only on rank 0) - if local_rank == 0: - with torch.inference_mode(): - direct_output = vision_model(pixel_values, grid_thw_list) - - # Run the model through the sharded function - with torch.inference_mode(): - sharded_output = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - sharded_output = torch.cat(sharded_output, dim=0) - - # Check that the world size is set up correctly - assert get_tensor_model_parallel_world_size() == world_size - - # Compare outputs (only on rank 0) - if local_rank == 0: - # Check that the outputs have the same shape - assert direct_output.shape == sharded_output.shape - # Check that the outputs are close (they should be identical) - assert torch.allclose(direct_output, - sharded_output, - rtol=1e-5, - atol=1e-5) - - -@multi_gpu_test(num_gpus=2) -def test_run_dp_sharded_mrope_vision_model_empty_input(): - world_size = 2 - mp.spawn( - run_dp_sharded_mrope_vision_model_empty_input_worker, - args=(world_size, get_open_port()), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_empty_input_worker( - local_rank: int, world_size: int, master_port: int): - """Test run_dp_sharded_mrope_vision_model with empty input.""" - # Set up distributed environment - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create empty inputs - pixel_values = torch.empty((0, 768)) - grid_thw_list: list[list[int]] = [] - - vision_model = SimpleMRopeVisionModel() - - # Should handle empty input gracefully - with torch.inference_mode(): - output = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - - assert len(output) == 0 - - -@multi_gpu_test(num_gpus=4) -def test_run_dp_sharded_mrope_vision_model_uneven_load(): - world_size = 4 - mp.spawn( - run_dp_sharded_mrope_vision_model_uneven_load_worker, - args=(world_size, get_open_port()), - nprocs=world_size, - ) - - -def run_dp_sharded_mrope_vision_model_uneven_load_worker( - local_rank: int, world_size: int, master_port: int): - 
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" - # Set up distributed environment - current_platform.seed_everything(123) - device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) - torch.set_default_device(device) - - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': str(master_port), - }) - - init_distributed_environment() - initialize_model_parallel(tensor_model_parallel_size=world_size) - - # Create images with very different sizes - grid_thw_list = [ - [1, 2, 2], # Small: 4 patches - [1, 8, 8], # Large: 64 patches - [1, 3, 3], # Medium: 9 patches - ] - - pixel_values_list = [] - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - image_pixels = torch.randn(num_patches, 768) - pixel_values_list.append(image_pixels) - - pixel_values = torch.cat(pixel_values_list, dim=0) - vision_model = SimpleMRopeVisionModel() - - # Should handle uneven distribution without errors - with torch.inference_mode(): - output_tuple = run_dp_sharded_mrope_vision_model(vision_model, - pixel_values, - grid_thw_list, - rope_type="rope_3d") - - # Verify output shape is reasonable - merge_factor = vision_model.spatial_merge_size**2 - expected_output_patches = list( - math.prod(grid_thw) // merge_factor for grid_thw in grid_thw_list) - - for i, output in enumerate(output_tuple): - assert output.shape[0] == expected_output_patches[i] - assert output.shape[1] == vision_model.out_hidden_size - - -@pytest.mark.parametrize("spatial_merge_size", [2, 4]) -def test_simple_mrope_vision_model_spatial_merge(spatial_merge_size: int): - """Test SimpleMRopeVisionModel with different spatial merge sizes.""" - device = current_platform.device_type - - grid_thw_list = [[1, 4, 4], [1, 6, 6]] # Two images - pixel_values_list = [] - - for grid_thw in grid_thw_list: - num_patches = math.prod(grid_thw) - image_pixels = torch.randn(num_patches, 768, device=device) - pixel_values_list.append(image_pixels) - - pixel_values = torch.cat(pixel_values_list, dim=0) - vision_model = SimpleMRopeVisionModel( - spatial_merge_size=spatial_merge_size).to(device) - - with torch.inference_mode(): - output = vision_model(pixel_values, grid_thw_list) - - # Verify output dimensions based on spatial merging - total_patches = sum(math.prod(grid_thw) for grid_thw in grid_thw_list) - merge_factor = spatial_merge_size**2 - expected_output_patches = total_patches // merge_factor - - assert output.shape[0] == expected_output_patches - assert output.shape[1] == vision_model.out_hidden_size diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 56ec634386909..b088e0c0dd241 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -69,7 +69,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -83,7 +82,7 @@ from .qwen2_vl import (_create_qwen2vl_field_factory, from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from 
.vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 76737a4428232..2f0c4240413be 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -34,7 +34,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import run_dp_sharded_vision_model + +from .vision import run_dp_sharded_vision_model class Idefics2VisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 892188c047228..2c341d2839719 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -28,7 +28,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import run_dp_sharded_vision_model + +from .vision import run_dp_sharded_vision_model NORM2FN = { 'rms_norm': RMSNorm, diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index f554077935bf3..503627865c4a5 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -76,13 +76,13 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config from vllm.utils.tensor_schema import TensorSchema, TensorShape from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix +from .vision import run_dp_sharded_mrope_vision_model # For dummy input only diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 131a66b713235..50521b5937862 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -50,7 +50,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -58,6 +57,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llama4 import Llama4ForCausalLM from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) +from .vision import run_dp_sharded_vision_model class Llama4ImagePatchInputs(TensorSchema): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 73b27572a8ebd..b740e6d87b745 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -59,7 +59,6 @@ from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -74,7 +73,7 @@ from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 88813490c0fbc..472e8b061a9e1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -66,7 +66,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -78,7 +77,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 98d65dea27393..ee6703f7229e5 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -83,7 +83,7 @@ from .qwen2_vl import Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vit_attn_backend +from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model logger = init_logger(__name__) @@ -1214,8 +1214,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) if self.use_data_parallel: - from vllm.multimodal.utils import ( - run_dp_sharded_mrope_vision_model) return run_dp_sharded_mrope_vision_model(self.visual, pixel_values, grid_thw_list, @@ -1245,8 +1243,6 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) if self.use_data_parallel: - from vllm.multimodal.utils import ( - run_dp_sharded_mrope_vision_model) return run_dp_sharded_mrope_vision_model(self.visual, pixel_values_videos, grid_thw_list, diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f667266b77bfa..5f6ad58850439 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -31,7 +31,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) 
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.multimodal.utils import run_dp_sharded_vision_model
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Step3VisionEncoderConfig
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -40,6 +39,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
+from .vision import run_dp_sharded_vision_model


 class Step3VLImagePixelInputs(TypedDict):
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 81f86db7e1875..08ad8fbeb4246 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,12 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+import math
 from abc import ABC, abstractmethod
-from typing import Final, Generic, Optional, Protocol, TypeVar, Union
+from typing import Final, Generic, Literal, Optional, Protocol, TypeVar, Union

 import torch
 from transformers import PretrainedConfig

+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
 from vllm.logger import init_logger
 from vllm.platforms import _Backend, current_platform

@@ -123,3 +128,277 @@ def resolve_visual_encoder_outputs(
     if post_layer_norm is not None and uses_last_layer:
         hs_pool[-1] = post_layer_norm(encoder_outputs)
     return torch.cat(hs_pool, dim=-1)
+
+
+def run_dp_sharded_vision_model(image_input: torch.Tensor,
+                                vision_model: torch.nn.Module) -> torch.Tensor:
+    """Run a vision model with data parallelism (DP) sharding. The function
+    will shard the input image tensor on the first dimension and run the
+    vision model on each shard.
+
+    Args:
+        image_input (torch.Tensor): Image input tensor.
+        vision_model (torch.nn.Module): Vision model.
+    Returns:
+        torch.Tensor: Output image embeddings
+    """
+
+    num_chunks = image_input.shape[0]
+    mp_world_size = get_tensor_model_parallel_world_size()
+    num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size
+    num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks
+    pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks)
+    image_input_padded = torch.nn.functional.pad(image_input, pad)
+    rank = get_tensor_model_parallel_rank()
+    image_input_per_rank = image_input_padded[rank *
+                                              num_chunks_per_rank:(rank + 1) *
+                                              num_chunks_per_rank, ...]
+
+    vision_embeddings = vision_model(image_input_per_rank)
+    # Ensure tensor is contiguous before all_gather
+    vision_embeddings = vision_embeddings.contiguous()
+    vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings,
+                                                         dim=0)
+    vision_embeddings = vision_embeddings[:num_chunks, ...]
+    return vision_embeddings
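The pad/shard/unpad bookkeeping above is easiest to see with concrete numbers. Below is a single-process sketch of the same index arithmetic (hypothetical sizes; the real code runs one shard per rank and gathers with `tensor_model_parallel_all_gather`):

```python
import torch

num_chunks, world_size = 5, 2  # e.g. 5 images sharded across 2 ranks
per_rank = (num_chunks + world_size - 1) // world_size  # ceil div -> 3
num_padded = per_rank * world_size - num_chunks  # 1 dummy chunk

x = torch.randn(num_chunks, 3, 224, 224)
# Pad only the first (batch) dimension, as in the function above
x_padded = torch.nn.functional.pad(x, (0, 0) * (x.dim() - 1) + (0, num_padded))
assert x_padded.shape[0] == per_rank * world_size  # 6

# Each rank r would process rows [r*per_rank, (r+1)*per_rank); after the
# all_gather along dim 0, the padded rows are sliced off again:
shards = [x_padded[r * per_rank:(r + 1) * per_rank] for r in range(world_size)]
gathered = torch.cat(shards, dim=0)[:num_chunks]
assert torch.equal(gathered, x)
```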
+
+
+def get_load_balance_assignment(
+    sizes: list[int],
+    num_gpus: int = 2,
+) -> tuple[list[int], list[int], list[int]]:
+    """
+    Generate load balancing assignment and metadata
+    for distributing data across GPUs.
+    The load is determined by the total image sizes,
+    not the number of images.
+
+    Args:
+        sizes: The size of each image
+        num_gpus: Number of GPUs to balance across
+
+    Returns:
+        shuffle_indices:
+            Indices to reorder data for balanced loading
+        gpu_sample_counts:
+            Number of samples assigned to each GPU
+        grouped_sizes_per_gpu:
+            Total size assigned to each GPU
+
+    Example:
+        ```
+        sizes = [1000, 100, 200, 50]
+        num_gpus = 2
+        ```
+
+    """
+
+    n_samples = len(sizes)
+
+    # Handle edge cases
+    if n_samples == 0:
+        return [], [0] * num_gpus, [0] * num_gpus
+
+    # Use greedy algorithm - balance by total size, not sample count
+    gpu_assignments = [list[int]() for _ in range(num_gpus)]
+    gpu_loads = [0] * num_gpus  # This tracks total SIZE, not sample count
+
+    # Sort indices by size (largest first for better load balancing)
+    # sizes = [1000, 100, 200, 50]
+    # large_to_small_indices = [0, 2, 1, 3]
+    large_to_small_indices = sorted(range(n_samples),
+                                    key=lambda i: sizes[i],
+                                    reverse=True)
+
+    for idx in large_to_small_indices:
+        # Find the GPU with the minimum current load (by total size)
+        min_gpu = min(range(num_gpus), key=lambda i: gpu_loads[i])
+        gpu_assignments[min_gpu].append(idx)
+        gpu_loads[min_gpu] += sizes[idx]
+
+    # Create shuffle indices and counts
+    shuffle_indices = list[int]()
+    gpu_sample_counts = list[int]()
+    for gpu_id in range(num_gpus):
+        # GPU_0 = [1000] = [0]
+        # GPU_1 = [200, 100, 50] = [2, 1, 3]
+        # shuffle_indices = [0, 2, 1, 3]
+        shuffle_indices.extend(gpu_assignments[gpu_id])
+        # GPU_0 = [1]
+        # GPU_1 = [3]
+        # gpu_sample_counts = [1, 3]
+        gpu_sample_counts.append(len(gpu_assignments[gpu_id]))
+
+    return (shuffle_indices, gpu_sample_counts, gpu_loads)
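The docstring example above can be checked end-to-end with a standalone re-implementation of the same greedy policy (a sketch, not the vLLM function itself):

```python
# Greedy size balancing: largest image first, always to the least-loaded GPU.
def greedy_assign(sizes: list[int], num_gpus: int):
    order = sorted(range(len(sizes)), key=lambda i: sizes[i], reverse=True)
    buckets: list[list[int]] = [[] for _ in range(num_gpus)]
    loads = [0] * num_gpus
    for i in order:
        g = loads.index(min(loads))  # least-loaded GPU (first on ties)
        buckets[g].append(i)
        loads[g] += sizes[i]
    shuffle = [i for bucket in buckets for i in bucket]
    counts = [len(bucket) for bucket in buckets]
    return shuffle, counts, loads


# Matches the worked example: GPU_0 gets image 0 (1000 patches),
# GPU_1 gets images 2, 1, 3 (200 + 100 + 50 = 350 patches).
assert greedy_assign([1000, 100, 200, 50], 2) == ([0, 2, 1, 3], [1, 3],
                                                  [1000, 350])
```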
+ "rope_3d" for 3D rope (e.g., Qwen2.5-VL) + "rope_2d" for 2D rope (e.g., Kimi-VL) + Returns: + torch.Tensor: Output image embeddings + + Example: + ``` + vision_model.out_hidden_size = 64 + vision_model.spatial_merge_size = 2 + pixel_values.shape = (1350, channel) + grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]] + tp_size=2 + ``` + + """ + tp_size = get_tensor_model_parallel_world_size() + + # GPU_0 tp_rank_local = 0 + # GPU_1 tp_rank_local = 1 + tp_rank_local = get_tensor_model_parallel_rank() + + # patches_per_image = [1000, 100, 200, 50] + patches_per_image = [math.prod(grid_thw) for grid_thw in grid_thw_list] + # patches_per_image = [0, 1000, 1100, 1300, 1350] + cum_patches_per_image = [0, *itertools.accumulate(patches_per_image)] + + # Get load balancing assignment with all metadata + # image_to_tp_rank = [0, 2, 1, 3] + # gpu_sample_counts = [1, 3] + # grouped_pixel_values_len = [1000, 350] + (image_to_tp_rank, gpu_sample_counts, + grouped_pixel_values_len) = get_load_balance_assignment( + patches_per_image, tp_size) + + # cu_gpu_sample_counts = [0, 1, 4] + cum_gpu_sample_counts = [0, *itertools.accumulate(gpu_sample_counts)] + + # GPU_0 image_idxs_local = [0] + # GPU_1 image_idxs_local = [2, 1, 3] + image_idxs_local = image_to_tp_rank[cum_gpu_sample_counts[tp_rank_local]: + cum_gpu_sample_counts[tp_rank_local + + 1]] + + # Get the pixel values for the local images based on the image_idxs_local + if len(image_idxs_local) > 0: + pixel_values_local = torch.cat([ + pixel_values[cum_patches_per_image[i]:cum_patches_per_image[i + 1]] + for i in image_idxs_local + ]) + else: + # Handle case where this rank has no images + pixel_values_local = torch.empty((0, pixel_values.shape[1]), + device=pixel_values.device, + dtype=pixel_values.dtype) + # embed_dim_reduction_factor = 2 * 2 + if rope_type == "rope_2d": + embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] * + vision_model.merge_kernel_size[1]) + else: + embed_dim_reduction_factor = (vision_model.spatial_merge_size * + vision_model.spatial_merge_size) + + # Find the max length across all ranks + # The output embedding of every DP rank has to be + # padded to this length for tensor_model_parallel_all_gather + # to work + max_len_per_rank = max( + grouped_pixel_values_len) // embed_dim_reduction_factor + local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] + + # Run the vision model on the local pixel_values_local + if rope_type == "rope_2d": + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model( + pixel_values_local, torch.tensor(local_grid_thw_list)) + if isinstance(image_embeds_local, list): + image_embeds_local = torch.cat(image_embeds_local, dim=0) + else: + out_dim = getattr(vision_model.config, "hidden_size", None) + image_embeds_local = torch.empty( + (0, embed_dim_reduction_factor, out_dim), + device=pixel_values.device, + dtype=pixel_values.dtype) + else: + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model(pixel_values_local, + local_grid_thw_list) + else: + # Handle empty case + image_embeds_local = torch.empty((0, vision_model.out_hidden_size), + device=pixel_values.device, + dtype=pixel_values.dtype) + + # Pad the output based on max_len_per_rank + # for tensor_model_parallel_all_gather to work + current_len = image_embeds_local.shape[0] + if current_len < max_len_per_rank: + padding_size = max_len_per_rank - current_len + if rope_type == "rope_2d": + padding = torch.empty((padding_size, image_embeds_local.shape[1], + 
image_embeds_local.shape[2]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) + else: + padding = torch.empty((padding_size, image_embeds_local.shape[1]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) + image_embeds_local_padded = torch.cat([image_embeds_local, padding], + dim=0) + else: + image_embeds_local_padded = image_embeds_local + + # Do all_gather to collect embeddings from all ranks + gathered_embeds = tensor_model_parallel_all_gather( + image_embeds_local_padded, dim=0) + + # Remove padding and reconstruct per-rank embeddings + rank_embeddings = list[torch.Tensor]() + for rank in range(tp_size): + start_idx = rank * max_len_per_rank + end_idx = start_idx + (grouped_pixel_values_len[rank] // + embed_dim_reduction_factor) + rank_embeddings.append(gathered_embeds[start_idx:end_idx]) + + patches_per_output_image = [(patch_size // embed_dim_reduction_factor) + for patch_size in patches_per_image] + + # Reconstruct embeddings in the original order + original_order_embeddings = [None] * len(grid_thw_list) + current_idx = 0 + for rank in range(tp_size): + count = gpu_sample_counts[rank] + if count > 0: + # Get images assigned to this rank in shuffled order + # GPU_0 = image_idxs_local [0] + # GPU_1 = image_idxs_local [2, 1, 3] + rank_images = image_to_tp_rank[current_idx:current_idx + count] + + rank_embed = rank_embeddings[rank] + # Split rank embeddings back to individual images + embed_start = 0 + for img_idx in rank_images: + img_patches = patches_per_output_image[img_idx] + original_order_embeddings[img_idx] = rank_embed[ + embed_start:embed_start + img_patches] + embed_start += img_patches + current_idx += count + out_embeddings = tuple(embed for embed in original_order_embeddings + if embed is not None) + assert len(out_embeddings) == len( + original_order_embeddings), "Found unassigned embeddings" + return out_embeddings diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f4e2ed72e2d7d..0f8aeceb39448 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,13 +3,11 @@ import asyncio import atexit -import itertools -import math from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse from urllib.request import url2pathname @@ -21,9 +19,6 @@ from typing_extensions import deprecated import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather) from .audio import AudioMediaIO from .base import MediaIO @@ -33,12 +28,10 @@ from .video import VideoMediaIO _M = TypeVar("_M") if TYPE_CHECKING: - from .inputs import (BatchedTensorInputs, MultiModalKwargs, - MultiModalKwargsItem, MultiModalKwargsItems, - MultiModalPlaceholderDict) + from .inputs import (BatchedTensorInputs, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalPlaceholderDict) else: BatchedTensorInputs = Any - MultiModalKwargs = Any MultiModalKwargsItem = Any MultiModalKwargsItems = Any MultiModalPlaceholderDict = Any @@ -93,7 +86,7 @@ class MediaConnector: self, url_spec: ParseResult, media_io: MediaIO[_M], - ) -> _M: + ) -> _M: # type: ignore[type-var] data_spec, data = url_spec.path.split(",", 1) 
media_type, data_type = data_spec.split(";", 1) @@ -107,7 +100,7 @@ class MediaConnector: self, url_spec: ParseResult, media_io: MediaIO[_M], - ) -> _M: + ) -> _M: # type: ignore[type-var] allowed_local_media_path = self.allowed_local_media_path if allowed_local_media_path is None: raise RuntimeError("Cannot load local files without " @@ -127,7 +120,7 @@ class MediaConnector: media_io: MediaIO[_M], *, fetch_timeout: Optional[int] = None, - ) -> _M: + ) -> _M: # type: ignore[type-var] url_spec = urlparse(url) if url_spec.scheme.startswith("http"): @@ -434,280 +427,6 @@ def group_mm_kwargs_by_modality( yield modality, len(items_lst), mm_kwargs_group -def run_dp_sharded_vision_model(image_input: torch.Tensor, - vision_model: torch.nn.Module) -> torch.Tensor: - """Run a vision model with data parallelism (DP) sharding. The function - will shard the input image tensor on the first dimension and run the vision - model - - Args: - image_input (torch.Tensor): Image input tensor. - vision_model (torch.nn.Module): Vision model. - Returns: - torch.Tensor: Output image embeddings - """ - - num_chunks = image_input.shape[0] - mp_world_size = get_tensor_model_parallel_world_size() - num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size - num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks - pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks) - image_input_padded = torch.nn.functional.pad(image_input, pad) - rank = get_tensor_model_parallel_rank() - image_input_per_rank = image_input_padded[rank * - num_chunks_per_rank:(rank + 1) * - num_chunks_per_rank, ...] - - vision_embeddings = vision_model(image_input_per_rank) - # Ensure tensor is contiguous before all_gather - vision_embeddings = vision_embeddings.contiguous() - vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, - dim=0) - vision_embeddings = vision_embeddings[:num_chunks, ...] - return vision_embeddings - - -def get_load_balance_assignment( - sizes: list[int], - num_gpus: int = 2, -) -> tuple[list[int], list[int], list[int]]: - """ - Generate load balancing assignment and metadata - for distributing data across GPUs. - The load is determined by the total image sizes, - not the number of images. 
- - Args: - sizes: The size of each image - num_gpus: Number of GPUs to balance across - - Returns: - shuffle_indices: - Indices to reorder data for balanced loading - gpu_sample_counts: - Number of samples assigned to each GPU - grouped_sizes_per_gpu: - Total size assigned to each GPU - - Example: - ``` - sizes = [1000, 100, 200, 50] - num_gpus=2 - ``` - - """ - - n_samples = len(sizes) - - # Handle edge cases - if n_samples == 0: - return [], [0] * num_gpus, [0] * num_gpus - - # Use greedy algorithm - balance by total size, not sample count - gpu_assignments = [list[int]() for _ in range(num_gpus)] - gpu_loads = [0] * num_gpus # This tracks total SIZE, not sample count - - # Sort indices by size (largest first for better load balancing) - # sizes = [1000, 100, 200, 50] - # large_to_small_indices = [0, 2, 1, 3] - large_to_small_indices = sorted(range(n_samples), - key=lambda i: sizes[i], - reverse=True) - - for idx in large_to_small_indices: - # Find GPU with minimum current load (by total size) - min_gpu = min(range(num_gpus), key=lambda i: gpu_loads[i]) - gpu_assignments[min_gpu].append(idx) - gpu_loads[min_gpu] += sizes[idx] - - # Create shuffle indices and counts - shuffle_indices = list[int]() - gpu_sample_counts = list[int]() - for gpu_id in range(num_gpus): - # GPU_0 = [1000] = [0] - # GPU_1 = [200, 100, 50] = [2, 1, 3] - # shuffle_indices = [0, 2, 1, 3] - shuffle_indices.extend(gpu_assignments[gpu_id]) - # GPU_0 = [1] - # GPU_1 = [3] - # gpu_sample_counts = [1, 3] - gpu_sample_counts.append(len(gpu_assignments[gpu_id])) - - return (shuffle_indices, gpu_sample_counts, gpu_loads) - - -def run_dp_sharded_mrope_vision_model( - vision_model: torch.nn.Module, - pixel_values: torch.Tensor, - grid_thw_list: list[list[int]], - *, - rope_type: Literal["rope_3d", "rope_2d"], -) -> tuple[torch.Tensor, ...]: - """Run a vision model with data parallelism (DP) sharding. - The function will shard the input image tensor on the - first dimension and run the vision model. - This function is used to run the vision model with mrope. - - Args: - vision_model (torch.nn.Module): Vision model. - pixel_values (torch.Tensor): Image/Video input tensor. - grid_thw_list: List of grid dimensions for each image - rope_type: Type of rope used in the vision model. - Different rope types have different dimension to do ViT. 
- "rope_3d" for 3D rope (e.g., Qwen2.5-VL) - "rope_2d" for 2D rope (e.g., Kimi-VL) - Returns: - torch.Tensor: Output image embeddings - - Example: - ``` - vision_model.out_hidden_size = 64 - vision_model.spatial_merge_size = 2 - pixel_values.shape = (1350, channel) - grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]] - tp_size=2 - ``` - - """ - tp_size = get_tensor_model_parallel_world_size() - - # GPU_0 tp_rank_local = 0 - # GPU_1 tp_rank_local = 1 - tp_rank_local = get_tensor_model_parallel_rank() - - # patches_per_image = [1000, 100, 200, 50] - patches_per_image = [math.prod(grid_thw) for grid_thw in grid_thw_list] - # patches_per_image = [0, 1000, 1100, 1300, 1350] - cum_patches_per_image = [0, *itertools.accumulate(patches_per_image)] - - # Get load balancing assignment with all metadata - # image_to_tp_rank = [0, 2, 1, 3] - # gpu_sample_counts = [1, 3] - # grouped_pixel_values_len = [1000, 350] - (image_to_tp_rank, gpu_sample_counts, - grouped_pixel_values_len) = get_load_balance_assignment( - patches_per_image, tp_size) - - # cu_gpu_sample_counts = [0, 1, 4] - cum_gpu_sample_counts = [0, *itertools.accumulate(gpu_sample_counts)] - - # GPU_0 image_idxs_local = [0] - # GPU_1 image_idxs_local = [2, 1, 3] - image_idxs_local = image_to_tp_rank[cum_gpu_sample_counts[tp_rank_local]: - cum_gpu_sample_counts[tp_rank_local + - 1]] - - # Get the pixel values for the local images based on the image_idxs_local - if len(image_idxs_local) > 0: - pixel_values_local = torch.cat([ - pixel_values[cum_patches_per_image[i]:cum_patches_per_image[i + 1]] - for i in image_idxs_local - ]) - else: - # Handle case where this rank has no images - pixel_values_local = torch.empty((0, pixel_values.shape[1]), - device=pixel_values.device, - dtype=pixel_values.dtype) - # embed_dim_reduction_factor = 2 * 2 - if rope_type == "rope_2d": - embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] * - vision_model.merge_kernel_size[1]) - else: - embed_dim_reduction_factor = (vision_model.spatial_merge_size * - vision_model.spatial_merge_size) - - # Find the max length across all ranks - # The output embedding of every DP rank has to be - # padded to this length for tensor_model_parallel_all_gather - # to work - max_len_per_rank = max( - grouped_pixel_values_len) // embed_dim_reduction_factor - local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] - - # Run the vision model on the local pixel_values_local - if rope_type == "rope_2d": - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model( - pixel_values_local, torch.tensor(local_grid_thw_list)) - if isinstance(image_embeds_local, list): - image_embeds_local = torch.cat(image_embeds_local, dim=0) - else: - out_dim = getattr(vision_model.config, "hidden_size", None) - image_embeds_local = torch.empty( - (0, embed_dim_reduction_factor, out_dim), - device=pixel_values.device, - dtype=pixel_values.dtype) - else: - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model(pixel_values_local, - local_grid_thw_list) - else: - # Handle empty case - image_embeds_local = torch.empty((0, vision_model.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype) - - # Pad the output based on max_len_per_rank - # for tensor_model_parallel_all_gather to work - current_len = image_embeds_local.shape[0] - if current_len < max_len_per_rank: - padding_size = max_len_per_rank - current_len - if rope_type == "rope_2d": - padding = torch.empty((padding_size, image_embeds_local.shape[1], - 
image_embeds_local.shape[2]), - dtype=image_embeds_local.dtype, - device=image_embeds_local.device) - else: - padding = torch.empty((padding_size, image_embeds_local.shape[1]), - dtype=image_embeds_local.dtype, - device=image_embeds_local.device) - image_embeds_local_padded = torch.cat([image_embeds_local, padding], - dim=0) - else: - image_embeds_local_padded = image_embeds_local - - # Do all_gather to collect embeddings from all ranks - gathered_embeds = tensor_model_parallel_all_gather( - image_embeds_local_padded, dim=0) - - # Remove padding and reconstruct per-rank embeddings - rank_embeddings = list[torch.Tensor]() - for rank in range(tp_size): - start_idx = rank * max_len_per_rank - end_idx = start_idx + (grouped_pixel_values_len[rank] // - embed_dim_reduction_factor) - rank_embeddings.append(gathered_embeds[start_idx:end_idx]) - - patches_per_output_image = [(patch_size // embed_dim_reduction_factor) - for patch_size in patches_per_image] - - # Reconstruct embeddings in the original order - original_order_embeddings = [None] * len(grid_thw_list) - current_idx = 0 - for rank in range(tp_size): - count = gpu_sample_counts[rank] - if count > 0: - # Get images assigned to this rank in shuffled order - # GPU_0 = image_idxs_local [0] - # GPU_1 = image_idxs_local [2, 1, 3] - rank_images = image_to_tp_rank[current_idx:current_idx + count] - - rank_embed = rank_embeddings[rank] - # Split rank embeddings back to individual images - embed_start = 0 - for img_idx in rank_images: - img_patches = patches_per_output_image[img_idx] - original_order_embeddings[img_idx] = rank_embed[ - embed_start:embed_start + img_patches] - embed_start += img_patches - current_idx += count - out_embeddings = tuple(embed for embed in original_order_embeddings - if embed is not None) - assert len(out_embeddings) == len( - original_order_embeddings), "Found unassigned embeddings" - return out_embeddings - - def fetch_audio( audio_url: str, audio_io_kwargs: Optional[dict[str, Any]] = None, From 4322c553a64c3a6c95a39216f31b022eda04a5ad Mon Sep 17 00:00:00 2001 From: Andreas Hartel <andreas@hartel.me> Date: Tue, 23 Sep 2025 11:56:31 +0200 Subject: [PATCH 582/584] [Test]: Hermes tool parser stream output error in Qwen3 case (#25203) Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com> --- .../tool_parsers/test_hermes_tool_parser.py | 209 +++++++++++++++++- 1 file changed, 203 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index e0e6b2c07e171..1da06be2eba92 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -5,6 +5,11 @@ import json import pytest +from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import ( + Hermes2ProToolParser) +from vllm.transformers_utils.tokenizer import AnyTokenizer + from ....utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" @@ -37,7 +42,7 @@ TOOLS = [{ }, "unit": { "type": "string", - "enum": ["celsius", "fahrenheit"] + "enum": ["celsius", "fahrenheit"], }, }, "required": ["location"], @@ -75,7 +80,7 @@ PRODUCT_MESSAGES = [{ "user", "content": "Hi! Do you have any detailed information about the product id " - "7355608 and inserted true?" 
+ "7355608 and inserted true?", }] @@ -144,8 +149,8 @@ async def test_streaming_tool_call(): if tool_chunk.function.name: tool_call_chunks[index]["name"] += tool_chunk.function.name if tool_chunk.function.arguments: - tool_call_chunks[index][ - "arguments"] += tool_chunk.function.arguments + tool_call_chunks[index]["arguments"] += ( + tool_chunk.function.arguments) assert len(tool_call_chunks) == 1 reconstructed_tool_call = tool_call_chunks[0] @@ -234,8 +239,8 @@ async def test_streaming_product_tool_call(): if tool_chunk.function.name: tool_call_chunks[index]["name"] += tool_chunk.function.name if tool_chunk.function.arguments: - tool_call_chunks[index][ - "arguments"] += tool_chunk.function.arguments + tool_call_chunks[index]["arguments"] += ( + tool_chunk.function.arguments) assert len(tool_call_chunks) == 1 reconstructed_tool_call = tool_call_chunks[0] @@ -258,3 +263,195 @@ async def test_streaming_product_tool_call(): print("\n[Streaming Product Test Passed]") print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") print(f"Reconstructed Arguments: {arguments}") + + +@pytest.fixture +def qwen_tokenizer() -> AnyTokenizer: + from vllm.transformers_utils.tokenizer import get_tokenizer + + return get_tokenizer("Qwen/Qwen3-32B") + + +@pytest.fixture +def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser: + return Hermes2ProToolParser(qwen_tokenizer) + + +@pytest.fixture +def any_chat_request() -> ChatCompletionRequest: + return ChatCompletionRequest( + seed=42, + model="Qwen/Qwen3-32B", + messages=[], + ) + + +def test_hermes_parser_streaming_just_forward_text( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = ( + """This is some prior text that has nothing to do with tool calling.""" + ) + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + delta_text = qwen_tokenizer.decode([token]) + current_text = previous_text + delta_text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + delta_messages.append(delta) + + for delta in delta_messages: + assert delta is not None + assert not delta.tool_calls + + print(delta_messages) + assert "".join([delta.content for delta in delta_messages]) == text + + +def test_hermes_parser_streaming_failure_case_bug_19056( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """<tool_call> +{"name": "final_answer", "arguments": {"trigger": true}} +</tool_call>""" + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + + assert delta_messages[0].tool_calls[0].function.name == "final_answer" + tool_call_args = "".join(delta.tool_calls[0].function.arguments or "" + for delta in delta_messages) + assert tool_call_args == '{"trigger": true}' + + +def 
test_hermes_parser_streaming( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = '<tool_call>\ +{"name": "get_current_temperature",\ +"arguments": {"location":\ +"San Francisco, California, United States", "unit": "celsius"}}\ +</tool_call>' + + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + print(delta_messages) + assert (delta_messages[0].tool_calls[0].function.name == + "get_current_temperature") + tool_call_args = "".join(delta.tool_calls[0].function.arguments or "" + for delta in delta_messages) + assert tool_call_args == ( + '{"location":"San Francisco, California, United States", ' + '"unit": "celsius"}') + + +def test_hermes_parser_non_streaming_no_tool_call( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """This is not a tool call.""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called + + +def test_hermes_parser_non_streaming_tool_call_between_tags( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """<tool_call> +{"name": "final_answer", "arguments": {"trigger": true}} +</tool_call>""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_until_eos( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """<tool_call> +{"name": "final_answer", "arguments": {"trigger": true}}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_invalid_json( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + # Missing closing brace to trigger exception + text = """<tool_call> +{"name": "final_answer", "arguments": {"trigger": true}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called From 231c2c63e4decd0cbf863690dfffe88e1d97a003 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Tue, 23 Sep 2025 18:06:48 +0800 Subject: [PATCH 583/584] [Bugfix] Fix idefics3 `tie_word_embeddings` (#25454) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- vllm/model_executor/models/idefics3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/idefics3.py 
b/vllm/model_executor/models/idefics3.py index 18446d126b51a..79e130119ae83 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -608,7 +608,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.text_config.tie_word_embeddings: - self.lm_head.weight = self.model.text_model.wte.weight + self.lm_head.weight = self.model.text_model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) def _parse_and_validate_image_input( From 273690a50ac2a5fa79fa7acc5077e49aa1af427e Mon Sep 17 00:00:00 2001 From: Jee Jee Li <pandaleefree@gmail.com> Date: Tue, 23 Sep 2025 18:19:45 +0800 Subject: [PATCH 584/584] [Core] Optimize LoRA weight loading (#25403) Signed-off-by: Jee Jee Li <pandaleefree@gmail.com> --- tests/lora/test_layers.py | 26 ++++---- tests/lora/test_lora_manager.py | 12 ++-- tests/lora/utils.py | 8 +-- vllm/lora/layers/base_linear.py | 10 +-- vllm/lora/layers/column_parallel_linear.py | 67 ++++++++++---------- vllm/lora/layers/logits_processor.py | 8 +-- vllm/lora/layers/row_parallel_linear.py | 4 +- vllm/lora/layers/vocal_parallel_embedding.py | 10 +-- vllm/lora/lora_weights.py | 4 +- vllm/lora/models.py | 17 ++--- 10 files changed, 83 insertions(+), 83 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 6735b7cd9e436..ced0afc50cb91 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -164,8 +164,8 @@ def populate_loras( weight=layer_weights, generate_embeddings_tensor=generate_embeddings_tensor, ) - sublora.lora_b = sublora.lora_b[:, (sublora_len * - i):(sublora_len * (i + 1))] + sublora.lora_b = sublora.lora_b[(sublora_len * + i):(sublora_len * (i + 1)), :] sublora.optimize() subloras.append(sublora) @@ -304,9 +304,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: result = embedding(input_) after_a = F.embedding( input_, - lora.lora_a, + lora.lora_a.T, ) - result += (after_a @ lora.lora_b) + result += (after_a @ lora.lora_b.T) expected_results.append(result) expected_result = torch.cat(expected_results) @@ -445,9 +445,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, result = expanded_embedding(input_) after_a = F.embedding( original_input_, - lora.lora_a, + lora.lora_a.T, ) - result += (after_a @ lora.lora_b) + result += (after_a @ lora.lora_b.T) expected_results.append(result) expected_result = torch.cat(expected_results) @@ -575,7 +575,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, lm_head=linear, embedding_bias=None) result[:, vocab_size + embeddings_tensor_len:] = float("-inf") - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) logits_processor.org_vocab_size = vocab_size @@ -692,9 +692,10 @@ def test_linear_replicated( expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] result = linear(input_)[0] - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) @@ -817,7 +818,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, for input_, lora_id in zip(inputs, prompt_mapping): lora = 
lora_dict[lora_id]
             result = linear(input_)[0]
-            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
             expected_results.append(result)
 
         expected_result = torch.cat(expected_results)
@@ -965,9 +966,10 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
             result = linear(input_)[0]
             subloras = sublora_dict[lora_id]
             for i, sublora in enumerate(subloras):
-                result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] *
-                       (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b *
-                                    sublora.scaling)
+                result[:, sublora.lora_b.shape[0] * i:sublora.lora_b.shape[0] *
+                       (i + 1)] += (
+                           input_ @ sublora.lora_a.T @ sublora.lora_b.T *
+                           sublora.scaling)
             expected_results.append(result)
 
         expected_result = torch.cat(expected_results)
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index d7684fbf34abb..6f0a852314081 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -63,9 +63,9 @@ def test_from_lora_tensors(sql_lora_files, device):
         assert lora.lora_b is not None
         assert lora.lora_a.device == torch.device(device)
         assert lora.lora_b.device == torch.device(device)
-        assert (lora.lora_a.shape[1] == lora.lora_b.shape[0]
+        assert (lora.lora_a.shape[0] == lora.lora_b.shape[1]
                 ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
-        assert lora.lora_a.shape[1] == 8
+        assert lora.lora_a.shape[0] == 8
         embeddings_module = next(
             (k for k in EMBEDDING_MODULES if k in module_name), None)
         if embeddings_module:
@@ -86,8 +86,8 @@ def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
             name,
             8,
             16,
-            torch.rand([w.shape[1], 8], device=device),
-            torch.rand([8, w.shape[0]], device=device),
+            torch.rand([8, w.shape[1]], device=device),
+            torch.rand([w.shape[0], 8], device=device),
         )
     return LoRAModel(lora_id, 8, loras)
@@ -109,8 +109,8 @@ def create_packed_lora(
             replaced_module_name,
             8,
             16,
-            torch.rand([w.shape[1], 8], device=device),
-            torch.rand([8, w.shape[0] // len(replaced_module_names)],
+            torch.rand([8, w.shape[1]], device=device),
+            torch.rand([w.shape[0] // len(replaced_module_names), 8],
                        device=device),
         )
     return LoRAModel(lora_id, 8, loras)
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index ab475904d4938..0432a1a9bba07 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -36,10 +36,10 @@ class DummyLoRAManager:
             module_name,
             rank=rank,
             lora_alpha=1,
-            lora_a=torch.rand([weight.shape[1], rank],
+            lora_a=torch.rand([rank, weight.shape[1]],
                               dtype=weight.dtype,
                               device=self._device),
-            lora_b=torch.rand([rank, weight.shape[0]],
+            lora_b=torch.rand([weight.shape[0], rank],
                               dtype=weight.dtype,
                               device=self._device),
         )
@@ -67,8 +67,8 @@ class DummyLoRAManager:
             module_name,
             rank=rank,
             lora_alpha=1,
-            lora_a=torch.rand([input_dim, rank], device="cuda"),
-            lora_b=torch.rand([rank, output_dim], device="cuda"),
+            lora_a=torch.rand([rank, input_dim], device="cuda"),
+            lora_b=torch.rand([output_dim, rank], device="cuda"),
             embeddings_tensor=embeddings_tensor,
         )
         self.set_module_lora(module_name, lora)
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 85a1f86ce6bf2..6cf5815ef12da 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -121,18 +121,18 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             lora_bias = self.slice_bias(lora_bias)
 
         self.lora_a_stacked[0][index,
-                               0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                   lora_a.T, non_blocking=True)
+                               0, :lora_a.shape[0],
:lora_a.shape[1]].copy_( + lora_a, non_blocking=True) self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 0, :lora_b.shape[0], :lora_b.shape[1]].copy_( + lora_b, non_blocking=True) if lora_bias is not None: self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], self.lora_bias_stacked) assert len(self.lora_bias_stacked) self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( - lora_bias.T, non_blocking=True) + lora_bias, non_blocking=True) def apply(self, x: torch.Tensor, diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 658fd23165da0..fa4eb272a69fe 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -99,13 +99,13 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): if self.is_merged_col_linear: tp_rank = get_tensor_model_parallel_rank() shard_size = self.output_size // 2 - offset = lora_b.shape[-1] // 2 + offset = lora_b.shape[0] // 2 - left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) * - shard_size] - right_weight = lora_b[:, offset + tp_rank * shard_size:offset + - (tp_rank + 1) * shard_size] - lora_b = torch.cat([left_weight, right_weight], dim=1) + left_weight = lora_b[tp_rank * shard_size:(tp_rank + 1) * + shard_size, :] + right_weight = lora_b[offset + tp_rank * shard_size:offset + + (tp_rank + 1) * shard_size, :] + lora_b = torch.cat([left_weight, right_weight], dim=0) # Applicable to cases where the base_layer is # ColumnParallelLinear. else: @@ -113,7 +113,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] + lora_b = lora_b[start_idx:end_idx, :] return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @@ -251,9 +251,8 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): for i, (shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: - sliced_lora_b[i] = lora_b_i[:, - shard_size * shard_id:shard_size * - (shard_id + 1)] + sliced_lora_b[i] = lora_b_i[shard_size * shard_id:shard_size * + (shard_id + 1), :] return sliced_lora_b def slice_bias( @@ -285,12 +284,12 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): for i in range(self.n_slices): if (lora_a_i := lora_a[i]) is not None: self.lora_a_stacked[i][ - index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( - lora_a_i.T, non_blocking=True) + index, 0, :lora_a_i.shape[0], :lora_a_i.shape[1]].copy_( + lora_a_i, non_blocking=True) if (lora_b_i := lora_b[i]) is not None: self.lora_b_stacked[i][ - index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( - lora_b_i.T, non_blocking=True) + index, 0, :lora_b_i.shape[0], :lora_b_i.shape[1]].copy_( + lora_b_i, non_blocking=True) if lora_bias is not None: self.lora_bias_stacked = cast(tuple[torch.Tensor, ...], @@ -299,7 +298,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): if (lora_bias_i := lora_bias[i]) is not None: self.lora_bias_stacked[i][index, 0, :lora_bias_i.shape[0]].copy_( - lora_bias_i.T, + lora_bias_i, non_blocking=True) @classmethod @@ -345,18 +344,18 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() self.q_shard_id = tp_rank self.kv_shard_id = tp_rank // 
self.base_layer.num_kv_head_replicas - lora_b_q = lora_b[:, self.q_proj_shard_size * + lora_b_q = lora_b[self.q_proj_shard_size * self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] + (self.q_shard_id + 1), :] k_offset = self.q_proj_total_size - lora_b_k = lora_b[:, k_offset + + lora_b_k = lora_b[k_offset + self.kv_proj_shard_size * self.kv_shard_id:k_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] + self.kv_proj_shard_size * (self.kv_shard_id + 1), :] v_offset = k_offset + self.kv_proj_total_size - lora_b_v = lora_b[:, v_offset + + lora_b_v = lora_b[v_offset + self.kv_proj_shard_size * self.kv_shard_id:v_offset + - self.kv_proj_shard_size * (self.kv_shard_id + 1)] - lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) + self.kv_proj_shard_size * (self.kv_shard_id + 1), :] + lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=0) return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: @@ -465,7 +464,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] + lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a def apply(self, @@ -508,10 +507,10 @@ class MergedColumnParallelLinearWithShardedLoRA( output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size lora_a = [ - lora_a[0][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[0] is not None else None, - lora_a[1][:, output_start_idx:output_start_idx + - output_shard_size] if lora_a[1] is not None else None, + lora_a[0][output_start_idx:output_start_idx + + output_shard_size, :] if lora_a[0] is not None else None, + lora_a[1][output_start_idx:output_start_idx + + output_shard_size, :] if lora_a[1] is not None else None, ] return lora_a @@ -551,7 +550,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA): tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size - lora_a = lora_a[:, start_idx:start_idx + shard_size] + lora_a = lora_a[start_idx:start_idx + shard_size, :] return lora_a def apply(self, @@ -589,12 +588,12 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA): shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + - shard_size[0]] if lora_a[0] is not None else None, - lora_a[1][:, start_idx[1]:start_idx[1] + - shard_size[1]] if lora_a[1] is not None else None, - lora_a[2][:, start_idx[2]:start_idx[2] + - shard_size[2]] if lora_a[2] is not None else None, + lora_a[0][start_idx[0]:start_idx[0] + + shard_size[0], :] if lora_a[0] is not None else None, + lora_a[1][start_idx[1]:start_idx[1] + + shard_size[1], :] if lora_a[1] is not None else None, + lora_a[2][start_idx[2]:start_idx[2] + + shard_size[2], :] if lora_a[2] is not None else None, ] return lora_a diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index a50dcfa748f2f..b8fbad3a4af01 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -140,11 +140,11 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): ): self.reset_lora(index) self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) + 0, 
:lora_a.shape[0], :lora_a.shape[1]].copy_(
+                                lora_a, non_blocking=True)
         self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
+                            0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
+                                lora_b, non_blocking=True)
         if embeddings_tensor is not None:
             self.embeddings_tensors[
                 index,
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index 18ef6fd1ddd78..cac2c92136dca 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -39,7 +39,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         shard_size = self.input_size
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
-        lora_a = lora_a[start_idx:end_idx, :]
+        lora_a = lora_a[:, start_idx:end_idx]
         return lora_a
 
     def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
@@ -122,7 +122,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         shard_size = self.lora_b_stacked[0].shape[2]
         start_idx = self.tp_rank * shard_size
         end_idx = (self.tp_rank + 1) * shard_size
-        lora_b = lora_b[:, start_idx:end_idx]
+        lora_b = lora_b[start_idx:end_idx, :]
         return lora_b
 
     def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index 4d6218d970977..ca01c7e17fff4 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -95,11 +95,13 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         bias: Optional[torch.Tensor] = None,
     ):
         self.reset_lora(index)
-        self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_(
-            lora_a, non_blocking=True)
+        # NOTE self.lora_a_stacked is row-major, and lora_a is col-major,
+        # so we need to transpose here
+        self.lora_a_stacked[index, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+            lora_a.T, non_blocking=True)
         self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
+                            0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
+                                lora_b, non_blocking=True)
         if embeddings_tensor is not None:
             self.embeddings_tensors[
                 index,
diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py
index 958364fca592f..e3198fb3d3ae4 100644
--- a/vllm/lora/lora_weights.py
+++ b/vllm/lora/lora_weights.py
@@ -86,11 +86,11 @@ class LoRALayerWeights:
             embeddings_tensor_dim: Optional[int] = None,
             bias_enabled: Optional[bool] = False) -> "LoRALayerWeights":
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
-        lora_a = torch.zeros([input_dim, rank],
+        lora_a = torch.zeros([rank, input_dim],
                              dtype=dtype,
                              device=device,
                              pin_memory=pin_memory)
-        lora_b = torch.zeros([rank, output_dim],
+        lora_b = torch.zeros([output_dim, rank],
                              dtype=dtype,
                              device=device,
                              pin_memory=pin_memory)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 9ea46be65cff3..cc64cc78affa7 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -152,30 +152,29 @@ class LoRAModel:
                     module_name, peft_helper, lora_embeddings_tensor)
 
             if is_bias:
-                loras[module_name].bias = tensor.to(device=device,
-                                                    dtype=dtype).t()
-                bias = tensor.to(device=device, dtype=dtype).t()
+                loras[module_name].bias = tensor.to(device=device, dtype=dtype)
+                bias = tensor.to(device=device, dtype=dtype)
                 if pin_memory:
                     bias = bias.pin_memory()
                 loras[module_name].bias = bias
             elif is_lora_a:
                 loras[module_name].lora_a = tensor.to(device=device,
-                                                      dtype=dtype).t()
+                                                      dtype=dtype)
                 if pin_memory:
                     loras[module_name].lora_a
= loras[ module_name].lora_a.pin_memory() else: loras[module_name].lora_b = tensor.to(device=device, - dtype=dtype).t() + dtype=dtype) assert embedding_padding_modules is not None if any(name in module_name for name in embedding_padding_modules ) and target_embedding_padding is not None: lora_b = loras[module_name].lora_b - assert target_embedding_padding >= lora_b.shape[1] - addition = target_embedding_padding - lora_b.shape[1] + assert target_embedding_padding >= lora_b.shape[0] + addition = target_embedding_padding - lora_b.shape[0] loras[module_name].lora_b = torch.nn.functional.pad( - lora_b, (0, addition)) + lora_b, (0, 0, 0, addition)) if pin_memory: loras[module_name].lora_b = loras[ module_name].lora_b.pin_memory() @@ -585,7 +584,6 @@ class LoRAModelManager: "cpu", bias_enabled=bias_enabled, ) - lora.optimize() else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] @@ -600,7 +598,6 @@ class LoRAModelManager: "cpu", bias_enabled=bias_enabled, ) - lora.optimize() subloras.append(lora) lora = PackedLoRALayerWeights.pack(subloras) model.loras[module_name] = lora
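
---

Editor's note on the final commit above: the LoRA weight loading optimization drops the
`.t()` transposes from the loading path by keeping tensors in their checkpoint-native
row-major layout, `lora_a` as (rank, input_dim) and `lora_b` as (output_dim, rank), which
is why the `.T` calls move into the reference math of the updated tests. Below is a
minimal self-contained sketch of that convention; `apply_lora` is an illustrative helper
for this note, not vLLM API:

```python
import torch

rank, input_dim, output_dim = 8, 16, 32

# Row-major layout as stored in the checkpoint: no transpose on load.
lora_a = torch.rand(rank, input_dim)   # down-projection A
lora_b = torch.rand(output_dim, rank)  # up-projection B


def apply_lora(x: torch.Tensor, lora_a: torch.Tensor,
               lora_b: torch.Tensor, scaling: float) -> torch.Tensor:
    """Reference math used by the updated tests: x @ A.T @ B.T * scaling."""
    return x @ lora_a.T @ lora_b.T * scaling


x = torch.rand(4, input_dim)
out = apply_lora(x, lora_a, lora_b, scaling=1.0)
assert out.shape == (4, output_dim)

# Tensor-parallel sharding indexes the new layout directly, matching the
# slicing changes in the layer diffs above:
# - column-parallel layers shard lora_b along dim 0 (the output dim),
#   e.g. lora_b[start_idx:end_idx, :]
# - row-parallel layers shard lora_a along dim 1 (the input dim),
#   e.g. lora_a[:, start_idx:end_idx]
```

This is consistent with the assertions in tests/lora/test_lora_manager.py
(`lora_a.shape[0] == rank == lora_b.shape[1]`) and removes a pair of transposes from
every weight load.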